/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Shared glue code for 128bit block ciphers, AVX2 assembler macros
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu (0*32)(src), x0; \
	vmovdqu (1*32)(src), x1; \
	vmovdqu (2*32)(src), x2; \
	vmovdqu (3*32)(src), x3; \
	vmovdqu (4*32)(src), x4; \
	vmovdqu (5*32)(src), x5; \
	vmovdqu (6*32)(src), x6; \
	vmovdqu (7*32)(src), x7;

#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu x0, (0*32)(dst); \
	vmovdqu x1, (1*32)(dst); \
	vmovdqu x2, (2*32)(dst); \
	vmovdqu x3, (3*32)(dst); \
	vmovdqu x4, (4*32)(dst); \
	vmovdqu x5, (5*32)(dst); \
	vmovdqu x6, (6*32)(dst); \
	vmovdqu x7, (7*32)(dst);

#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
	vpxor t0, t0, t0; \
	vinserti128 $1, (src), t0, t0; \
	vpxor t0, x0, x0; \
	vpxor (0*32+16)(src), x1, x1; \
	vpxor (1*32+16)(src), x2, x2; \
	vpxor (2*32+16)(src), x3, x3; \
	vpxor (3*32+16)(src), x4, x4; \
	vpxor (4*32+16)(src), x5, x5; \
	vpxor (5*32+16)(src), x6, x6; \
	vpxor (6*32+16)(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
	vpcmpeqq minus_one, x, tmp1; \
	vpcmpeqq minus_two, x, tmp2; \
	vpsubq minus_two, x, x; \
	vpor tmp2, tmp1, tmp1; \
	vpslldq $8, tmp1, tmp1; \
	vpsubq tmp1, x, x;

#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
		       t1x, t2, t2x, t3, t3x, t4, t5) \
	vpcmpeqd t0, t0, t0; \
	vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \
	vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */ \
	\
	/* load IV and byteswap */ \
	vmovdqu (iv), t2x; \
	vmovdqa t2x, t3x; \
	inc_le128(t2x, t0x, t1x); \
	vbroadcasti128 bswap, t1; \
	vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \
	vpshufb t1, t2, x0; \
	\
	/* construct IVs */ \
	add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \
	vpshufb t1, t2, x1; \
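	/* le4 .. le15: repeat add2/byteswap for the remaining counter pairs */ \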
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x2; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x3; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x4; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x5; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x6; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x7; \
	vextracti128 $1, t2, t2x; \
	inc_le128(t2x, t0x, t3x); \
	vmovdqu t2x, (iv);

#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*32)(src), x0, x0; \
	vpxor (1*32)(src), x1, x1; \
	vpxor (2*32)(src), x2, x2; \
	vpxor (3*32)(src), x3, x3; \
	vpxor (4*32)(src), x4, x4; \
	vpxor (5*32)(src), x5, x5; \
	vpxor (6*32)(src), x6, x6; \
	vpxor (7*32)(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
	vpsrad $31, iv, tmp0; \
	vpaddq iv, iv, tmp1; \
	vpsllq $2, iv, iv; \
	vpshufd $0x13, tmp0, tmp0; \
	vpsrad $31, tmp1, tmp1; \
	vpand mask2, tmp0, tmp0; \
	vpshufd $0x13, tmp1, tmp1; \
	vpxor tmp0, iv, iv; \
	vpand mask1, tmp1, tmp1; \
	vpxor tmp1, iv, iv;

#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
		       tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
		       xts_gf128mul_and_shl1_mask_0, \
		       xts_gf128mul_and_shl1_mask_1) \
	vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
	\
	/* load IV and construct second IV */ \
	vmovdqu (iv), tivx; \
	vmovdqa tivx, t0x; \
	gf128mul_x_ble(tivx, t1x, t2x); \
	vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
	vinserti128 $1, tivx, t0, tiv; \
	vpxor (0*32)(src), tiv, x0; \
	vmovdqu tiv, (0*32)(dst); \
	\
	/* construct and store IVs, also xor with source */ \
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (1*32)(src), tiv, x1; \
	vmovdqu tiv, (1*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (2*32)(src), tiv, x2; \
	vmovdqu tiv, (2*32)(dst); \
	\
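	/* remaining block pairs: same tweak-advance, xor and store pattern */ \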
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (3*32)(src), tiv, x3; \
	vmovdqu tiv, (3*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (4*32)(src), tiv, x4; \
	vmovdqu tiv, (4*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (5*32)(src), tiv, x5; \
	vmovdqu tiv, (5*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (6*32)(src), tiv, x6; \
	vmovdqu tiv, (6*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (7*32)(src), tiv, x7; \
	vmovdqu tiv, (7*32)(dst); \
	\
	vextracti128 $1, tiv, tivx; \
	gf128mul_x_ble(tivx, t1x, t2x); \
	vmovdqu tivx, (iv);

#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*32)(dst), x0, x0; \
	vpxor (1*32)(dst), x1, x1; \
	vpxor (2*32)(dst), x2, x2; \
	vpxor (3*32)(dst), x3, x3; \
	vpxor (4*32)(dst), x4, x4; \
	vpxor (5*32)(dst), x5, x5; \
	vpxor (6*32)(dst), x6, x6; \
	vpxor (7*32)(dst), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
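
/*
 * Illustrative usage sketch (hypothetical, not part of this header): a cipher
 * implementation that includes this file might wire up a 16-block CTR path
 * roughly as below.  The register names (RA1..RD2, RK0..RK3, RNOT, RTMP and
 * the xmm halves RK0x..RK3x), the byteswap constant (.Lbswap128_mask) and the
 * 16-block transform (__mycipher_enc_blk16) are placeholders that the cipher
 * would have to define itself; %rdi/%rsi/%rdx/%rcx carry ctx/dst/src/iv per
 * the x86-64 calling convention.
 *
 *	SYM_FUNC_START(mycipher_ctr_16way)
 *		vzeroupper;
 *		load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1,
 *			       RA2, RB2, RC2, RD2, RK0, RK0x, RK1, RK1x,
 *			       RK2, RK2x, RK3, RK3x, RNOT, RTMP);
 *		call __mycipher_enc_blk16;
 *		store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1,
 *				RA2, RB2, RC2, RD2);
 *		vzeroupper;
 *		ret;
 *	SYM_FUNC_END(mycipher_ctr_16way)
 */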