/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Shared glue code for 128-bit block ciphers, AVX assembler macros
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu (0*16)(src), x0; \
	vmovdqu (1*16)(src), x1; \
	vmovdqu (2*16)(src), x2; \
	vmovdqu (3*16)(src), x3; \
	vmovdqu (4*16)(src), x4; \
	vmovdqu (5*16)(src), x5; \
	vmovdqu (6*16)(src), x6; \
	vmovdqu (7*16)(src), x7;

#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu x0, (0*16)(dst); \
	vmovdqu x1, (1*16)(dst); \
	vmovdqu x2, (2*16)(dst); \
	vmovdqu x3, (3*16)(dst); \
	vmovdqu x4, (4*16)(dst); \
	vmovdqu x5, (5*16)(dst); \
	vmovdqu x6, (6*16)(dst); \
	vmovdqu x7, (7*16)(dst);

/*
 * Store 8 blocks for CBC decryption: xor decrypted blocks 1..7 with the
 * preceding ciphertext blocks from src; the caller xors block 0 with the IV.
 */
#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(src), x1, x1; \
	vpxor (1*16)(src), x2, x2; \
	vpxor (2*16)(src), x3, x3; \
	vpxor (3*16)(src), x4, x4; \
	vpxor (4*16)(src), x5, x5; \
	vpxor (5*16)(src), x6, x6; \
	vpxor (6*16)(src), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

/*
 * Increment the 128-bit little-endian value in x by one. minus_one must
 * contain -1 in the low qword and 0 in the high qword.
 */
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
	vpcmpeqd t0, t0, t0; \
	vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
	vmovdqa bswap, t1; \
	\
	/* load IV and byteswap */ \
	vmovdqu (iv), x7; \
	vpshufb t1, x7, x0; \
	\
	/* construct IVs */ \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x1; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x2; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x3; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x4; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x5; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x6; \
	inc_le128(x7, t0, t2); \
	vmovdqa x7, t2; \
	vpshufb t1, x7, x7; \
	inc_le128(t2, t0, t1); \
	vmovdqu t2, (iv);

#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(src), x0, x0; \
	vpxor (1*16)(src), x1, x1; \
	vpxor (2*16)(src), x2, x2; \
	vpxor (3*16)(src), x3, x3; \
	vpxor (4*16)(src), x4, x4; \
	vpxor (5*16)(src), x5, x5; \
	vpxor (6*16)(src), x6, x6; \
	vpxor (7*16)(src), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

/*
 * Multiply the 128-bit value in iv by x in GF(2^128), little-endian block
 * convention (advances the XTS tweak). mask must contain 0x87 in the low
 * qword and 1 in the high qword (the xts_gf128mul_and_shl1_mask constant).
 */
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
		      t1, xts_gf128mul_and_shl1_mask) \
	vmovdqa xts_gf128mul_and_shl1_mask, t0; \
	\
	/* load IV */ \
	vmovdqu (iv), tiv; \
	vpxor (0*16)(src), tiv, x0; \
	vmovdqu tiv, (0*16)(dst); \
	\
	/* construct and store IVs, also xor with source */ \
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (1*16)(src), tiv, x1; \
	vmovdqu tiv, (1*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (2*16)(src), tiv, x2; \
	vmovdqu tiv, (2*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (3*16)(src), tiv, x3; \
	vmovdqu tiv, (3*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (4*16)(src), tiv, x4; \
	vmovdqu tiv, (4*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (5*16)(src), tiv, x5; \
	vmovdqu tiv, (5*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (6*16)(src), tiv, x6; \
	vmovdqu tiv, (6*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (7*16)(src), tiv, x7; \
	vmovdqu tiv, (7*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vmovdqu tiv, (iv);

#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(dst), x0, x0; \
	vpxor (1*16)(dst), x1, x1; \
	vpxor (2*16)(dst), x2, x2; \
	vpxor (3*16)(dst), x3, x3; \
	vpxor (4*16)(dst), x4, x4; \
	vpxor (5*16)(dst), x5, x5; \
	vpxor (6*16)(dst), x6, x6; \
	vpxor (7*16)(dst), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
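
/*
 * Usage sketch (illustrative only, not part of this header): a cipher
 * implementation that provides an 8-way parallel block routine can build
 * its ECB entry point from load_8way()/store_8way() as below.  The symbol
 * names (some_cipher_ecb_enc_8way, __some_cipher_enc_blk8) are hypothetical;
 * real users such as the Serpent and Twofish AVX implementations follow the
 * same pattern with their own names and register aliases.
 */
#if 0
SYM_FUNC_START(some_cipher_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx, round keys
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 */
	FRAME_BEGIN

	load_8way(%rdx, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7);

	/* hypothetical 8-way cipher core; reads and writes the same xmm registers */
	call __some_cipher_enc_blk8;

	store_8way(%rsi, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7);

	FRAME_END
	RET;
SYM_FUNC_END(some_cipher_ecb_enc_8way)
#endif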