/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Shared glue code for 128bit block ciphers, AVX assembler macros
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

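/* Load eight consecutive 16-byte blocks from src into registers x0..x7. */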
#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu (0*16)(src), x0; \
	vmovdqu (1*16)(src), x1; \
	vmovdqu (2*16)(src), x2; \
	vmovdqu (3*16)(src), x3; \
	vmovdqu (4*16)(src), x4; \
	vmovdqu (5*16)(src), x5; \
	vmovdqu (6*16)(src), x6; \
	vmovdqu (7*16)(src), x7;

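/* Store the eight 16-byte blocks in x0..x7 to consecutive offsets at dst. */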
#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu x0, (0*16)(dst); \
	vmovdqu x1, (1*16)(dst); \
	vmovdqu x2, (2*16)(dst); \
	vmovdqu x3, (3*16)(dst); \
	vmovdqu x4, (4*16)(dst); \
	vmovdqu x5, (5*16)(dst); \
	vmovdqu x6, (6*16)(dst); \
	vmovdqu x7, (7*16)(dst);

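/*
 * CBC decryption output for eight blocks: XOR each decrypted block x1..x7
 * with the preceding ciphertext block from src, then store all eight blocks
 * to dst. x0 is stored as-is; XORing the first block with the IV is left to
 * the caller.
 */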
#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(src), x1, x1; \
	vpxor (1*16)(src), x2, x2; \
	vpxor (2*16)(src), x3, x3; \
	vpxor (3*16)(src), x4, x4; \
	vpxor (4*16)(src), x5, x5; \
	vpxor (5*16)(src), x6, x6; \
	vpxor (6*16)(src), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

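/*
 * Increment the 128-bit little-endian value in x by one. minus_one must hold
 * -1 in the low qword and 0 in the high qword; the compare/shift/subtract
 * sequence propagates the carry into the high qword when the low qword wraps.
 */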
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

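/*
 * Generate eight CTR blocks: load the 128-bit counter from (iv), produce the
 * byte-swapped counter values in x0..x7 via the bswap shuffle mask while
 * incrementing the counter as a little-endian integer between blocks, and
 * write the counter advanced by eight back to (iv). t0, t1 and t2 are
 * clobbered as temporaries.
 */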
#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
	vpcmpeqd t0, t0, t0; \
	vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
	vmovdqa bswap, t1; \
	\
	/* load IV and byteswap */ \
	vmovdqu (iv), x7; \
	vpshufb t1, x7, x0; \
	\
	/* construct IVs */ \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x1; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x2; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x3; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x4; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x5; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x6; \
	inc_le128(x7, t0, t2); \
	vmovdqa x7, t2; \
	vpshufb t1, x7, x7; \
	inc_le128(t2, t0, t1); \
	vmovdqu t2, (iv);

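/*
 * XOR the eight keystream blocks x0..x7 with the corresponding source blocks
 * and store the result to dst.
 */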
#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(src), x0, x0; \
	vpxor (1*16)(src), x1, x1; \
	vpxor (2*16)(src), x2, x2; \
	vpxor (3*16)(src), x3, x3; \
	vpxor (4*16)(src), x4, x4; \
	vpxor (5*16)(src), x5, x5; \
	vpxor (6*16)(src), x6, x6; \
	vpxor (7*16)(src), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

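/*
 * Multiply the 128-bit XTS tweak in iv by x in GF(2^128), little-endian block
 * convention: double both 64-bit halves, carry the low half into the high
 * half, and fold the 0x87 reduction constant into the low byte when the top
 * bit overflows. mask must hold the xts_gf128mul_and_shl1_mask constant
 * provided by the caller.
 */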
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

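/*
 * XTS input stage for eight blocks: XOR each source block with the running
 * tweak into x0..x7, stash each tweak at the matching offset in dst so that
 * store_xts_8way can reuse it, advance the tweak with gf128mul_x_ble between
 * blocks, and write the tweak for the next group back to (iv).
 */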
#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
		      t1, xts_gf128mul_and_shl1_mask) \
	vmovdqa xts_gf128mul_and_shl1_mask, t0; \
	\
	/* load IV */ \
	vmovdqu (iv), tiv; \
	vpxor (0*16)(src), tiv, x0; \
	vmovdqu tiv, (0*16)(dst); \
	\
	/* construct and store IVs, also xor with source */ \
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (1*16)(src), tiv, x1; \
	vmovdqu tiv, (1*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (2*16)(src), tiv, x2; \
	vmovdqu tiv, (2*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (3*16)(src), tiv, x3; \
	vmovdqu tiv, (3*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (4*16)(src), tiv, x4; \
	vmovdqu tiv, (4*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (5*16)(src), tiv, x5; \
	vmovdqu tiv, (5*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (6*16)(src), tiv, x6; \
	vmovdqu tiv, (6*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (7*16)(src), tiv, x7; \
	vmovdqu tiv, (7*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vmovdqu tiv, (iv);

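/*
 * XTS output stage: XOR each cipher output with the tweak previously stashed
 * at dst by load_xts_8way and store the final blocks over it.
 */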
#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(dst), x0, x0; \
	vpxor (1*16)(dst), x1, x1; \
	vpxor (2*16)(dst), x2, x2; \
	vpxor (3*16)(dst), x3, x3; \
	vpxor (4*16)(dst), x4, x4; \
	vpxor (5*16)(dst), x5, x5; \
	vpxor (6*16)(dst), x6, x6; \
	vpxor (7*16)(dst), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);