xref: /OK3568_Linux_fs/kernel/arch/x86/crypto/serpent-avx-x86_64-asm_64.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * Copyright (C) 2012 Johannes Goetzfried
6*4882a593Smuzhiyun *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
7*4882a593Smuzhiyun *
8*4882a593Smuzhiyun * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
9*4882a593Smuzhiyun */
10*4882a593Smuzhiyun
11*4882a593Smuzhiyun#include <linux/linkage.h>
12*4882a593Smuzhiyun#include <asm/frame.h>
13*4882a593Smuzhiyun#include "glue_helper-asm-avx.S"
14*4882a593Smuzhiyun
15*4882a593Smuzhiyun.file "serpent-avx-x86_64-asm_64.S"
16*4882a593Smuzhiyun
/*
 * 16-byte constant holding the byte indices 15, 14, ..., 0, placed in its
 * own mergeable 16-byte-entry read-only section and 16-byte aligned.
 * The name suggests a byte-shuffle control that reverses a 128-bit value;
 * it is not referenced in the part of the file visible here.
 * NOTE(review): presumably consumed by macros from glue_helper-asm-avx.S
 * used further down -- confirm.
 */
17*4882a593Smuzhiyun.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
18*4882a593Smuzhiyun.align 16
19*4882a593Smuzhiyun.Lbswap128_mask:
20*4882a593Smuzhiyun	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/*
 * 16-byte constant: byte 0 is 0x87 and byte 8 is 1, all other bytes zero
 * (i.e. little-endian qwords 0x87 and 0x01). 16-byte aligned, in its own
 * mergeable read-only section.
 * The name suggests it is the mask for the XTS tweak update (GF(2^128)
 * multiply by x); it is not referenced in the part of the file visible
 * here -- NOTE(review): confirm against the XTS entry points below.
 */
21*4882a593Smuzhiyun.section	.rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
22*4882a593Smuzhiyun.align 16
23*4882a593Smuzhiyun.Lxts_gf128mul_and_shl1_mask:
24*4882a593Smuzhiyun	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
25*4882a593Smuzhiyun
26*4882a593Smuzhiyun.text
27*4882a593Smuzhiyun
/* Base pointer of the key context; get_key() reads 32-bit words from it. */
28*4882a593Smuzhiyun#define CTX %rdi
29*4882a593Smuzhiyun
30*4882a593Smuzhiyun/**********************************************************************
31*4882a593Smuzhiyun  8-way AVX serpent
32*4882a593Smuzhiyun **********************************************************************/
/*
 * Register aliases. The two-level macros below build register names by
 * token-pasting a group suffix ("RA" ## 1 -> RA1, "RA" ## 2 -> RA2), so
 * every state register exists once per group of four blocks.
 */
/* Group 1: four state words plus a fifth register used as a temporary. */
33*4882a593Smuzhiyun#define RA1 %xmm0
34*4882a593Smuzhiyun#define RB1 %xmm1
35*4882a593Smuzhiyun#define RC1 %xmm2
36*4882a593Smuzhiyun#define RD1 %xmm3
37*4882a593Smuzhiyun#define RE1 %xmm4
38*4882a593Smuzhiyun
/* Scratch register shared by both groups inside the S-box macros. */
39*4882a593Smuzhiyun#define tp  %xmm5
40*4882a593Smuzhiyun
/* Group 2: same layout as group 1. */
41*4882a593Smuzhiyun#define RA2 %xmm6
42*4882a593Smuzhiyun#define RB2 %xmm7
43*4882a593Smuzhiyun#define RC2 %xmm8
44*4882a593Smuzhiyun#define RD2 %xmm9
45*4882a593Smuzhiyun#define RE2 %xmm10
46*4882a593Smuzhiyun
/* All-ones constant (set with vpcmpeqd); XOR with it is a bitwise NOT. */
47*4882a593Smuzhiyun#define RNOT %xmm11
48*4882a593Smuzhiyun
/* The four 32-bit key words of one round, each broadcast to all lanes. */
49*4882a593Smuzhiyun#define RK0 %xmm12
50*4882a593Smuzhiyun#define RK1 %xmm13
51*4882a593Smuzhiyun#define RK2 %xmm14
52*4882a593Smuzhiyun#define RK3 %xmm15
53*4882a593Smuzhiyun
54*4882a593Smuzhiyun
/*
 * S0_1 / S0_2: the two halves of the S0 substitution, written as pure
 * boolean logic on whole registers (judging by the macro names and file
 * header this is Serpent S-box 0 -- confirm against the specification).
 * Expand S0_1 first, then S0_2, on the same five registers.
 *
 * Operand order is AT&T: "op src2, src1, dst". RNOT is all-ones, so
 * "vpxor RNOT, x, x" inverts x.
 * tp is written by S0_1 and not needed by S0_2. x4 is written before it
 * is read, so its incoming value is ignored.
 * The callers in this file continue with the registers in the order
 * (x2, x1, x3, x0), with x4 as the next temporary.
 */
55*4882a593Smuzhiyun#define S0_1(x0, x1, x2, x3, x4)      \
56*4882a593Smuzhiyun	vpor		x0,   x3, tp; \
57*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
58*4882a593Smuzhiyun	vpxor		x2,   x3, x4; \
59*4882a593Smuzhiyun	vpxor		RNOT, x4, x4; \
60*4882a593Smuzhiyun	vpxor		x1,   tp, x3; \
61*4882a593Smuzhiyun	vpand		x0,   x1, x1; \
62*4882a593Smuzhiyun	vpxor		x4,   x1, x1; \
63*4882a593Smuzhiyun	vpxor		x0,   x2, x2;
64*4882a593Smuzhiyun#define S0_2(x0, x1, x2, x3, x4)      \
65*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
66*4882a593Smuzhiyun	vpor		x0,   x4, x4; \
67*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
68*4882a593Smuzhiyun	vpand		x1,   x2, x2; \
69*4882a593Smuzhiyun	vpxor		x2,   x3, x3; \
70*4882a593Smuzhiyun	vpxor		RNOT, x1, x1; \
71*4882a593Smuzhiyun	vpxor		x4,   x2, x2; \
72*4882a593Smuzhiyun	vpxor		x2,   x1, x1;
73*4882a593Smuzhiyun
/*
 * S1_1 / S1_2: the two halves of the S1 substitution as boolean logic on
 * whole registers (AT&T operand order; "vpxor RNOT, x, x" is a NOT).
 * Expand S1_1 first, then S1_2, on the same five registers.
 * tp is written and consumed inside S1_1 only. x4 is written before it
 * is read, so its incoming value is ignored.
 * The callers in this file continue with the registers in the order
 * (x4, x2, x3, x0), with x1 as the next temporary.
 */
74*4882a593Smuzhiyun#define S1_1(x0, x1, x2, x3, x4)      \
75*4882a593Smuzhiyun	vpxor		x0,   x1, tp; \
76*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
77*4882a593Smuzhiyun	vpxor		RNOT, x3, x3; \
78*4882a593Smuzhiyun	vpand		tp,   x1, x4; \
79*4882a593Smuzhiyun	vpor		tp,   x0, x0; \
80*4882a593Smuzhiyun	vpxor		x2,   x3, x3; \
81*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
82*4882a593Smuzhiyun	vpxor		x3,   tp, x1;
83*4882a593Smuzhiyun#define S1_2(x0, x1, x2, x3, x4)      \
84*4882a593Smuzhiyun	vpxor		x4,   x3, x3; \
85*4882a593Smuzhiyun	vpor		x4,   x1, x1; \
86*4882a593Smuzhiyun	vpxor		x2,   x4, x4; \
87*4882a593Smuzhiyun	vpand		x0,   x2, x2; \
88*4882a593Smuzhiyun	vpxor		x1,   x2, x2; \
89*4882a593Smuzhiyun	vpor		x0,   x1, x1; \
90*4882a593Smuzhiyun	vpxor		RNOT, x0, x0; \
91*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
92*4882a593Smuzhiyun	vpxor		x1,   x4, x4;
93*4882a593Smuzhiyun
/*
 * S2_1 / S2_2: the two halves of the S2 substitution as boolean logic on
 * whole registers (AT&T operand order; "vpxor RNOT, x, x" is a NOT).
 * Expand S2_1 first, then S2_2, on the same five registers.
 * S2_2 reads the value S2_1 left in tp, so tp must not be modified
 * between the two halves. x4 is only written (in S2_2); its incoming
 * value is ignored.
 * The callers in this file continue with the registers in the order
 * (x4, x1, x0, x3), with x2 as the next temporary.
 */
94*4882a593Smuzhiyun#define S2_1(x0, x1, x2, x3, x4)      \
95*4882a593Smuzhiyun	vpxor		RNOT, x3, x3; \
96*4882a593Smuzhiyun	vpxor		x0,   x1, x1; \
97*4882a593Smuzhiyun	vpand		x2,   x0, tp; \
98*4882a593Smuzhiyun	vpxor		x3,   tp, tp; \
99*4882a593Smuzhiyun	vpor		x0,   x3, x3; \
100*4882a593Smuzhiyun	vpxor		x1,   x2, x2; \
101*4882a593Smuzhiyun	vpxor		x1,   x3, x3; \
102*4882a593Smuzhiyun	vpand		tp,   x1, x1;
103*4882a593Smuzhiyun#define S2_2(x0, x1, x2, x3, x4)      \
104*4882a593Smuzhiyun	vpxor		x2,   tp, tp; \
105*4882a593Smuzhiyun	vpand		x3,   x2, x2; \
106*4882a593Smuzhiyun	vpor		x1,   x3, x3; \
107*4882a593Smuzhiyun	vpxor		RNOT, tp, tp; \
108*4882a593Smuzhiyun	vpxor		tp,   x3, x3; \
109*4882a593Smuzhiyun	vpxor		tp,   x0, x4; \
110*4882a593Smuzhiyun	vpxor		x2,   tp, x0; \
111*4882a593Smuzhiyun	vpor		x2,   x1, x1;
112*4882a593Smuzhiyun
/*
 * S3_1 / S3_2: the two halves of the S3 substitution as boolean logic on
 * whole registers (AT&T operand order: "op src2, src1, dst").
 * Expand S3_1 first, then S3_2, on the same five registers.
 * tp is written and consumed inside S3_1 only. x4 is written before it
 * is read, so its incoming value is ignored.
 * The callers in this file continue with the registers in the order
 * (x3, x4, x1, x0), with x2 as the next temporary.
 */
113*4882a593Smuzhiyun#define S3_1(x0, x1, x2, x3, x4)      \
114*4882a593Smuzhiyun	vpxor		x3,   x1, tp; \
115*4882a593Smuzhiyun	vpor		x0,   x3, x3; \
116*4882a593Smuzhiyun	vpand		x0,   x1, x4; \
117*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
118*4882a593Smuzhiyun	vpxor		tp,   x2, x2; \
119*4882a593Smuzhiyun	vpand		x3,   tp, x1; \
120*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
121*4882a593Smuzhiyun	vpor		x4,   x0, x0; \
122*4882a593Smuzhiyun	vpxor		x3,   x4, x4;
123*4882a593Smuzhiyun#define S3_2(x0, x1, x2, x3, x4)      \
124*4882a593Smuzhiyun	vpxor		x0,   x1, x1; \
125*4882a593Smuzhiyun	vpand		x3,   x0, x0; \
126*4882a593Smuzhiyun	vpand		x4,   x3, x3; \
127*4882a593Smuzhiyun	vpxor		x2,   x3, x3; \
128*4882a593Smuzhiyun	vpor		x1,   x4, x4; \
129*4882a593Smuzhiyun	vpand		x1,   x2, x2; \
130*4882a593Smuzhiyun	vpxor		x3,   x4, x4; \
131*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
132*4882a593Smuzhiyun	vpxor		x2,   x3, x3;
133*4882a593Smuzhiyun
/*
 * S4_1 / S4_2: the two halves of the S4 substitution as boolean logic on
 * whole registers (AT&T operand order; "vpxor RNOT, x, x" is a NOT).
 * Expand S4_1 first, then S4_2, on the same five registers.
 * S4_2 reads the value S4_1 left in tp, so tp must not be modified
 * between the two halves. x4 is written before it is read, so its
 * incoming value is ignored.
 * The callers in this file continue with the registers in the order
 * (x1, x2, x3, x4), with x0 as the next temporary.
 */
134*4882a593Smuzhiyun#define S4_1(x0, x1, x2, x3, x4)      \
135*4882a593Smuzhiyun	vpand		x0,   x3, tp; \
136*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
137*4882a593Smuzhiyun	vpxor		x2,   tp, tp; \
138*4882a593Smuzhiyun	vpor		x3,   x2, x2; \
139*4882a593Smuzhiyun	vpxor		x1,   x0, x0; \
140*4882a593Smuzhiyun	vpxor		tp,   x3, x4; \
141*4882a593Smuzhiyun	vpor		x0,   x2, x2; \
142*4882a593Smuzhiyun	vpxor		x1,   x2, x2;
143*4882a593Smuzhiyun#define S4_2(x0, x1, x2, x3, x4)      \
144*4882a593Smuzhiyun	vpand		x0,   x1, x1; \
145*4882a593Smuzhiyun	vpxor		x4,   x1, x1; \
146*4882a593Smuzhiyun	vpand		x2,   x4, x4; \
147*4882a593Smuzhiyun	vpxor		tp,   x2, x2; \
148*4882a593Smuzhiyun	vpxor		x0,   x4, x4; \
149*4882a593Smuzhiyun	vpor		x1,   tp, x3; \
150*4882a593Smuzhiyun	vpxor		RNOT, x1, x1; \
151*4882a593Smuzhiyun	vpxor		x0,   x3, x3;
152*4882a593Smuzhiyun
/*
 * S5_1 / S5_2: the two halves of the S5 substitution as boolean logic on
 * whole registers (AT&T operand order; "vpxor RNOT, x, x" is a NOT).
 * Expand S5_1 first, then S5_2, on the same five registers.
 * tp is written and consumed inside S5_1 only. x4 is written before it
 * is read, so its incoming value is ignored.
 * The callers in this file continue with the registers in the order
 * (x4, x0, x1, x3), with x2 as the next temporary.
 */
153*4882a593Smuzhiyun#define S5_1(x0, x1, x2, x3, x4)      \
154*4882a593Smuzhiyun	vpor		x0,   x1, tp; \
155*4882a593Smuzhiyun	vpxor		tp,   x2, x2; \
156*4882a593Smuzhiyun	vpxor		RNOT, x3, x3; \
157*4882a593Smuzhiyun	vpxor		x0,   x1, x4; \
158*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
159*4882a593Smuzhiyun	vpand		x4,   tp, x1; \
160*4882a593Smuzhiyun	vpor		x3,   x4, x4; \
161*4882a593Smuzhiyun	vpxor		x0,   x4, x4;
162*4882a593Smuzhiyun#define S5_2(x0, x1, x2, x3, x4)      \
163*4882a593Smuzhiyun	vpand		x3,   x0, x0; \
164*4882a593Smuzhiyun	vpxor		x3,   x1, x1; \
165*4882a593Smuzhiyun	vpxor		x2,   x3, x3; \
166*4882a593Smuzhiyun	vpxor		x1,   x0, x0; \
167*4882a593Smuzhiyun	vpand		x4,   x2, x2; \
168*4882a593Smuzhiyun	vpxor		x2,   x1, x1; \
169*4882a593Smuzhiyun	vpand		x0,   x2, x2; \
170*4882a593Smuzhiyun	vpxor		x2,   x3, x3;
171*4882a593Smuzhiyun
/*
 * S6_1 / S6_2: the two halves of the S6 substitution as boolean logic on
 * whole registers (AT&T operand order; "vpxor RNOT, x, x4" stores NOT x).
 * Expand S6_1 first, then S6_2, on the same five registers.
 * tp is written and consumed inside S6_1 only. x4 is written before it
 * is read, so its incoming value is ignored.
 * The callers in this file continue with the registers in the order
 * (x2, x4, x1, x3), with x0 as the next temporary.
 */
172*4882a593Smuzhiyun#define S6_1(x0, x1, x2, x3, x4)      \
173*4882a593Smuzhiyun	vpxor		x0,   x3, x3; \
174*4882a593Smuzhiyun	vpxor		x2,   x1, tp; \
175*4882a593Smuzhiyun	vpxor		x0,   x2, x2; \
176*4882a593Smuzhiyun	vpand		x3,   x0, x0; \
177*4882a593Smuzhiyun	vpor		x3,   tp, tp; \
178*4882a593Smuzhiyun	vpxor		RNOT, x1, x4; \
179*4882a593Smuzhiyun	vpxor		tp,   x0, x0; \
180*4882a593Smuzhiyun	vpxor		x2,   tp, x1;
181*4882a593Smuzhiyun#define S6_2(x0, x1, x2, x3, x4)      \
182*4882a593Smuzhiyun	vpxor		x4,   x3, x3; \
183*4882a593Smuzhiyun	vpxor		x0,   x4, x4; \
184*4882a593Smuzhiyun	vpand		x0,   x2, x2; \
185*4882a593Smuzhiyun	vpxor		x1,   x4, x4; \
186*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
187*4882a593Smuzhiyun	vpand		x1,   x3, x3; \
188*4882a593Smuzhiyun	vpxor		x0,   x3, x3; \
189*4882a593Smuzhiyun	vpxor		x2,   x1, x1;
190*4882a593Smuzhiyun
/*
 * S7_1 / S7_2: the two halves of the S7 substitution as boolean logic on
 * whole registers (AT&T operand order; "vpxor RNOT, x, y" stores NOT x).
 * Expand S7_1 first, then S7_2, on the same five registers.
 * tp is written and consumed inside S7_1 only. x4 is written before it
 * is read, so its incoming value is ignored.
 * The callers in this file continue with the registers in the order
 * (x4, x2, x3, x0), with x1 as the next temporary.
 */
191*4882a593Smuzhiyun#define S7_1(x0, x1, x2, x3, x4)      \
192*4882a593Smuzhiyun	vpxor		RNOT, x1, tp; \
193*4882a593Smuzhiyun	vpxor		RNOT, x0, x0; \
194*4882a593Smuzhiyun	vpand		x2,   tp, x1; \
195*4882a593Smuzhiyun	vpxor		x3,   x1, x1; \
196*4882a593Smuzhiyun	vpor		tp,   x3, x3; \
197*4882a593Smuzhiyun	vpxor		x2,   tp, x4; \
198*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
199*4882a593Smuzhiyun	vpxor		x0,   x3, x3; \
200*4882a593Smuzhiyun	vpor		x1,   x0, x0;
201*4882a593Smuzhiyun#define S7_2(x0, x1, x2, x3, x4)      \
202*4882a593Smuzhiyun	vpand		x0,   x2, x2; \
203*4882a593Smuzhiyun	vpxor		x4,   x0, x0; \
204*4882a593Smuzhiyun	vpxor		x3,   x4, x4; \
205*4882a593Smuzhiyun	vpand		x0,   x3, x3; \
206*4882a593Smuzhiyun	vpxor		x1,   x4, x4; \
207*4882a593Smuzhiyun	vpxor		x4,   x2, x2; \
208*4882a593Smuzhiyun	vpxor		x1,   x3, x3; \
209*4882a593Smuzhiyun	vpor		x0,   x4, x4; \
210*4882a593Smuzhiyun	vpxor		x1,   x4, x4;
211*4882a593Smuzhiyun
/*
 * SI0_1 / SI0_2: the two halves of the SI0 substitution used on the
 * decryption path (by its name the inverse of S0 -- confirm against the
 * Serpent specification), as boolean logic on whole registers.
 * AT&T operand order; "vpxor RNOT, x, x" is a NOT.
 * Expand SI0_1 first, then SI0_2, on the same five registers.
 * tp is written and consumed inside SI0_1 only. x4 is written before it
 * is read, so its incoming value is ignored.
 * The callers in this file continue with the registers in the order
 * (x2, x4, x1, x0), with x3 as the next temporary.
 */
212*4882a593Smuzhiyun#define SI0_1(x0, x1, x2, x3, x4)     \
213*4882a593Smuzhiyun	vpxor		x0,   x1, x1; \
214*4882a593Smuzhiyun	vpor		x1,   x3, tp; \
215*4882a593Smuzhiyun	vpxor		x1,   x3, x4; \
216*4882a593Smuzhiyun	vpxor		RNOT, x0, x0; \
217*4882a593Smuzhiyun	vpxor		tp,   x2, x2; \
218*4882a593Smuzhiyun	vpxor		x0,   tp, x3; \
219*4882a593Smuzhiyun	vpand		x1,   x0, x0; \
220*4882a593Smuzhiyun	vpxor		x2,   x0, x0;
221*4882a593Smuzhiyun#define SI0_2(x0, x1, x2, x3, x4)     \
222*4882a593Smuzhiyun	vpand		x3,   x2, x2; \
223*4882a593Smuzhiyun	vpxor		x4,   x3, x3; \
224*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
225*4882a593Smuzhiyun	vpxor		x3,   x1, x1; \
226*4882a593Smuzhiyun	vpand		x0,   x3, x3; \
227*4882a593Smuzhiyun	vpxor		x0,   x1, x1; \
228*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
229*4882a593Smuzhiyun	vpxor		x3,   x4, x4;
230*4882a593Smuzhiyun
/*
 * SI1_1 / SI1_2: the two halves of the SI1 substitution used on the
 * decryption path, as boolean logic on whole registers (AT&T operand
 * order; "vpxor RNOT, x, x" is a NOT).
 * Expand SI1_1 first, then SI1_2, on the same five registers.
 * SI1_2 reads the value SI1_1 left in tp, so tp must not be modified
 * between the two halves. x4 is written before it is read, so its
 * incoming value is ignored.
 * The callers in this file continue with the registers in the order
 * (x4, x1, x2, x3), with x0 as the next temporary.
 */
231*4882a593Smuzhiyun#define SI1_1(x0, x1, x2, x3, x4)     \
232*4882a593Smuzhiyun	vpxor		x3,   x1, x1; \
233*4882a593Smuzhiyun	vpxor		x2,   x0, tp; \
234*4882a593Smuzhiyun	vpxor		RNOT, x2, x2; \
235*4882a593Smuzhiyun	vpor		x1,   x0, x4; \
236*4882a593Smuzhiyun	vpxor		x3,   x4, x4; \
237*4882a593Smuzhiyun	vpand		x1,   x3, x3; \
238*4882a593Smuzhiyun	vpxor		x2,   x1, x1; \
239*4882a593Smuzhiyun	vpand		x4,   x2, x2;
240*4882a593Smuzhiyun#define SI1_2(x0, x1, x2, x3, x4)     \
241*4882a593Smuzhiyun	vpxor		x1,   x4, x4; \
242*4882a593Smuzhiyun	vpor		x3,   x1, x1; \
243*4882a593Smuzhiyun	vpxor		tp,   x3, x3; \
244*4882a593Smuzhiyun	vpxor		tp,   x2, x2; \
245*4882a593Smuzhiyun	vpor		x4,   tp, x0; \
246*4882a593Smuzhiyun	vpxor		x4,   x2, x2; \
247*4882a593Smuzhiyun	vpxor		x0,   x1, x1; \
248*4882a593Smuzhiyun	vpxor		x1,   x4, x4;
249*4882a593Smuzhiyun
/*
 * SI2_1 / SI2_2: the two halves of the SI2 substitution used on the
 * decryption path, as boolean logic on whole registers (AT&T operand
 * order; "vpxor RNOT, x3, tp" stores NOT x3 in tp).
 * Expand SI2_1 first, then SI2_2, on the same five registers.
 * tp is written and consumed inside SI2_1 only. x4 is written before it
 * is read, so its incoming value is ignored.
 * The callers in this file continue with the registers in the order
 * (x1, x4, x3, x2), with x0 as the next temporary.
 */
250*4882a593Smuzhiyun#define SI2_1(x0, x1, x2, x3, x4)     \
251*4882a593Smuzhiyun	vpxor		x1,   x2, x2; \
252*4882a593Smuzhiyun	vpxor		RNOT, x3, tp; \
253*4882a593Smuzhiyun	vpor		x2,   tp, tp; \
254*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
255*4882a593Smuzhiyun	vpxor		x0,   x3, x4; \
256*4882a593Smuzhiyun	vpxor		x1,   tp, x3; \
257*4882a593Smuzhiyun	vpor		x2,   x1, x1; \
258*4882a593Smuzhiyun	vpxor		x0,   x2, x2;
259*4882a593Smuzhiyun#define SI2_2(x0, x1, x2, x3, x4)     \
260*4882a593Smuzhiyun	vpxor		x4,   x1, x1; \
261*4882a593Smuzhiyun	vpor		x3,   x4, x4; \
262*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
263*4882a593Smuzhiyun	vpxor		x2,   x4, x4; \
264*4882a593Smuzhiyun	vpand		x1,   x2, x2; \
265*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
266*4882a593Smuzhiyun	vpxor		x4,   x3, x3; \
267*4882a593Smuzhiyun	vpxor		x0,   x4, x4;
268*4882a593Smuzhiyun
/*
 * SI3_1 / SI3_2: the two halves of the SI3 substitution used on the
 * decryption path, as boolean logic on whole registers (AT&T operand
 * order: "op src2, src1, dst").
 * Expand SI3_1 first, then SI3_2, on the same five registers.
 * tp is written and consumed inside SI3_1 only. x4 is written before it
 * is read, so its incoming value is ignored.
 * The callers in this file continue with the registers in the order
 * (x2, x0, x4, x3), with x1 as the next temporary.
 */
269*4882a593Smuzhiyun#define SI3_1(x0, x1, x2, x3, x4)     \
270*4882a593Smuzhiyun	vpxor		x1,   x2, x2; \
271*4882a593Smuzhiyun	vpand		x2,   x1, tp; \
272*4882a593Smuzhiyun	vpxor		x0,   tp, tp; \
273*4882a593Smuzhiyun	vpor		x1,   x0, x0; \
274*4882a593Smuzhiyun	vpxor		x3,   x1, x4; \
275*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
276*4882a593Smuzhiyun	vpor		tp,   x3, x3; \
277*4882a593Smuzhiyun	vpxor		x2,   tp, x1;
278*4882a593Smuzhiyun#define SI3_2(x0, x1, x2, x3, x4)     \
279*4882a593Smuzhiyun	vpxor		x3,   x1, x1; \
280*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
281*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
282*4882a593Smuzhiyun	vpand		x1,   x3, x3; \
283*4882a593Smuzhiyun	vpxor		x0,   x1, x1; \
284*4882a593Smuzhiyun	vpand		x2,   x0, x0; \
285*4882a593Smuzhiyun	vpxor		x3,   x4, x4; \
286*4882a593Smuzhiyun	vpxor		x0,   x3, x3; \
287*4882a593Smuzhiyun	vpxor		x1,   x0, x0;
288*4882a593Smuzhiyun
/*
 * SI4_1 / SI4_2: the two halves of the SI4 substitution used on the
 * decryption path, as boolean logic on whole registers (AT&T operand
 * order; "vpxor RNOT, x0, x4" stores NOT x0 in x4).
 * Expand SI4_1 first, then SI4_2, on the same five registers.
 * tp is written and consumed inside SI4_1 only. x4 is written before it
 * is read, so its incoming value is ignored.
 * The callers in this file continue with the registers in the order
 * (x0, x2, x4, x3), with x1 as the next temporary.
 */
289*4882a593Smuzhiyun#define SI4_1(x0, x1, x2, x3, x4)     \
290*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
291*4882a593Smuzhiyun	vpand		x1,   x0, tp; \
292*4882a593Smuzhiyun	vpxor		x2,   tp, tp; \
293*4882a593Smuzhiyun	vpor		x3,   x2, x2; \
294*4882a593Smuzhiyun	vpxor		RNOT, x0, x4; \
295*4882a593Smuzhiyun	vpxor		tp,   x1, x1; \
296*4882a593Smuzhiyun	vpxor		x2,   tp, x0; \
297*4882a593Smuzhiyun	vpand		x4,   x2, x2;
298*4882a593Smuzhiyun#define SI4_2(x0, x1, x2, x3, x4)     \
299*4882a593Smuzhiyun	vpxor		x0,   x2, x2; \
300*4882a593Smuzhiyun	vpor		x4,   x0, x0; \
301*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
302*4882a593Smuzhiyun	vpand		x2,   x3, x3; \
303*4882a593Smuzhiyun	vpxor		x3,   x4, x4; \
304*4882a593Smuzhiyun	vpxor		x1,   x3, x3; \
305*4882a593Smuzhiyun	vpand		x0,   x1, x1; \
306*4882a593Smuzhiyun	vpxor		x1,   x4, x4; \
307*4882a593Smuzhiyun	vpxor		x3,   x0, x0;
308*4882a593Smuzhiyun
/*
 * SI5_1 / SI5_2: the two halves of the SI5 substitution used on the
 * decryption path, as boolean logic on whole registers (AT&T operand
 * order; "vpxor RNOT, x, x" is a NOT).
 * Expand SI5_1 first, then SI5_2, on the same five registers.
 * SI5_2 reads the value SI5_1 left in tp, so tp must not be modified
 * between the two halves. x4 is only written first in SI5_2; its
 * incoming value is ignored.
 * The callers in this file continue with the registers in the order
 * (x1, x4, x0, x2), with x3 as the next temporary.
 */
309*4882a593Smuzhiyun#define SI5_1(x0, x1, x2, x3, x4)     \
310*4882a593Smuzhiyun	vpor		x2,   x1, tp; \
311*4882a593Smuzhiyun	vpxor		x1,   x2, x2; \
312*4882a593Smuzhiyun	vpxor		x3,   tp, tp; \
313*4882a593Smuzhiyun	vpand		x1,   x3, x3; \
314*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
315*4882a593Smuzhiyun	vpor		x0,   x3, x3; \
316*4882a593Smuzhiyun	vpxor		RNOT, x0, x0; \
317*4882a593Smuzhiyun	vpxor		x2,   x3, x3; \
318*4882a593Smuzhiyun	vpor		x0,   x2, x2;
319*4882a593Smuzhiyun#define SI5_2(x0, x1, x2, x3, x4)     \
320*4882a593Smuzhiyun	vpxor		tp,   x1, x4; \
321*4882a593Smuzhiyun	vpxor		x4,   x2, x2; \
322*4882a593Smuzhiyun	vpand		x0,   x4, x4; \
323*4882a593Smuzhiyun	vpxor		tp,   x0, x0; \
324*4882a593Smuzhiyun	vpxor		x3,   tp, x1; \
325*4882a593Smuzhiyun	vpand		x2,   x0, x0; \
326*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
327*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
328*4882a593Smuzhiyun	vpxor		x4,   x2, x2; \
329*4882a593Smuzhiyun	vpxor		x3,   x4, x4;
330*4882a593Smuzhiyun
/*
 * SI6_1 / SI6_2: the two halves of the SI6 substitution used on the
 * decryption path, as boolean logic on whole registers (AT&T operand
 * order; "vpxor RNOT, tp, tp" inverts tp).
 * Expand SI6_1 first, then SI6_2, on the same five registers.
 * SI6_2 reads the value SI6_1 left in tp, so tp must not be modified
 * between the two halves. x4 is only written first in SI6_2; its
 * incoming value is ignored.
 * The callers in this file continue with the registers in the order
 * (x2, x4, x3, x0), with x1 as the next temporary.
 */
331*4882a593Smuzhiyun#define SI6_1(x0, x1, x2, x3, x4)     \
332*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
333*4882a593Smuzhiyun	vpand		x3,   x0, tp; \
334*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
335*4882a593Smuzhiyun	vpxor		x2,   tp, tp; \
336*4882a593Smuzhiyun	vpxor		x1,   x3, x3; \
337*4882a593Smuzhiyun	vpor		x0,   x2, x2; \
338*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
339*4882a593Smuzhiyun	vpand		tp,   x3, x3;
340*4882a593Smuzhiyun#define SI6_2(x0, x1, x2, x3, x4)     \
341*4882a593Smuzhiyun	vpxor		RNOT, tp, tp; \
342*4882a593Smuzhiyun	vpxor		x1,   x3, x3; \
343*4882a593Smuzhiyun	vpand		x2,   x1, x1; \
344*4882a593Smuzhiyun	vpxor		tp,   x0, x4; \
345*4882a593Smuzhiyun	vpxor		x4,   x3, x3; \
346*4882a593Smuzhiyun	vpxor		x2,   x4, x4; \
347*4882a593Smuzhiyun	vpxor		x1,   tp, x0; \
348*4882a593Smuzhiyun	vpxor		x0,   x2, x2;
349*4882a593Smuzhiyun
/*
 * SI7_1 / SI7_2: the two halves of the SI7 substitution used on the
 * decryption path, as boolean logic on whole registers (AT&T operand
 * order; "vpxor RNOT, x, x" is a NOT).
 * Expand SI7_1 first, then SI7_2, on the same five registers.
 * SI7_2 reads the value SI7_1 left in tp, so tp must not be modified
 * between the two halves. x4 is written before it is read, so its
 * incoming value is ignored.
 * The callers in this file continue with the registers in the order
 * (x1, x3, x0, x4), with x2 as the next temporary.
 */
350*4882a593Smuzhiyun#define SI7_1(x0, x1, x2, x3, x4)     \
351*4882a593Smuzhiyun	vpand		x0,   x3, tp; \
352*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
353*4882a593Smuzhiyun	vpor		x3,   x2, x2; \
354*4882a593Smuzhiyun	vpxor		x1,   x3, x4; \
355*4882a593Smuzhiyun	vpxor		RNOT, x0, x0; \
356*4882a593Smuzhiyun	vpor		tp,   x1, x1; \
357*4882a593Smuzhiyun	vpxor		x0,   x4, x4; \
358*4882a593Smuzhiyun	vpand		x2,   x0, x0; \
359*4882a593Smuzhiyun	vpxor		x1,   x0, x0;
360*4882a593Smuzhiyun#define SI7_2(x0, x1, x2, x3, x4)     \
361*4882a593Smuzhiyun	vpand		x2,   x1, x1; \
362*4882a593Smuzhiyun	vpxor		x2,   tp, x3; \
363*4882a593Smuzhiyun	vpxor		x3,   x4, x4; \
364*4882a593Smuzhiyun	vpand		x3,   x2, x2; \
365*4882a593Smuzhiyun	vpor		x0,   x3, x3; \
366*4882a593Smuzhiyun	vpxor		x4,   x1, x1; \
367*4882a593Smuzhiyun	vpxor		x4,   x3, x3; \
368*4882a593Smuzhiyun	vpand		x0,   x4, x4; \
369*4882a593Smuzhiyun	vpxor		x2,   x4, x4;
370*4882a593Smuzhiyun
/*
 * get_key(i, j, t): load one 32-bit key word and replicate it into every
 * 32-bit lane of register t.
 *   i - key (round) index; each index covers four consecutive words
 *   j - word within that group, 0..3
 *   t - destination xmm register
 * The word is read from byte offset (4*i + j) * 4 relative to CTX.
 */
371*4882a593Smuzhiyun#define get_key(i, j, t) \
372*4882a593Smuzhiyun	vbroadcastss (4*(i)+(j))*4(CTX), t;
373*4882a593Smuzhiyun
/*
 * K2(x0, x1, x2, x3, x4, i): key mixing only, for both groups.
 * Loads the four key words of index i into RK0..RK3 and XORs them into
 * x0..x3 of group 1 (suffix 1) and then of group 2 (suffix 2).
 * Arguments are register name stems (e.g. RA); the group suffix is
 * token-pasted on. x4 is accepted for a uniform signature but not used.
 * Clobbers RK0..RK3.
 */
374*4882a593Smuzhiyun#define K2(x0, x1, x2, x3, x4, i) \
375*4882a593Smuzhiyun	get_key(i, 0, RK0); \
376*4882a593Smuzhiyun	get_key(i, 1, RK1); \
377*4882a593Smuzhiyun	get_key(i, 2, RK2); \
378*4882a593Smuzhiyun	get_key(i, 3, RK3); \
379*4882a593Smuzhiyun	vpxor RK0,	x0 ## 1, x0 ## 1; \
380*4882a593Smuzhiyun	vpxor RK1,	x1 ## 1, x1 ## 1; \
381*4882a593Smuzhiyun	vpxor RK2,	x2 ## 1, x2 ## 1; \
382*4882a593Smuzhiyun	vpxor RK3,	x3 ## 1, x3 ## 1; \
383*4882a593Smuzhiyun		vpxor RK0,	x0 ## 2, x0 ## 2; \
384*4882a593Smuzhiyun		vpxor RK1,	x1 ## 2, x1 ## 2; \
385*4882a593Smuzhiyun		vpxor RK2,	x2 ## 2, x2 ## 2; \
386*4882a593Smuzhiyun		vpxor RK3,	x3 ## 2, x3 ## 2;
387*4882a593Smuzhiyun
/*
 * LK2(x0, x1, x2, x3, x4, i): linear mixing of the four state words
 * followed by XOR with the key words of index i, for both groups.
 * Arguments are register name stems; the group suffix (1 or 2) is
 * token-pasted on. Lines with the extra indent belong to group 2.
 *
 * Per group, on every 32-bit lane (rotl = rotate left, built from a
 * shift-left / shift-right / or triple with x4 as the temporary):
 *	x0 = rotl(x0, 13);   x2 = rotl(x2, 3);
 *	x1 ^= x0 ^ x2;       x1 = rotl(x1, 1);
 *	x3 ^= x2 ^ (x0 << 3); x3 = rotl(x3, 7);
 *	x0 ^= x1 ^ x3;       x2 ^= x3 ^ (x1 << 7);
 *	x1 ^= RK1;  x3 ^= RK3;
 *	x0 = rotl(x0, 5);    x2 = rotl(x2, 22);
 *	x0 ^= RK0;  x2 ^= RK2;
 *
 * The four get_key() loads are spread between the arithmetic of the two
 * groups; each RKn is loaded before its first use in the final XORs.
 * Clobbers x4 of both groups and RK0..RK3.
 */
388*4882a593Smuzhiyun#define LK2(x0, x1, x2, x3, x4, i) \
389*4882a593Smuzhiyun	vpslld $13,		x0 ## 1, x4 ## 1;          \
390*4882a593Smuzhiyun	vpsrld $(32 - 13),	x0 ## 1, x0 ## 1;          \
391*4882a593Smuzhiyun	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
392*4882a593Smuzhiyun	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
393*4882a593Smuzhiyun	vpslld $3,		x2 ## 1, x4 ## 1;          \
394*4882a593Smuzhiyun	vpsrld $(32 - 3),	x2 ## 1, x2 ## 1;          \
395*4882a593Smuzhiyun	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
396*4882a593Smuzhiyun	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
397*4882a593Smuzhiyun		vpslld $13,		x0 ## 2, x4 ## 2;          \
398*4882a593Smuzhiyun		vpsrld $(32 - 13),	x0 ## 2, x0 ## 2;          \
399*4882a593Smuzhiyun		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
400*4882a593Smuzhiyun		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
401*4882a593Smuzhiyun		vpslld $3,		x2 ## 2, x4 ## 2;          \
402*4882a593Smuzhiyun		vpsrld $(32 - 3),	x2 ## 2, x2 ## 2;          \
403*4882a593Smuzhiyun		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
404*4882a593Smuzhiyun		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
405*4882a593Smuzhiyun	vpslld $1,		x1 ## 1, x4 ## 1;          \
406*4882a593Smuzhiyun	vpsrld $(32 - 1),	x1 ## 1, x1 ## 1;          \
407*4882a593Smuzhiyun	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
408*4882a593Smuzhiyun	vpslld $3,		x0 ## 1, x4 ## 1;          \
409*4882a593Smuzhiyun	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
410*4882a593Smuzhiyun	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
411*4882a593Smuzhiyun	get_key(i, 1, RK1); \
412*4882a593Smuzhiyun		vpslld $1,		x1 ## 2, x4 ## 2;          \
413*4882a593Smuzhiyun		vpsrld $(32 - 1),	x1 ## 2, x1 ## 2;          \
414*4882a593Smuzhiyun		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
415*4882a593Smuzhiyun		vpslld $3,		x0 ## 2, x4 ## 2;          \
416*4882a593Smuzhiyun		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
417*4882a593Smuzhiyun		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
418*4882a593Smuzhiyun		get_key(i, 3, RK3); \
419*4882a593Smuzhiyun	vpslld $7,		x3 ## 1, x4 ## 1;          \
420*4882a593Smuzhiyun	vpsrld $(32 - 7),	x3 ## 1, x3 ## 1;          \
421*4882a593Smuzhiyun	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
422*4882a593Smuzhiyun	vpslld $7,		x1 ## 1, x4 ## 1;          \
423*4882a593Smuzhiyun	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
424*4882a593Smuzhiyun	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
425*4882a593Smuzhiyun	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
426*4882a593Smuzhiyun	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
427*4882a593Smuzhiyun	get_key(i, 0, RK0); \
428*4882a593Smuzhiyun		vpslld $7,		x3 ## 2, x4 ## 2;          \
429*4882a593Smuzhiyun		vpsrld $(32 - 7),	x3 ## 2, x3 ## 2;          \
430*4882a593Smuzhiyun		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
431*4882a593Smuzhiyun		vpslld $7,		x1 ## 2, x4 ## 2;          \
432*4882a593Smuzhiyun		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
433*4882a593Smuzhiyun		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
434*4882a593Smuzhiyun		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
435*4882a593Smuzhiyun		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
436*4882a593Smuzhiyun		get_key(i, 2, RK2); \
437*4882a593Smuzhiyun	vpxor			RK1, x1 ## 1, x1 ## 1;     \
438*4882a593Smuzhiyun	vpxor			RK3, x3 ## 1, x3 ## 1;     \
439*4882a593Smuzhiyun	vpslld $5,		x0 ## 1, x4 ## 1;          \
440*4882a593Smuzhiyun	vpsrld $(32 - 5),	x0 ## 1, x0 ## 1;          \
441*4882a593Smuzhiyun	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
442*4882a593Smuzhiyun	vpslld $22,		x2 ## 1, x4 ## 1;          \
443*4882a593Smuzhiyun	vpsrld $(32 - 22),	x2 ## 1, x2 ## 1;          \
444*4882a593Smuzhiyun	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
445*4882a593Smuzhiyun	vpxor			RK0, x0 ## 1, x0 ## 1;     \
446*4882a593Smuzhiyun	vpxor			RK2, x2 ## 1, x2 ## 1;     \
447*4882a593Smuzhiyun		vpxor			RK1, x1 ## 2, x1 ## 2;     \
448*4882a593Smuzhiyun		vpxor			RK3, x3 ## 2, x3 ## 2;     \
449*4882a593Smuzhiyun		vpslld $5,		x0 ## 2, x4 ## 2;          \
450*4882a593Smuzhiyun		vpsrld $(32 - 5),	x0 ## 2, x0 ## 2;          \
451*4882a593Smuzhiyun		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
452*4882a593Smuzhiyun		vpslld $22,		x2 ## 2, x4 ## 2;          \
453*4882a593Smuzhiyun		vpsrld $(32 - 22),	x2 ## 2, x2 ## 2;          \
454*4882a593Smuzhiyun		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
455*4882a593Smuzhiyun		vpxor			RK0, x0 ## 2, x0 ## 2;     \
456*4882a593Smuzhiyun		vpxor			RK2, x2 ## 2, x2 ## 2;
457*4882a593Smuzhiyun
/*
 * KL2(x0, x1, x2, x3, x4, i): XOR with the key words in RK0..RK3, then
 * the linear mixing run backwards (rotations to the right), for both
 * groups. This is the decryption-side counterpart of LK2().
 * Arguments are register name stems; the group suffix (1 or 2) is
 * token-pasted on. Lines with the extra indent belong to group 2.
 *
 * Unlike LK2(), this macro does not load any key words itself: RK0..RK3
 * must already hold the wanted key (in this file the preceding SP()
 * loads them), and the parameter i is not referenced in the body.
 *
 * Per group, on every 32-bit lane (rotr = rotate right, built from a
 * shift-right / shift-left / or triple with x4 as the temporary):
 *	x0 ^= RK0;  x2 ^= RK2;  x0 = rotr(x0, 5);
 *	x3 ^= RK3;  x1 ^= RK1;  x2 = rotr(x2, 22);
 *	x2 ^= x3 ^ (x1 << 7);   x0 ^= x3 ^ x1;
 *	x1 = rotr(x1, 1);       x3 = rotr(x3, 7);
 *	x1 ^= x0;               x3 ^= (x0 << 3);
 *	x0 = rotr(x0, 13);
 *	x1 ^= x2;  x3 ^= x2;    x2 = rotr(x2, 3);
 * (x1 << 7 is taken before x1 is rotated; x0 << 3 before x0 is rotated.)
 * Clobbers x4 of both groups.
 */
458*4882a593Smuzhiyun#define KL2(x0, x1, x2, x3, x4, i) \
459*4882a593Smuzhiyun	vpxor			RK0, x0 ## 1, x0 ## 1;     \
460*4882a593Smuzhiyun	vpxor			RK2, x2 ## 1, x2 ## 1;     \
461*4882a593Smuzhiyun	vpsrld $5,		x0 ## 1, x4 ## 1;          \
462*4882a593Smuzhiyun	vpslld $(32 - 5),	x0 ## 1, x0 ## 1;          \
463*4882a593Smuzhiyun	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
464*4882a593Smuzhiyun	vpxor			RK3, x3 ## 1, x3 ## 1;     \
465*4882a593Smuzhiyun	vpxor			RK1, x1 ## 1, x1 ## 1;     \
466*4882a593Smuzhiyun	vpsrld $22,		x2 ## 1, x4 ## 1;          \
467*4882a593Smuzhiyun	vpslld $(32 - 22),	x2 ## 1, x2 ## 1;          \
468*4882a593Smuzhiyun	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
469*4882a593Smuzhiyun	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
470*4882a593Smuzhiyun		vpxor			RK0, x0 ## 2, x0 ## 2;     \
471*4882a593Smuzhiyun		vpxor			RK2, x2 ## 2, x2 ## 2;     \
472*4882a593Smuzhiyun		vpsrld $5,		x0 ## 2, x4 ## 2;          \
473*4882a593Smuzhiyun		vpslld $(32 - 5),	x0 ## 2, x0 ## 2;          \
474*4882a593Smuzhiyun		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
475*4882a593Smuzhiyun		vpxor			RK3, x3 ## 2, x3 ## 2;     \
476*4882a593Smuzhiyun		vpxor			RK1, x1 ## 2, x1 ## 2;     \
477*4882a593Smuzhiyun		vpsrld $22,		x2 ## 2, x4 ## 2;          \
478*4882a593Smuzhiyun		vpslld $(32 - 22),	x2 ## 2, x2 ## 2;          \
479*4882a593Smuzhiyun		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
480*4882a593Smuzhiyun		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
481*4882a593Smuzhiyun	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
482*4882a593Smuzhiyun	vpslld $7,		x1 ## 1, x4 ## 1;          \
483*4882a593Smuzhiyun	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
484*4882a593Smuzhiyun	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
485*4882a593Smuzhiyun	vpsrld $1,		x1 ## 1, x4 ## 1;          \
486*4882a593Smuzhiyun	vpslld $(32 - 1),	x1 ## 1, x1 ## 1;          \
487*4882a593Smuzhiyun	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
488*4882a593Smuzhiyun		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
489*4882a593Smuzhiyun		vpslld $7,		x1 ## 2, x4 ## 2;          \
490*4882a593Smuzhiyun		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
491*4882a593Smuzhiyun		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
492*4882a593Smuzhiyun		vpsrld $1,		x1 ## 2, x4 ## 2;          \
493*4882a593Smuzhiyun		vpslld $(32 - 1),	x1 ## 2, x1 ## 2;          \
494*4882a593Smuzhiyun		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
495*4882a593Smuzhiyun	vpsrld $7,		x3 ## 1, x4 ## 1;          \
496*4882a593Smuzhiyun	vpslld $(32 - 7),	x3 ## 1, x3 ## 1;          \
497*4882a593Smuzhiyun	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
498*4882a593Smuzhiyun	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
499*4882a593Smuzhiyun	vpslld $3,		x0 ## 1, x4 ## 1;          \
500*4882a593Smuzhiyun	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
501*4882a593Smuzhiyun		vpsrld $7,		x3 ## 2, x4 ## 2;          \
502*4882a593Smuzhiyun		vpslld $(32 - 7),	x3 ## 2, x3 ## 2;          \
503*4882a593Smuzhiyun		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
504*4882a593Smuzhiyun		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
505*4882a593Smuzhiyun		vpslld $3,		x0 ## 2, x4 ## 2;          \
506*4882a593Smuzhiyun		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
507*4882a593Smuzhiyun	vpsrld $13,		x0 ## 1, x4 ## 1;          \
508*4882a593Smuzhiyun	vpslld $(32 - 13),	x0 ## 1, x0 ## 1;          \
509*4882a593Smuzhiyun	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
510*4882a593Smuzhiyun	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
511*4882a593Smuzhiyun	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
512*4882a593Smuzhiyun	vpsrld $3,		x2 ## 1, x4 ## 1;          \
513*4882a593Smuzhiyun	vpslld $(32 - 3),	x2 ## 1, x2 ## 1;          \
514*4882a593Smuzhiyun	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
515*4882a593Smuzhiyun		vpsrld $13,		x0 ## 2, x4 ## 2;          \
516*4882a593Smuzhiyun		vpslld $(32 - 13),	x0 ## 2, x0 ## 2;          \
517*4882a593Smuzhiyun		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
518*4882a593Smuzhiyun		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
519*4882a593Smuzhiyun		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
520*4882a593Smuzhiyun		vpsrld $3,		x2 ## 2, x4 ## 2;          \
521*4882a593Smuzhiyun		vpslld $(32 - 3),	x2 ## 2, x2 ## 2;          \
522*4882a593Smuzhiyun		vpor			x4 ## 2, x2 ## 2, x2 ## 2;
523*4882a593Smuzhiyun
/*
 * S(SBOX, x0, x1, x2, x3, x4): apply one substitution to both groups.
 * SBOX is a macro name stem (S0..S7 or SI0..SI7); its _1 and _2 halves
 * are expanded back to back, first on the group-1 registers (suffix 1),
 * then on the group-2 registers (suffix 2). x0..x4 are register name
 * stems such as RA.
 * Clobbers tp in addition to the registers passed in.
 */
524*4882a593Smuzhiyun#define S(SBOX, x0, x1, x2, x3, x4) \
525*4882a593Smuzhiyun	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
526*4882a593Smuzhiyun	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
527*4882a593Smuzhiyun	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
528*4882a593Smuzhiyun	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
529*4882a593Smuzhiyun
/*
 * SP(SBOX, x0, x1, x2, x3, x4, i): like S(), but additionally preloads
 * the four key words of index i into RK0..RK3, one load in front of each
 * S-box half. The KL2() that follows consumes RK0..RK3 without loading
 * them itself, so SP() and KL2() are meant to be used as a pair.
 * SBOX is a macro name stem; x0..x4 are register name stems (the group
 * suffix 1 or 2 is token-pasted on).
 * The get_key() loads only write RK0..RK3, so the tp value some S-boxes
 * carry from their _1 half into their _2 half is left intact.
 * Clobbers tp and RK0..RK3 in addition to the registers passed in.
 *
 * The last body line deliberately has no line continuation: a trailing
 * backslash there would splice whatever line follows into this macro.
 */
530*4882a593Smuzhiyun#define SP(SBOX, x0, x1, x2, x3, x4, i) \
531*4882a593Smuzhiyun	get_key(i, 0, RK0); \
532*4882a593Smuzhiyun	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
533*4882a593Smuzhiyun	get_key(i, 2, RK2); \
534*4882a593Smuzhiyun	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
535*4882a593Smuzhiyun	get_key(i, 3, RK3); \
536*4882a593Smuzhiyun	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
537*4882a593Smuzhiyun	get_key(i, 1, RK1); \
538*4882a593Smuzhiyun	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
539*4882a593Smuzhiyun
/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): transpose a 4x4 matrix of
 * 32-bit words held in x0..x3 (one row per register), in place.
 * Afterwards xN holds word N of each of the four original registers, in
 * original register order.
 * First step interleaves 32-bit words of the pairs (x0,x1) and (x2,x3);
 * second step interleaves the resulting 64-bit halves.
 * t0, t1, t2 are clobbered as temporaries; x3 doubles as the fourth
 * temporary between the two steps.
 */
540*4882a593Smuzhiyun#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
541*4882a593Smuzhiyun	vpunpckldq		x1, x0, t0; \
542*4882a593Smuzhiyun	vpunpckhdq		x1, x0, t2; \
543*4882a593Smuzhiyun	vpunpckldq		x3, x2, t1; \
544*4882a593Smuzhiyun	vpunpckhdq		x3, x2, x3; \
545*4882a593Smuzhiyun	\
546*4882a593Smuzhiyun	vpunpcklqdq		t1, t0, x0; \
547*4882a593Smuzhiyun	vpunpckhqdq		t1, t0, x1; \
548*4882a593Smuzhiyun	vpunpcklqdq		x3, t2, x2; \
549*4882a593Smuzhiyun	vpunpckhqdq		x3, t2, x3;
550*4882a593Smuzhiyun
/*
 * read_blocks(): convert four registers that each hold one whole block
 * into the word-sliced layout the round macros work on (register N =
 * word N of all four blocks). Despite the name it performs no memory
 * access; it is exactly transpose_4x4(). t0..t2 are clobbered.
 */
551*4882a593Smuzhiyun#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
552*4882a593Smuzhiyun	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
553*4882a593Smuzhiyun
/*
 * write_blocks(): convert four word-sliced registers back to one whole
 * block per register. Despite the name it performs no memory access; it
 * is exactly transpose_4x4(), which undoes read_blocks(). t0..t2 are
 * clobbered.
 */
554*4882a593Smuzhiyun#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
555*4882a593Smuzhiyun	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
556*4882a593Smuzhiyun
557*4882a593Smuzhiyun.align 8
558*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx)
559*4882a593Smuzhiyun	/* input:
560*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
561*4882a593Smuzhiyun	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
562*4882a593Smuzhiyun	 * output:
563*4882a593Smuzhiyun	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * clobbers (written by the macros expanded below):
	 *	RE1, RE2, tp, RNOT, RK0-RK3
	 *
	 * Local helper: works purely on registers, no stack or memory
	 * writes; key words are read relative to CTX via get_key().
	 *
	 * Flow: RNOT is set to all-ones; each group of four blocks is
	 * transposed into word-sliced form; K2 mixes in key index 0; then
	 * S-boxes S0..S7 are applied cyclically 32 times, each followed by
	 * LK2 with the next key index (1..31) -- except the final S7, which
	 * is followed by a plain K2 with key index 32; finally both groups
	 * are transposed back.
	 *
	 * The register stems passed to S()/LK2() change from line to line
	 * because every S-box leaves its result in a different register
	 * order; the argument lists track that permutation so that the
	 * last K2 finds the state in RA, RB, RC, RD again.
564*4882a593Smuzhiyun	 */
565*4882a593Smuzhiyun
566*4882a593Smuzhiyun	vpcmpeqd RNOT, RNOT, RNOT;
567*4882a593Smuzhiyun
568*4882a593Smuzhiyun	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
569*4882a593Smuzhiyun	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
570*4882a593Smuzhiyun
571*4882a593Smuzhiyun						 K2(RA, RB, RC, RD, RE, 0);
572*4882a593Smuzhiyun	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
573*4882a593Smuzhiyun	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
574*4882a593Smuzhiyun	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
575*4882a593Smuzhiyun	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
576*4882a593Smuzhiyun	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
577*4882a593Smuzhiyun	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
578*4882a593Smuzhiyun	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
579*4882a593Smuzhiyun	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
580*4882a593Smuzhiyun	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
581*4882a593Smuzhiyun	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
582*4882a593Smuzhiyun	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
583*4882a593Smuzhiyun	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
584*4882a593Smuzhiyun	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
585*4882a593Smuzhiyun	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
586*4882a593Smuzhiyun	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
587*4882a593Smuzhiyun	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
588*4882a593Smuzhiyun	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
589*4882a593Smuzhiyun	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
590*4882a593Smuzhiyun	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
591*4882a593Smuzhiyun	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
592*4882a593Smuzhiyun	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
593*4882a593Smuzhiyun	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
594*4882a593Smuzhiyun	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
595*4882a593Smuzhiyun	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
596*4882a593Smuzhiyun	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
597*4882a593Smuzhiyun	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
598*4882a593Smuzhiyun	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
599*4882a593Smuzhiyun	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
600*4882a593Smuzhiyun	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
601*4882a593Smuzhiyun	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
602*4882a593Smuzhiyun	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
603*4882a593Smuzhiyun	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);
604*4882a593Smuzhiyun
605*4882a593Smuzhiyun	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
606*4882a593Smuzhiyun	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
607*4882a593Smuzhiyun
608*4882a593Smuzhiyun	RET;
609*4882a593SmuzhiyunSYM_FUNC_END(__serpent_enc_blk8_avx)
610*4882a593Smuzhiyun
/*
 * __serpent_dec_blk8_avx - decryption core shared by the *_dec_* entry
 * points of this file, which reach it with a plain `call`.
 *
 * Works on eight block registers arranged as two sets of four (suffix 1
 * and suffix 2). The result is NOT left in the input registers RA..RD:
 * it comes back in RC, RD, RB, RE of each set, so callers must store
 * that order.
 */
611*4882a593Smuzhiyun.align 8
612*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx)
613*4882a593Smuzhiyun	/* input:
614*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
615*4882a593Smuzhiyun	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
616*4882a593Smuzhiyun	 * output:
617*4882a593Smuzhiyun	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
618*4882a593Smuzhiyun	 */
619*4882a593Smuzhiyun
	/*
	 * A register compared for equality with itself matches in every
	 * lane, so this sets RNOT to all ones.
	 * NOTE(review): presumably used as the NOT mask by the S-box
	 * macros - confirm against their definitions.
	 */
620*4882a593Smuzhiyun	vpcmpeqd RNOT, RNOT, RNOT;
621*4882a593Smuzhiyun
	/*
	 * Applied once per four-register set, with RK0-RK2 as the extra
	 * operands.
	 * NOTE(review): presumably rearranges the four blocks of a set
	 * into the layout the round macros expect, using RK0-RK2 as
	 * scratch - confirm against the read_blocks definition.
	 */
622*4882a593Smuzhiyun	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
623*4882a593Smuzhiyun	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
624*4882a593Smuzhiyun
	/*
	 * Round ladder, walked from index 32 down to 0. The selector
	 * cycles SI7..SI0 four times (32 steps in total). Every SP/S
	 * step takes its registers in exactly the order named by the
	 * KL2/K2 on the line above it, which is why the A..E names
	 * rotate from line to line. The last step uses S (no index
	 * argument) and a plain K2; the first four registers of that
	 * final K2 (RC, RD, RB, RE) are the ones written back below.
	 * NOTE(review): K2 presumably mixes in the key material for the
	 * given index and KL2 additionally applies the inverse linear
	 * transform - confirm against the macro definitions.
	 */
625*4882a593Smuzhiyun						 K2(RA, RB, RC, RD, RE, 32);
626*4882a593Smuzhiyun	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
627*4882a593Smuzhiyun	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
628*4882a593Smuzhiyun	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
629*4882a593Smuzhiyun	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
630*4882a593Smuzhiyun	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
631*4882a593Smuzhiyun	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
632*4882a593Smuzhiyun	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
633*4882a593Smuzhiyun	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
634*4882a593Smuzhiyun	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
635*4882a593Smuzhiyun	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
636*4882a593Smuzhiyun	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
637*4882a593Smuzhiyun	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
638*4882a593Smuzhiyun	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
639*4882a593Smuzhiyun	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
640*4882a593Smuzhiyun	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
641*4882a593Smuzhiyun	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
642*4882a593Smuzhiyun	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
643*4882a593Smuzhiyun	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
644*4882a593Smuzhiyun	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
645*4882a593Smuzhiyun	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
646*4882a593Smuzhiyun	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
647*4882a593Smuzhiyun	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
648*4882a593Smuzhiyun	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
649*4882a593Smuzhiyun	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
650*4882a593Smuzhiyun	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
651*4882a593Smuzhiyun	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
652*4882a593Smuzhiyun	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
653*4882a593Smuzhiyun	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
654*4882a593Smuzhiyun	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
655*4882a593Smuzhiyun	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
656*4882a593Smuzhiyun	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
657*4882a593Smuzhiyun	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);
658*4882a593Smuzhiyun
	/*
	 * Counterpart of the read_blocks step, applied to the final
	 * register order RC, RD, RB, RE of each set.
	 * NOTE(review): presumably converts back to one block per
	 * register - confirm against the write_blocks definition.
	 */
659*4882a593Smuzhiyun	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
660*4882a593Smuzhiyun	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
661*4882a593Smuzhiyun
662*4882a593Smuzhiyun	RET;
663*4882a593SmuzhiyunSYM_FUNC_END(__serpent_dec_blk8_avx)
664*4882a593Smuzhiyun
/*
 * serpent_ecb_enc_8way_avx - global entry point that runs eight block
 * registers through the encryption core.
 *
 * Sequence: load_8way from src, call __serpent_enc_blk8_avx, store_8way
 * to dst. The ctx pointer in %rdi is not touched here; it is simply
 * still in place when the core is called.
 */
665*4882a593SmuzhiyunSYM_FUNC_START(serpent_ecb_enc_8way_avx)
666*4882a593Smuzhiyun	/* input:
667*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
668*4882a593Smuzhiyun	 *	%rsi: dst
669*4882a593Smuzhiyun	 *	%rdx: src
670*4882a593Smuzhiyun	 */
	/*
	 * NOTE(review): FRAME_BEGIN/FRAME_END presumably come from
	 * <asm/frame.h> and bracket the body because it makes a call -
	 * confirm.
	 */
671*4882a593Smuzhiyun	FRAME_BEGIN
672*4882a593Smuzhiyun
	/* Hand load_8way the src pointer and the core's eight input registers. */
673*4882a593Smuzhiyun	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
674*4882a593Smuzhiyun
675*4882a593Smuzhiyun	call __serpent_enc_blk8_avx;
676*4882a593Smuzhiyun
	/*
	 * The encryption core finishes with write_blocks on RA..RD of each
	 * set, so the same register order that was loaded is stored.
	 */
677*4882a593Smuzhiyun	store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
678*4882a593Smuzhiyun
679*4882a593Smuzhiyun	FRAME_END
680*4882a593Smuzhiyun	RET;
681*4882a593SmuzhiyunSYM_FUNC_END(serpent_ecb_enc_8way_avx)
682*4882a593Smuzhiyun
/*
 * serpent_ecb_dec_8way_avx - global entry point that runs eight block
 * registers through the decryption core.
 *
 * Sequence: load_8way from src, call __serpent_dec_blk8_avx, store_8way
 * to dst. The ctx pointer in %rdi is not touched here; it is simply
 * still in place when the core is called.
 */
683*4882a593SmuzhiyunSYM_FUNC_START(serpent_ecb_dec_8way_avx)
684*4882a593Smuzhiyun	/* input:
685*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
686*4882a593Smuzhiyun	 *	%rsi: dst
687*4882a593Smuzhiyun	 *	%rdx: src
688*4882a593Smuzhiyun	 */
	/*
	 * NOTE(review): FRAME_BEGIN/FRAME_END presumably come from
	 * <asm/frame.h> and bracket the body because it makes a call -
	 * confirm.
	 */
689*4882a593Smuzhiyun	FRAME_BEGIN
690*4882a593Smuzhiyun
	/* Hand load_8way the src pointer and the core's eight input registers. */
691*4882a593Smuzhiyun	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
692*4882a593Smuzhiyun
693*4882a593Smuzhiyun	call __serpent_dec_blk8_avx;
694*4882a593Smuzhiyun
	/*
	 * The decryption core documents its output as RC, RD, RB, RE of
	 * each set, not RA..RD; the store order must follow that.
	 */
695*4882a593Smuzhiyun	store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
696*4882a593Smuzhiyun
697*4882a593Smuzhiyun	FRAME_END
698*4882a593Smuzhiyun	RET;
699*4882a593SmuzhiyunSYM_FUNC_END(serpent_ecb_dec_8way_avx)
700*4882a593Smuzhiyun
/*
 * serpent_cbc_dec_8way_avx - global entry point for the CBC decryption
 * path over eight block registers.
 *
 * Same load and core call as the ECB decryption entry point; only the
 * store step differs, using store_cbc_8way, which receives src as well
 * as dst.
 */
701*4882a593SmuzhiyunSYM_FUNC_START(serpent_cbc_dec_8way_avx)
702*4882a593Smuzhiyun	/* input:
703*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
704*4882a593Smuzhiyun	 *	%rsi: dst
705*4882a593Smuzhiyun	 *	%rdx: src
706*4882a593Smuzhiyun	 */
	/*
	 * NOTE(review): FRAME_BEGIN/FRAME_END presumably come from
	 * <asm/frame.h> and bracket the body because it makes a call -
	 * confirm.
	 */
707*4882a593Smuzhiyun	FRAME_BEGIN
708*4882a593Smuzhiyun
	/* Hand load_8way the src pointer and the core's eight input registers. */
709*4882a593Smuzhiyun	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
710*4882a593Smuzhiyun
711*4882a593Smuzhiyun	call __serpent_dec_blk8_avx;
712*4882a593Smuzhiyun
	/*
	 * Registers are passed in the decryption core's output order
	 * (RC, RD, RB, RE of each set).
	 * NOTE(review): store_cbc_8way presumably XORs the decrypted
	 * blocks with ciphertext blocks re-read from src before writing
	 * to dst; how the first block's IV is handled is not visible
	 * here - confirm in glue_helper-asm-avx.S and the C caller.
	 */
713*4882a593Smuzhiyun	store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
714*4882a593Smuzhiyun
715*4882a593Smuzhiyun	FRAME_END
716*4882a593Smuzhiyun	RET;
717*4882a593SmuzhiyunSYM_FUNC_END(serpent_cbc_dec_8way_avx)
718*4882a593Smuzhiyun
/*
 * serpent_ctr_8way_avx - global entry point for the CTR path over eight
 * block registers.
 *
 * The registers are filled by load_ctr_8way from the iv pointer, not
 * from src, and are then run through the ENCRYPTION core. src is only
 * consumed by the final store_ctr_8way step.
 */
719*4882a593SmuzhiyunSYM_FUNC_START(serpent_ctr_8way_avx)
720*4882a593Smuzhiyun	/* input:
721*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
722*4882a593Smuzhiyun	 *	%rsi: dst
723*4882a593Smuzhiyun	 *	%rdx: src
724*4882a593Smuzhiyun	 *	%rcx: iv (little endian, 128bit)
725*4882a593Smuzhiyun	 */
	/*
	 * NOTE(review): FRAME_BEGIN/FRAME_END presumably come from
	 * <asm/frame.h> and bracket the body because it makes a call -
	 * confirm.
	 */
726*4882a593Smuzhiyun	FRAME_BEGIN
727*4882a593Smuzhiyun
	/*
	 * Arguments: iv pointer, the 16-byte constant .Lbswap128_mask
	 * (byte indices 15 down to 0), the eight block registers, then
	 * RK0-RK2.
	 * NOTE(review): presumably derives eight consecutive counter
	 * values, byte-swapped via the mask, with RK0-RK2 as scratch,
	 * and advances the stored iv - confirm in
	 * glue_helper-asm-avx.S.
	 */
728*4882a593Smuzhiyun	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
729*4882a593Smuzhiyun		      RD2, RK0, RK1, RK2);
730*4882a593Smuzhiyun
731*4882a593Smuzhiyun	call __serpent_enc_blk8_avx;
732*4882a593Smuzhiyun
	/*
	 * Registers are passed in the encryption core's RA..RD order.
	 * NOTE(review): store_ctr_8way presumably XORs them with the
	 * data at src and writes the result to dst - confirm in
	 * glue_helper-asm-avx.S.
	 */
733*4882a593Smuzhiyun	store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
734*4882a593Smuzhiyun
735*4882a593Smuzhiyun	FRAME_END
736*4882a593Smuzhiyun	RET;
737*4882a593SmuzhiyunSYM_FUNC_END(serpent_ctr_8way_avx)
738*4882a593Smuzhiyun
/*
 * serpent_xts_enc_8way_avx - global entry point for the XTS encryption
 * path over eight block registers.
 *
 * The dst buffer doubles as temporary storage: load_xts_8way parks the
 * per-block IVs there, and store_xts_8way reads them back and
 * overwrites them with the final output.
 */
739*4882a593SmuzhiyunSYM_FUNC_START(serpent_xts_enc_8way_avx)
740*4882a593Smuzhiyun	/* input:
741*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
742*4882a593Smuzhiyun	 *	%rsi: dst
743*4882a593Smuzhiyun	 *	%rdx: src
744*4882a593Smuzhiyun	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
745*4882a593Smuzhiyun	 */
	/*
	 * NOTE(review): FRAME_BEGIN/FRAME_END presumably come from
	 * <asm/frame.h> and bracket the body because it makes a call -
	 * confirm.
	 */
746*4882a593Smuzhiyun	FRAME_BEGIN
747*4882a593Smuzhiyun
	/*
	 * Arguments: iv, src and dst pointers, the eight block registers,
	 * RK0-RK2, and the .Lxts_gf128mul_and_shl1_mask constant.
	 * NOTE(review): RK0-RK2 are presumably scratch and the mask is
	 * presumably what steps the IV from one block to the next -
	 * confirm in glue_helper-asm-avx.S.
	 */
748*4882a593Smuzhiyun	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
749*4882a593Smuzhiyun	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
750*4882a593Smuzhiyun		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
751*4882a593Smuzhiyun
752*4882a593Smuzhiyun	call __serpent_enc_blk8_avx;
753*4882a593Smuzhiyun
	/* Registers are passed in the encryption core's RA..RD order. */
754*4882a593Smuzhiyun	/* dst <= regs xor IVs(in dst) */
755*4882a593Smuzhiyun	store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
756*4882a593Smuzhiyun
757*4882a593Smuzhiyun	FRAME_END
758*4882a593Smuzhiyun	RET;
759*4882a593SmuzhiyunSYM_FUNC_END(serpent_xts_enc_8way_avx)
760*4882a593Smuzhiyun
/*
 * serpent_xts_dec_8way_avx - global entry point for the XTS decryption
 * path over eight block registers.
 *
 * The dst buffer doubles as temporary storage: load_xts_8way parks the
 * per-block IVs there, and store_xts_8way reads them back and
 * overwrites them with the final output.
 */
761*4882a593SmuzhiyunSYM_FUNC_START(serpent_xts_dec_8way_avx)
762*4882a593Smuzhiyun	/* input:
763*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
764*4882a593Smuzhiyun	 *	%rsi: dst
765*4882a593Smuzhiyun	 *	%rdx: src
766*4882a593Smuzhiyun	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
767*4882a593Smuzhiyun	 */
	/*
	 * NOTE(review): FRAME_BEGIN/FRAME_END presumably come from
	 * <asm/frame.h> and bracket the body because it makes a call -
	 * confirm.
	 */
768*4882a593Smuzhiyun	FRAME_BEGIN
769*4882a593Smuzhiyun
	/*
	 * Arguments: iv, src and dst pointers, the eight block registers,
	 * RK0-RK2, and the .Lxts_gf128mul_and_shl1_mask constant.
	 * NOTE(review): RK0-RK2 are presumably scratch and the mask is
	 * presumably what steps the IV from one block to the next -
	 * confirm in glue_helper-asm-avx.S.
	 */
770*4882a593Smuzhiyun	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
771*4882a593Smuzhiyun	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
772*4882a593Smuzhiyun		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
773*4882a593Smuzhiyun
774*4882a593Smuzhiyun	call __serpent_dec_blk8_avx;
775*4882a593Smuzhiyun
	/*
	 * The decryption core documents its output as RC, RD, RB, RE of
	 * each set, not RA..RD; the store order must follow that.
	 */
776*4882a593Smuzhiyun	/* dst <= regs xor IVs(in dst) */
777*4882a593Smuzhiyun	store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
778*4882a593Smuzhiyun
779*4882a593Smuzhiyun	FRAME_END
780*4882a593Smuzhiyun	RET;
781*4882a593SmuzhiyunSYM_FUNC_END(serpent_xts_dec_8way_avx)
782