xref: /OK3568_Linux_fs/kernel/arch/x86/crypto/serpent-avx2-asm_64.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * x86_64/AVX2 assembler optimized version of Serpent
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6*4882a593Smuzhiyun *
7*4882a593Smuzhiyun * Based on AVX assembler implementation of Serpent by:
8*4882a593Smuzhiyun *  Copyright © 2012 Johannes Goetzfried
9*4882a593Smuzhiyun *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
10*4882a593Smuzhiyun */
11*4882a593Smuzhiyun
12*4882a593Smuzhiyun#include <linux/linkage.h>
13*4882a593Smuzhiyun#include <asm/frame.h>
14*4882a593Smuzhiyun#include "glue_helper-asm-avx2.S"
15*4882a593Smuzhiyun
16*4882a593Smuzhiyun.file "serpent-avx2-asm_64.S"
17*4882a593Smuzhiyun
/*
 * Three 16-byte constants, each placed in its own mergeable
 * .rodata.cst16.* section (flags "aM", entity size 16) and aligned to 16.
 * None of them is referenced in the part of the file shown here.
 */

/*
 * Byte indices 15..0 in descending order.
 * NOTE(review): presumably a byte-shuffle control that reverses the 16 bytes
 * of a 128-bit value - confirm against the user of .Lbswap128_mask.
 */
18*4882a593Smuzhiyun.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
19*4882a593Smuzhiyun.align 16
20*4882a593Smuzhiyun.Lbswap128_mask:
21*4882a593Smuzhiyun	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
22*4882a593Smuzhiyun
/*
 * Two 64-bit little-endian halves: 0x87 and 1.
 * NOTE(review): going by the label this is a mask for the XTS tweak
 * update (GF(2^128) multiply and shift left by one) - confirm at the user.
 */
23*4882a593Smuzhiyun.section	.rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16
24*4882a593Smuzhiyun.align 16
25*4882a593Smuzhiyun.Lxts_gf128mul_and_shl1_mask_0:
26*4882a593Smuzhiyun	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
27*4882a593Smuzhiyun
/*
 * Two 64-bit little-endian halves: 0x10e (= 0x87 << 1) and 2, i.e. the
 * values of mask_0 doubled.
 * NOTE(review): presumably the companion mask for a second shift step of
 * the XTS tweak update - confirm at the user.
 */
28*4882a593Smuzhiyun.section	.rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16
29*4882a593Smuzhiyun.align 16
30*4882a593Smuzhiyun.Lxts_gf128mul_and_shl1_mask_1:
31*4882a593Smuzhiyun	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
32*4882a593Smuzhiyun
/* Everything below is code; the register aliases are preprocessor names. */
33*4882a593Smuzhiyun.text
34*4882a593Smuzhiyun
/* Base pointer for all key loads done by get_key(); the functions below
 * document it as "%rdi: ctx". */
35*4882a593Smuzhiyun#define CTX %rdi
36*4882a593Smuzhiyun
/*
 * RNOT: set to all-ones with "vpcmpeqd RNOT, RNOT, RNOT" at function entry;
 *       the S-box macros XOR with it to complement a register.
 * tp:   scratch register written and read only by the S-box macros.
 */
37*4882a593Smuzhiyun#define RNOT %ymm0
38*4882a593Smuzhiyun#define tp  %ymm1
39*4882a593Smuzhiyun
/*
 * Data registers. There are two independent sets, suffixed 1 and 2. The
 * round macros receive the suffix-less name (RA, RB, ...) and select a set
 * by token pasting ("x0 ## 1", "x0 ## 2"), so every step is emitted once per
 * set. RA..RD hold the four state words; RE is the fifth working register
 * the S-box and linear-transform macros need.
 */
40*4882a593Smuzhiyun#define RA1 %ymm2
41*4882a593Smuzhiyun#define RA2 %ymm3
42*4882a593Smuzhiyun#define RB1 %ymm4
43*4882a593Smuzhiyun#define RB2 %ymm5
44*4882a593Smuzhiyun#define RC1 %ymm6
45*4882a593Smuzhiyun#define RC2 %ymm7
46*4882a593Smuzhiyun#define RD1 %ymm8
47*4882a593Smuzhiyun#define RD2 %ymm9
48*4882a593Smuzhiyun#define RE1 %ymm10
49*4882a593Smuzhiyun#define RE2 %ymm11
50*4882a593Smuzhiyun
/*
 * Key registers: get_key() broadcasts one 32-bit key word into each of
 * these. The functions below also pass RK0..RK2 to read_blocks() and
 * write_blocks() as transpose temporaries.
 */
51*4882a593Smuzhiyun#define RK0 %ymm12
52*4882a593Smuzhiyun#define RK1 %ymm13
53*4882a593Smuzhiyun#define RK2 %ymm14
54*4882a593Smuzhiyun#define RK3 %ymm15
55*4882a593Smuzhiyun
/* 128-bit (xmm) names of the same four key registers; not used in the
 * part of the file shown here. */
56*4882a593Smuzhiyun#define RK0x %xmm12
57*4882a593Smuzhiyun#define RK1x %xmm13
58*4882a593Smuzhiyun#define RK2x %xmm14
59*4882a593Smuzhiyun#define RK3x %xmm15
60*4882a593Smuzhiyun
/*
 * S0_1 / S0_2: the two halves of S-box S0, expanded by S() as
 * SBOX ## _1 followed by SBOX ## _2 on the same five registers.
 * The body is a straight-line sequence of vpand/vpor/vpxor (AT&T order:
 * the last operand is the destination), so it acts on every bit position
 * of the registers independently.
 *   x0..x3  the four input words
 *   x4      extra register; written before it is read
 *   tp      scratch, used only inside S0_1
 *   RNOT    must be all-ones (XOR with it is a bitwise NOT)
 * In __serpent_enc_blk16 the LK2 that follows S(S0, ...) is handed the
 * registers bound to (x2, x1, x3, x0) as its four words and the x4
 * register as scratch.
 */
61*4882a593Smuzhiyun#define S0_1(x0, x1, x2, x3, x4)      \
62*4882a593Smuzhiyun	vpor		x0,   x3, tp; \
63*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
64*4882a593Smuzhiyun	vpxor		x2,   x3, x4; \
65*4882a593Smuzhiyun	vpxor		RNOT, x4, x4; \
66*4882a593Smuzhiyun	vpxor		x1,   tp, x3; \
67*4882a593Smuzhiyun	vpand		x0,   x1, x1; \
68*4882a593Smuzhiyun	vpxor		x4,   x1, x1; \
69*4882a593Smuzhiyun	vpxor		x0,   x2, x2;
/* Second half of S0: continues from the register state S0_1 leaves. */
70*4882a593Smuzhiyun#define S0_2(x0, x1, x2, x3, x4)      \
71*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
72*4882a593Smuzhiyun	vpor		x0,   x4, x4; \
73*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
74*4882a593Smuzhiyun	vpand		x1,   x2, x2; \
75*4882a593Smuzhiyun	vpxor		x2,   x3, x3; \
76*4882a593Smuzhiyun	vpxor		RNOT, x1, x1; \
77*4882a593Smuzhiyun	vpxor		x4,   x2, x2; \
78*4882a593Smuzhiyun	vpxor		x2,   x1, x1;
79*4882a593Smuzhiyun
/*
 * S1_1 / S1_2: the two halves of S-box S1, expanded by S() as
 * SBOX ## _1 followed by SBOX ## _2 on the same five registers.
 * Pure vpand/vpor/vpxor sequence (AT&T order: last operand is written).
 *   x0..x3  the four input words
 *   x4      extra register; written before it is read
 *   tp      scratch, used only inside S1_1
 *   RNOT    must be all-ones (XOR with it is a bitwise NOT)
 * In __serpent_enc_blk16 the LK2 that follows S(S1, ...) is handed the
 * registers bound to (x4, x2, x3, x0) as its four words and the x1
 * register as scratch.
 */
80*4882a593Smuzhiyun#define S1_1(x0, x1, x2, x3, x4)      \
81*4882a593Smuzhiyun	vpxor		x0,   x1, tp; \
82*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
83*4882a593Smuzhiyun	vpxor		RNOT, x3, x3; \
84*4882a593Smuzhiyun	vpand		tp,   x1, x4; \
85*4882a593Smuzhiyun	vpor		tp,   x0, x0; \
86*4882a593Smuzhiyun	vpxor		x2,   x3, x3; \
87*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
88*4882a593Smuzhiyun	vpxor		x3,   tp, x1;
/* Second half of S1: continues from the register state S1_1 leaves. */
89*4882a593Smuzhiyun#define S1_2(x0, x1, x2, x3, x4)      \
90*4882a593Smuzhiyun	vpxor		x4,   x3, x3; \
91*4882a593Smuzhiyun	vpor		x4,   x1, x1; \
92*4882a593Smuzhiyun	vpxor		x2,   x4, x4; \
93*4882a593Smuzhiyun	vpand		x0,   x2, x2; \
94*4882a593Smuzhiyun	vpxor		x1,   x2, x2; \
95*4882a593Smuzhiyun	vpor		x0,   x1, x1; \
96*4882a593Smuzhiyun	vpxor		RNOT, x0, x0; \
97*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
98*4882a593Smuzhiyun	vpxor		x1,   x4, x4;
99*4882a593Smuzhiyun
/*
 * S2_1 / S2_2: the two halves of S-box S2, expanded by S() as
 * SBOX ## _1 followed by SBOX ## _2 on the same five registers.
 * Pure vpand/vpor/vpxor sequence (AT&T order: last operand is written).
 *   x0..x3  the four input words
 *   x4      extra register; first touched (as a destination) in S2_2
 *   tp      scratch that is carried from S2_1 into S2_2 - nothing between
 *           the two halves may overwrite it
 *   RNOT    must be all-ones (XOR with it is a bitwise NOT)
 * In __serpent_enc_blk16 the LK2 that follows S(S2, ...) is handed the
 * registers bound to (x4, x1, x0, x3) as its four words and the x2
 * register as scratch.
 */
100*4882a593Smuzhiyun#define S2_1(x0, x1, x2, x3, x4)      \
101*4882a593Smuzhiyun	vpxor		RNOT, x3, x3; \
102*4882a593Smuzhiyun	vpxor		x0,   x1, x1; \
103*4882a593Smuzhiyun	vpand		x2,   x0, tp; \
104*4882a593Smuzhiyun	vpxor		x3,   tp, tp; \
105*4882a593Smuzhiyun	vpor		x0,   x3, x3; \
106*4882a593Smuzhiyun	vpxor		x1,   x2, x2; \
107*4882a593Smuzhiyun	vpxor		x1,   x3, x3; \
108*4882a593Smuzhiyun	vpand		tp,   x1, x1;
/* Second half of S2: reads tp as left by S2_1. */
109*4882a593Smuzhiyun#define S2_2(x0, x1, x2, x3, x4)      \
110*4882a593Smuzhiyun	vpxor		x2,   tp, tp; \
111*4882a593Smuzhiyun	vpand		x3,   x2, x2; \
112*4882a593Smuzhiyun	vpor		x1,   x3, x3; \
113*4882a593Smuzhiyun	vpxor		RNOT, tp, tp; \
114*4882a593Smuzhiyun	vpxor		tp,   x3, x3; \
115*4882a593Smuzhiyun	vpxor		tp,   x0, x4; \
116*4882a593Smuzhiyun	vpxor		x2,   tp, x0; \
117*4882a593Smuzhiyun	vpor		x2,   x1, x1;
118*4882a593Smuzhiyun
/*
 * S3_1 / S3_2: the two halves of S-box S3, expanded by S() as
 * SBOX ## _1 followed by SBOX ## _2 on the same five registers.
 * Pure vpand/vpor/vpxor sequence (AT&T order: last operand is written);
 * unlike most of the other S-box macros it does not use RNOT.
 *   x0..x3  the four input words
 *   x4      extra register; written before it is read
 *   tp      scratch, used only inside S3_1
 * In __serpent_enc_blk16 the LK2 that follows S(S3, ...) is handed the
 * registers bound to (x3, x4, x1, x0) as its four words and the x2
 * register as scratch.
 */
119*4882a593Smuzhiyun#define S3_1(x0, x1, x2, x3, x4)      \
120*4882a593Smuzhiyun	vpxor		x3,   x1, tp; \
121*4882a593Smuzhiyun	vpor		x0,   x3, x3; \
122*4882a593Smuzhiyun	vpand		x0,   x1, x4; \
123*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
124*4882a593Smuzhiyun	vpxor		tp,   x2, x2; \
125*4882a593Smuzhiyun	vpand		x3,   tp, x1; \
126*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
127*4882a593Smuzhiyun	vpor		x4,   x0, x0; \
128*4882a593Smuzhiyun	vpxor		x3,   x4, x4;
/* Second half of S3: continues from the register state S3_1 leaves. */
129*4882a593Smuzhiyun#define S3_2(x0, x1, x2, x3, x4)      \
130*4882a593Smuzhiyun	vpxor		x0,   x1, x1; \
131*4882a593Smuzhiyun	vpand		x3,   x0, x0; \
132*4882a593Smuzhiyun	vpand		x4,   x3, x3; \
133*4882a593Smuzhiyun	vpxor		x2,   x3, x3; \
134*4882a593Smuzhiyun	vpor		x1,   x4, x4; \
135*4882a593Smuzhiyun	vpand		x1,   x2, x2; \
136*4882a593Smuzhiyun	vpxor		x3,   x4, x4; \
137*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
138*4882a593Smuzhiyun	vpxor		x2,   x3, x3;
139*4882a593Smuzhiyun
/*
 * S4_1 / S4_2: the two halves of S-box S4, expanded by S() as
 * SBOX ## _1 followed by SBOX ## _2 on the same five registers.
 * Pure vpand/vpor/vpxor sequence (AT&T order: last operand is written).
 *   x0..x3  the four input words
 *   x4      extra register; written before it is read
 *   tp      scratch that is carried from S4_1 into S4_2 - nothing between
 *           the two halves may overwrite it
 *   RNOT    must be all-ones (XOR with it is a bitwise NOT)
 * In __serpent_enc_blk16 the LK2 that follows S(S4, ...) is handed the
 * registers bound to (x1, x2, x3, x4) as its four words and the x0
 * register as scratch.
 */
140*4882a593Smuzhiyun#define S4_1(x0, x1, x2, x3, x4)      \
141*4882a593Smuzhiyun	vpand		x0,   x3, tp; \
142*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
143*4882a593Smuzhiyun	vpxor		x2,   tp, tp; \
144*4882a593Smuzhiyun	vpor		x3,   x2, x2; \
145*4882a593Smuzhiyun	vpxor		x1,   x0, x0; \
146*4882a593Smuzhiyun	vpxor		tp,   x3, x4; \
147*4882a593Smuzhiyun	vpor		x0,   x2, x2; \
148*4882a593Smuzhiyun	vpxor		x1,   x2, x2;
/* Second half of S4: reads tp as left by S4_1. */
149*4882a593Smuzhiyun#define S4_2(x0, x1, x2, x3, x4)      \
150*4882a593Smuzhiyun	vpand		x0,   x1, x1; \
151*4882a593Smuzhiyun	vpxor		x4,   x1, x1; \
152*4882a593Smuzhiyun	vpand		x2,   x4, x4; \
153*4882a593Smuzhiyun	vpxor		tp,   x2, x2; \
154*4882a593Smuzhiyun	vpxor		x0,   x4, x4; \
155*4882a593Smuzhiyun	vpor		x1,   tp, x3; \
156*4882a593Smuzhiyun	vpxor		RNOT, x1, x1; \
157*4882a593Smuzhiyun	vpxor		x0,   x3, x3;
158*4882a593Smuzhiyun
/*
 * S5_1 / S5_2: the two halves of S-box S5, expanded by S() as
 * SBOX ## _1 followed by SBOX ## _2 on the same five registers.
 * Pure vpand/vpor/vpxor sequence (AT&T order: last operand is written).
 *   x0..x3  the four input words
 *   x4      extra register; written before it is read
 *   tp      scratch, used only inside S5_1
 *   RNOT    must be all-ones (XOR with it is a bitwise NOT)
 * In __serpent_enc_blk16 the LK2 that follows S(S5, ...) is handed the
 * registers bound to (x4, x0, x1, x3) as its four words and the x2
 * register as scratch.
 */
159*4882a593Smuzhiyun#define S5_1(x0, x1, x2, x3, x4)      \
160*4882a593Smuzhiyun	vpor		x0,   x1, tp; \
161*4882a593Smuzhiyun	vpxor		tp,   x2, x2; \
162*4882a593Smuzhiyun	vpxor		RNOT, x3, x3; \
163*4882a593Smuzhiyun	vpxor		x0,   x1, x4; \
164*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
165*4882a593Smuzhiyun	vpand		x4,   tp, x1; \
166*4882a593Smuzhiyun	vpor		x3,   x4, x4; \
167*4882a593Smuzhiyun	vpxor		x0,   x4, x4;
/* Second half of S5: continues from the register state S5_1 leaves. */
168*4882a593Smuzhiyun#define S5_2(x0, x1, x2, x3, x4)      \
169*4882a593Smuzhiyun	vpand		x3,   x0, x0; \
170*4882a593Smuzhiyun	vpxor		x3,   x1, x1; \
171*4882a593Smuzhiyun	vpxor		x2,   x3, x3; \
172*4882a593Smuzhiyun	vpxor		x1,   x0, x0; \
173*4882a593Smuzhiyun	vpand		x4,   x2, x2; \
174*4882a593Smuzhiyun	vpxor		x2,   x1, x1; \
175*4882a593Smuzhiyun	vpand		x0,   x2, x2; \
176*4882a593Smuzhiyun	vpxor		x2,   x3, x3;
177*4882a593Smuzhiyun
/*
 * S6_1 / S6_2: the two halves of S-box S6, expanded by S() as
 * SBOX ## _1 followed by SBOX ## _2 on the same five registers.
 * Pure vpand/vpor/vpxor sequence (AT&T order: last operand is written).
 *   x0..x3  the four input words
 *   x4      extra register; written before it is read
 *   tp      scratch, used only inside S6_1
 *   RNOT    must be all-ones (XOR with it is a bitwise NOT)
 * In __serpent_enc_blk16 the LK2 that follows S(S6, ...) is handed the
 * registers bound to (x2, x4, x1, x3) as its four words and the x0
 * register as scratch.
 */
178*4882a593Smuzhiyun#define S6_1(x0, x1, x2, x3, x4)      \
179*4882a593Smuzhiyun	vpxor		x0,   x3, x3; \
180*4882a593Smuzhiyun	vpxor		x2,   x1, tp; \
181*4882a593Smuzhiyun	vpxor		x0,   x2, x2; \
182*4882a593Smuzhiyun	vpand		x3,   x0, x0; \
183*4882a593Smuzhiyun	vpor		x3,   tp, tp; \
184*4882a593Smuzhiyun	vpxor		RNOT, x1, x4; \
185*4882a593Smuzhiyun	vpxor		tp,   x0, x0; \
186*4882a593Smuzhiyun	vpxor		x2,   tp, x1;
/* Second half of S6: continues from the register state S6_1 leaves. */
187*4882a593Smuzhiyun#define S6_2(x0, x1, x2, x3, x4)      \
188*4882a593Smuzhiyun	vpxor		x4,   x3, x3; \
189*4882a593Smuzhiyun	vpxor		x0,   x4, x4; \
190*4882a593Smuzhiyun	vpand		x0,   x2, x2; \
191*4882a593Smuzhiyun	vpxor		x1,   x4, x4; \
192*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
193*4882a593Smuzhiyun	vpand		x1,   x3, x3; \
194*4882a593Smuzhiyun	vpxor		x0,   x3, x3; \
195*4882a593Smuzhiyun	vpxor		x2,   x1, x1;
196*4882a593Smuzhiyun
/*
 * S7_1 / S7_2: the two halves of S-box S7, expanded by S() as
 * SBOX ## _1 followed by SBOX ## _2 on the same five registers.
 * Pure vpand/vpor/vpxor sequence (AT&T order: last operand is written).
 *   x0..x3  the four input words
 *   x4      extra register; written before it is read
 *   tp      scratch, used only inside S7_1
 *   RNOT    must be all-ones (XOR with it is a bitwise NOT)
 * In __serpent_enc_blk16 the step that follows S(S7, ...) (LK2, or the
 * final K2) is handed the registers bound to (x4, x2, x3, x0) as its four
 * words and the x1 register as its fifth operand.
 */
197*4882a593Smuzhiyun#define S7_1(x0, x1, x2, x3, x4)      \
198*4882a593Smuzhiyun	vpxor		RNOT, x1, tp; \
199*4882a593Smuzhiyun	vpxor		RNOT, x0, x0; \
200*4882a593Smuzhiyun	vpand		x2,   tp, x1; \
201*4882a593Smuzhiyun	vpxor		x3,   x1, x1; \
202*4882a593Smuzhiyun	vpor		tp,   x3, x3; \
203*4882a593Smuzhiyun	vpxor		x2,   tp, x4; \
204*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
205*4882a593Smuzhiyun	vpxor		x0,   x3, x3; \
206*4882a593Smuzhiyun	vpor		x1,   x0, x0;
/* Second half of S7: continues from the register state S7_1 leaves. */
207*4882a593Smuzhiyun#define S7_2(x0, x1, x2, x3, x4)      \
208*4882a593Smuzhiyun	vpand		x0,   x2, x2; \
209*4882a593Smuzhiyun	vpxor		x4,   x0, x0; \
210*4882a593Smuzhiyun	vpxor		x3,   x4, x4; \
211*4882a593Smuzhiyun	vpand		x0,   x3, x3; \
212*4882a593Smuzhiyun	vpxor		x1,   x4, x4; \
213*4882a593Smuzhiyun	vpxor		x4,   x2, x2; \
214*4882a593Smuzhiyun	vpxor		x1,   x3, x3; \
215*4882a593Smuzhiyun	vpor		x0,   x4, x4; \
216*4882a593Smuzhiyun	vpxor		x1,   x4, x4;
217*4882a593Smuzhiyun
/*
 * SI0_1 / SI0_2: the two halves of S-box SI0, used by __serpent_dec_blk16
 * through SP() (presumably the inverse of S0 - the decryption path applies
 * it where encryption applied S0).
 * Pure vpand/vpor/vpxor sequence (AT&T order: last operand is written).
 *   x0..x3  the four input words
 *   x4      extra register; written before it is read
 *   tp      scratch, used only inside SI0_1
 *   RNOT    must be all-ones (XOR with it is a bitwise NOT)
 * In __serpent_dec_blk16 the KL2 that follows SP(SI0, ...) is handed the
 * registers bound to (x2, x4, x1, x0) as its four words and the x3
 * register as scratch.
 */
218*4882a593Smuzhiyun#define SI0_1(x0, x1, x2, x3, x4)     \
219*4882a593Smuzhiyun	vpxor		x0,   x1, x1; \
220*4882a593Smuzhiyun	vpor		x1,   x3, tp; \
221*4882a593Smuzhiyun	vpxor		x1,   x3, x4; \
222*4882a593Smuzhiyun	vpxor		RNOT, x0, x0; \
223*4882a593Smuzhiyun	vpxor		tp,   x2, x2; \
224*4882a593Smuzhiyun	vpxor		x0,   tp, x3; \
225*4882a593Smuzhiyun	vpand		x1,   x0, x0; \
226*4882a593Smuzhiyun	vpxor		x2,   x0, x0;
/* Second half of SI0: continues from the register state SI0_1 leaves. */
227*4882a593Smuzhiyun#define SI0_2(x0, x1, x2, x3, x4)     \
228*4882a593Smuzhiyun	vpand		x3,   x2, x2; \
229*4882a593Smuzhiyun	vpxor		x4,   x3, x3; \
230*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
231*4882a593Smuzhiyun	vpxor		x3,   x1, x1; \
232*4882a593Smuzhiyun	vpand		x0,   x3, x3; \
233*4882a593Smuzhiyun	vpxor		x0,   x1, x1; \
234*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
235*4882a593Smuzhiyun	vpxor		x3,   x4, x4;
236*4882a593Smuzhiyun
/*
 * SI1_1 / SI1_2: the two halves of S-box SI1, used by __serpent_dec_blk16
 * through SP() (presumably the inverse of S1).
 * Pure vpand/vpor/vpxor sequence (AT&T order: last operand is written).
 *   x0..x3  the four input words
 *   x4      extra register; written before it is read
 *   tp      scratch that is carried from SI1_1 into SI1_2 - nothing between
 *           the two halves may overwrite it
 *   RNOT    must be all-ones (XOR with it is a bitwise NOT)
 * In __serpent_dec_blk16 the KL2 that follows SP(SI1, ...) is handed the
 * registers bound to (x4, x1, x2, x3) as its four words and the x0
 * register as scratch.
 */
237*4882a593Smuzhiyun#define SI1_1(x0, x1, x2, x3, x4)     \
238*4882a593Smuzhiyun	vpxor		x3,   x1, x1; \
239*4882a593Smuzhiyun	vpxor		x2,   x0, tp; \
240*4882a593Smuzhiyun	vpxor		RNOT, x2, x2; \
241*4882a593Smuzhiyun	vpor		x1,   x0, x4; \
242*4882a593Smuzhiyun	vpxor		x3,   x4, x4; \
243*4882a593Smuzhiyun	vpand		x1,   x3, x3; \
244*4882a593Smuzhiyun	vpxor		x2,   x1, x1; \
245*4882a593Smuzhiyun	vpand		x4,   x2, x2;
/* Second half of SI1: reads tp as left by SI1_1. */
246*4882a593Smuzhiyun#define SI1_2(x0, x1, x2, x3, x4)     \
247*4882a593Smuzhiyun	vpxor		x1,   x4, x4; \
248*4882a593Smuzhiyun	vpor		x3,   x1, x1; \
249*4882a593Smuzhiyun	vpxor		tp,   x3, x3; \
250*4882a593Smuzhiyun	vpxor		tp,   x2, x2; \
251*4882a593Smuzhiyun	vpor		x4,   tp, x0; \
252*4882a593Smuzhiyun	vpxor		x4,   x2, x2; \
253*4882a593Smuzhiyun	vpxor		x0,   x1, x1; \
254*4882a593Smuzhiyun	vpxor		x1,   x4, x4;
255*4882a593Smuzhiyun
/*
 * SI2_1 / SI2_2: the two halves of S-box SI2, used by __serpent_dec_blk16
 * through SP() (presumably the inverse of S2).
 * Pure vpand/vpor/vpxor sequence (AT&T order: last operand is written).
 *   x0..x3  the four input words
 *   x4      extra register; written before it is read
 *   tp      scratch, used only inside SI2_1
 *   RNOT    must be all-ones (XOR with it is a bitwise NOT)
 * In __serpent_dec_blk16 the KL2 that follows SP(SI2, ...) is handed the
 * registers bound to (x1, x4, x3, x2) as its four words and the x0
 * register as scratch.
 */
256*4882a593Smuzhiyun#define SI2_1(x0, x1, x2, x3, x4)     \
257*4882a593Smuzhiyun	vpxor		x1,   x2, x2; \
258*4882a593Smuzhiyun	vpxor		RNOT, x3, tp; \
259*4882a593Smuzhiyun	vpor		x2,   tp, tp; \
260*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
261*4882a593Smuzhiyun	vpxor		x0,   x3, x4; \
262*4882a593Smuzhiyun	vpxor		x1,   tp, x3; \
263*4882a593Smuzhiyun	vpor		x2,   x1, x1; \
264*4882a593Smuzhiyun	vpxor		x0,   x2, x2;
/* Second half of SI2: continues from the register state SI2_1 leaves. */
265*4882a593Smuzhiyun#define SI2_2(x0, x1, x2, x3, x4)     \
266*4882a593Smuzhiyun	vpxor		x4,   x1, x1; \
267*4882a593Smuzhiyun	vpor		x3,   x4, x4; \
268*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
269*4882a593Smuzhiyun	vpxor		x2,   x4, x4; \
270*4882a593Smuzhiyun	vpand		x1,   x2, x2; \
271*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
272*4882a593Smuzhiyun	vpxor		x4,   x3, x3; \
273*4882a593Smuzhiyun	vpxor		x0,   x4, x4;
274*4882a593Smuzhiyun
/*
 * SI3_1 / SI3_2: the two halves of S-box SI3, used by __serpent_dec_blk16
 * through SP() (presumably the inverse of S3).
 * Pure vpand/vpor/vpxor sequence (AT&T order: last operand is written);
 * unlike most of the other S-box macros it does not use RNOT.
 *   x0..x3  the four input words
 *   x4      extra register; written before it is read
 *   tp      scratch, used only inside SI3_1
 * In __serpent_dec_blk16 the KL2 that follows SP(SI3, ...) is handed the
 * registers bound to (x2, x0, x4, x3) as its four words and the x1
 * register as scratch.
 */
275*4882a593Smuzhiyun#define SI3_1(x0, x1, x2, x3, x4)     \
276*4882a593Smuzhiyun	vpxor		x1,   x2, x2; \
277*4882a593Smuzhiyun	vpand		x2,   x1, tp; \
278*4882a593Smuzhiyun	vpxor		x0,   tp, tp; \
279*4882a593Smuzhiyun	vpor		x1,   x0, x0; \
280*4882a593Smuzhiyun	vpxor		x3,   x1, x4; \
281*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
282*4882a593Smuzhiyun	vpor		tp,   x3, x3; \
283*4882a593Smuzhiyun	vpxor		x2,   tp, x1;
/* Second half of SI3: continues from the register state SI3_1 leaves. */
284*4882a593Smuzhiyun#define SI3_2(x0, x1, x2, x3, x4)     \
285*4882a593Smuzhiyun	vpxor		x3,   x1, x1; \
286*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
287*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
288*4882a593Smuzhiyun	vpand		x1,   x3, x3; \
289*4882a593Smuzhiyun	vpxor		x0,   x1, x1; \
290*4882a593Smuzhiyun	vpand		x2,   x0, x0; \
291*4882a593Smuzhiyun	vpxor		x3,   x4, x4; \
292*4882a593Smuzhiyun	vpxor		x0,   x3, x3; \
293*4882a593Smuzhiyun	vpxor		x1,   x0, x0;
294*4882a593Smuzhiyun
/*
 * SI4_1 / SI4_2: the two halves of S-box SI4, used by __serpent_dec_blk16
 * through SP() (presumably the inverse of S4).
 * Pure vpand/vpor/vpxor sequence (AT&T order: last operand is written).
 *   x0..x3  the four input words
 *   x4      extra register; written before it is read
 *   tp      scratch, used only inside SI4_1
 *   RNOT    must be all-ones (XOR with it is a bitwise NOT)
 * In __serpent_dec_blk16 the KL2 that follows SP(SI4, ...) is handed the
 * registers bound to (x0, x2, x4, x3) as its four words and the x1
 * register as scratch.
 */
295*4882a593Smuzhiyun#define SI4_1(x0, x1, x2, x3, x4)     \
296*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
297*4882a593Smuzhiyun	vpand		x1,   x0, tp; \
298*4882a593Smuzhiyun	vpxor		x2,   tp, tp; \
299*4882a593Smuzhiyun	vpor		x3,   x2, x2; \
300*4882a593Smuzhiyun	vpxor		RNOT, x0, x4; \
301*4882a593Smuzhiyun	vpxor		tp,   x1, x1; \
302*4882a593Smuzhiyun	vpxor		x2,   tp, x0; \
303*4882a593Smuzhiyun	vpand		x4,   x2, x2;
/* Second half of SI4: continues from the register state SI4_1 leaves. */
304*4882a593Smuzhiyun#define SI4_2(x0, x1, x2, x3, x4)     \
305*4882a593Smuzhiyun	vpxor		x0,   x2, x2; \
306*4882a593Smuzhiyun	vpor		x4,   x0, x0; \
307*4882a593Smuzhiyun	vpxor		x3,   x0, x0; \
308*4882a593Smuzhiyun	vpand		x2,   x3, x3; \
309*4882a593Smuzhiyun	vpxor		x3,   x4, x4; \
310*4882a593Smuzhiyun	vpxor		x1,   x3, x3; \
311*4882a593Smuzhiyun	vpand		x0,   x1, x1; \
312*4882a593Smuzhiyun	vpxor		x1,   x4, x4; \
313*4882a593Smuzhiyun	vpxor		x3,   x0, x0;
314*4882a593Smuzhiyun
/*
 * SI5_1 / SI5_2: the two halves of S-box SI5, used by __serpent_dec_blk16
 * through SP() (presumably the inverse of S5).
 * Pure vpand/vpor/vpxor sequence (AT&T order: last operand is written).
 *   x0..x3  the four input words
 *   x4      extra register; first touched (as a destination) in SI5_2
 *   tp      scratch that is carried from SI5_1 into SI5_2 - nothing between
 *           the two halves may overwrite it
 *   RNOT    must be all-ones (XOR with it is a bitwise NOT)
 * In __serpent_dec_blk16 the KL2 that follows SP(SI5, ...) is handed the
 * registers bound to (x1, x4, x0, x2) as its four words and the x3
 * register as scratch.
 */
315*4882a593Smuzhiyun#define SI5_1(x0, x1, x2, x3, x4)     \
316*4882a593Smuzhiyun	vpor		x2,   x1, tp; \
317*4882a593Smuzhiyun	vpxor		x1,   x2, x2; \
318*4882a593Smuzhiyun	vpxor		x3,   tp, tp; \
319*4882a593Smuzhiyun	vpand		x1,   x3, x3; \
320*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
321*4882a593Smuzhiyun	vpor		x0,   x3, x3; \
322*4882a593Smuzhiyun	vpxor		RNOT, x0, x0; \
323*4882a593Smuzhiyun	vpxor		x2,   x3, x3; \
324*4882a593Smuzhiyun	vpor		x0,   x2, x2;
/* Second half of SI5: reads tp as left by SI5_1. */
325*4882a593Smuzhiyun#define SI5_2(x0, x1, x2, x3, x4)     \
326*4882a593Smuzhiyun	vpxor		tp,   x1, x4; \
327*4882a593Smuzhiyun	vpxor		x4,   x2, x2; \
328*4882a593Smuzhiyun	vpand		x0,   x4, x4; \
329*4882a593Smuzhiyun	vpxor		tp,   x0, x0; \
330*4882a593Smuzhiyun	vpxor		x3,   tp, x1; \
331*4882a593Smuzhiyun	vpand		x2,   x0, x0; \
332*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
333*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
334*4882a593Smuzhiyun	vpxor		x4,   x2, x2; \
335*4882a593Smuzhiyun	vpxor		x3,   x4, x4;
336*4882a593Smuzhiyun
/*
 * SI6_1 / SI6_2: the two halves of S-box SI6, used by __serpent_dec_blk16
 * through SP() (presumably the inverse of S6).
 * Pure vpand/vpor/vpxor sequence (AT&T order: last operand is written).
 *   x0..x3  the four input words
 *   x4      extra register; first touched (as a destination) in SI6_2
 *   tp      scratch that is carried from SI6_1 into SI6_2 - nothing between
 *           the two halves may overwrite it
 *   RNOT    must be all-ones (XOR with it is a bitwise NOT)
 * In __serpent_dec_blk16 the KL2 that follows SP(SI6, ...) is handed the
 * registers bound to (x2, x4, x3, x0) as its four words and the x1
 * register as scratch.
 */
337*4882a593Smuzhiyun#define SI6_1(x0, x1, x2, x3, x4)     \
338*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
339*4882a593Smuzhiyun	vpand		x3,   x0, tp; \
340*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
341*4882a593Smuzhiyun	vpxor		x2,   tp, tp; \
342*4882a593Smuzhiyun	vpxor		x1,   x3, x3; \
343*4882a593Smuzhiyun	vpor		x0,   x2, x2; \
344*4882a593Smuzhiyun	vpxor		x3,   x2, x2; \
345*4882a593Smuzhiyun	vpand		tp,   x3, x3;
/* Second half of SI6: reads tp as left by SI6_1. */
346*4882a593Smuzhiyun#define SI6_2(x0, x1, x2, x3, x4)     \
347*4882a593Smuzhiyun	vpxor		RNOT, tp, tp; \
348*4882a593Smuzhiyun	vpxor		x1,   x3, x3; \
349*4882a593Smuzhiyun	vpand		x2,   x1, x1; \
350*4882a593Smuzhiyun	vpxor		tp,   x0, x4; \
351*4882a593Smuzhiyun	vpxor		x4,   x3, x3; \
352*4882a593Smuzhiyun	vpxor		x2,   x4, x4; \
353*4882a593Smuzhiyun	vpxor		x1,   tp, x0; \
354*4882a593Smuzhiyun	vpxor		x0,   x2, x2;
355*4882a593Smuzhiyun
/*
 * SI7_1 / SI7_2: the two halves of S-box SI7, used by __serpent_dec_blk16
 * through SP() (presumably the inverse of S7).
 * Pure vpand/vpor/vpxor sequence (AT&T order: last operand is written).
 *   x0..x3  the four input words
 *   x4      extra register; written before it is read
 *   tp      scratch that is carried from SI7_1 into SI7_2 - nothing between
 *           the two halves may overwrite it
 *   RNOT    must be all-ones (XOR with it is a bitwise NOT)
 * In __serpent_dec_blk16 the KL2 that follows SP(SI7, ...) is handed the
 * registers bound to (x1, x3, x0, x4) as its four words and the x2
 * register as scratch.
 */
356*4882a593Smuzhiyun#define SI7_1(x0, x1, x2, x3, x4)     \
357*4882a593Smuzhiyun	vpand		x0,   x3, tp; \
358*4882a593Smuzhiyun	vpxor		x2,   x0, x0; \
359*4882a593Smuzhiyun	vpor		x3,   x2, x2; \
360*4882a593Smuzhiyun	vpxor		x1,   x3, x4; \
361*4882a593Smuzhiyun	vpxor		RNOT, x0, x0; \
362*4882a593Smuzhiyun	vpor		tp,   x1, x1; \
363*4882a593Smuzhiyun	vpxor		x0,   x4, x4; \
364*4882a593Smuzhiyun	vpand		x2,   x0, x0; \
365*4882a593Smuzhiyun	vpxor		x1,   x0, x0;
/* Second half of SI7: reads tp as left by SI7_1. */
366*4882a593Smuzhiyun#define SI7_2(x0, x1, x2, x3, x4)     \
367*4882a593Smuzhiyun	vpand		x2,   x1, x1; \
368*4882a593Smuzhiyun	vpxor		x2,   tp, x3; \
369*4882a593Smuzhiyun	vpxor		x3,   x4, x4; \
370*4882a593Smuzhiyun	vpand		x3,   x2, x2; \
371*4882a593Smuzhiyun	vpor		x0,   x3, x3; \
372*4882a593Smuzhiyun	vpxor		x4,   x1, x1; \
373*4882a593Smuzhiyun	vpxor		x4,   x3, x3; \
374*4882a593Smuzhiyun	vpand		x0,   x4, x4; \
375*4882a593Smuzhiyun	vpxor		x2,   x4, x4;
376*4882a593Smuzhiyun
/*
 * get_key(i, j, t): broadcast the 32-bit word at byte offset (4*i + j)*4
 * from CTX into every 32-bit element of register t.
 * NOTE(review): this treats the memory at CTX as an array of 32-bit words
 * grouped four per key index i - confirm against the C-side context layout.
 */
377*4882a593Smuzhiyun#define get_key(i,j,t) \
378*4882a593Smuzhiyun	vpbroadcastd (4*(i)+(j))*4(CTX), t;
379*4882a593Smuzhiyun
/*
 * K2(x0, x1, x2, x3, x4, i): key mixing only.
 * Loads the four key words of index i into RK0..RK3 and XORs them into
 * x0..x3, first for register set 1 and then for set 2 (the arguments are
 * suffix-less names; "## 1" / "## 2" pick the set).
 * x4 is accepted for signature symmetry with LK2/KL2 but is not used.
 * Clobbers RK0..RK3.
 */
380*4882a593Smuzhiyun#define K2(x0, x1, x2, x3, x4, i) \
381*4882a593Smuzhiyun	get_key(i, 0, RK0); \
382*4882a593Smuzhiyun	get_key(i, 1, RK1); \
383*4882a593Smuzhiyun	get_key(i, 2, RK2); \
384*4882a593Smuzhiyun	get_key(i, 3, RK3); \
385*4882a593Smuzhiyun	vpxor RK0,	x0 ## 1, x0 ## 1; \
386*4882a593Smuzhiyun	vpxor RK1,	x1 ## 1, x1 ## 1; \
387*4882a593Smuzhiyun	vpxor RK2,	x2 ## 1, x2 ## 1; \
388*4882a593Smuzhiyun	vpxor RK3,	x3 ## 1, x3 ## 1; \
389*4882a593Smuzhiyun		vpxor RK0,	x0 ## 2, x0 ## 2; \
390*4882a593Smuzhiyun		vpxor RK1,	x1 ## 2, x1 ## 2; \
391*4882a593Smuzhiyun		vpxor RK2,	x2 ## 2, x2 ## 2; \
392*4882a593Smuzhiyun		vpxor RK3,	x3 ## 2, x3 ## 2;
393*4882a593Smuzhiyun
/*
 * LK2(x0, x1, x2, x3, x4, i): linear transformation followed by key mixing
 * with key index i, applied to both register sets.
 *
 * Each rotate is built from vpslld + vpsrld + vpor, with x4 holding the
 * shifted-out part; x4 is therefore pure scratch (written before read).
 * Per 32-bit element, in the order the code computes it:
 *	x0 = rol(x0, 13);  x1 ^= x0;
 *	x2 = rol(x2, 3);   x1 ^= x2;
 *	x1 = rol(x1, 1);   x3 ^= x2 ^ (x0 << 3);
 *	x3 = rol(x3, 7);   x0 ^= x1 ^ x3;  x2 ^= x3 ^ (x1 << 7);
 *	x1 ^= RK1;  x3 ^= RK3;
 *	x0 = rol(x0, 5);   x2 = rol(x2, 22);
 *	x0 ^= RK0;  x2 ^= RK2;
 *
 * The work for set 1 (one tab) and set 2 (two tabs) is interleaved in
 * stages, and the four get_key() loads are spread between those stages
 * rather than issued together; all four are complete before the first
 * XOR with an RK register. Clobbers RK0..RK3 and x4 of both sets.
 */
394*4882a593Smuzhiyun#define LK2(x0, x1, x2, x3, x4, i) \
395*4882a593Smuzhiyun	vpslld $13,		x0 ## 1, x4 ## 1;          \
396*4882a593Smuzhiyun	vpsrld $(32 - 13),	x0 ## 1, x0 ## 1;          \
397*4882a593Smuzhiyun	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
398*4882a593Smuzhiyun	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
399*4882a593Smuzhiyun	vpslld $3,		x2 ## 1, x4 ## 1;          \
400*4882a593Smuzhiyun	vpsrld $(32 - 3),	x2 ## 1, x2 ## 1;          \
401*4882a593Smuzhiyun	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
402*4882a593Smuzhiyun	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
403*4882a593Smuzhiyun		vpslld $13,		x0 ## 2, x4 ## 2;          \
404*4882a593Smuzhiyun		vpsrld $(32 - 13),	x0 ## 2, x0 ## 2;          \
405*4882a593Smuzhiyun		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
406*4882a593Smuzhiyun		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
407*4882a593Smuzhiyun		vpslld $3,		x2 ## 2, x4 ## 2;          \
408*4882a593Smuzhiyun		vpsrld $(32 - 3),	x2 ## 2, x2 ## 2;          \
409*4882a593Smuzhiyun		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
410*4882a593Smuzhiyun		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
411*4882a593Smuzhiyun	vpslld $1,		x1 ## 1, x4 ## 1;          \
412*4882a593Smuzhiyun	vpsrld $(32 - 1),	x1 ## 1, x1 ## 1;          \
413*4882a593Smuzhiyun	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
414*4882a593Smuzhiyun	vpslld $3,		x0 ## 1, x4 ## 1;          \
415*4882a593Smuzhiyun	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
416*4882a593Smuzhiyun	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
417*4882a593Smuzhiyun	get_key(i, 1, RK1); \
418*4882a593Smuzhiyun		vpslld $1,		x1 ## 2, x4 ## 2;          \
419*4882a593Smuzhiyun		vpsrld $(32 - 1),	x1 ## 2, x1 ## 2;          \
420*4882a593Smuzhiyun		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
421*4882a593Smuzhiyun		vpslld $3,		x0 ## 2, x4 ## 2;          \
422*4882a593Smuzhiyun		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
423*4882a593Smuzhiyun		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
424*4882a593Smuzhiyun		get_key(i, 3, RK3); \
425*4882a593Smuzhiyun	vpslld $7,		x3 ## 1, x4 ## 1;          \
426*4882a593Smuzhiyun	vpsrld $(32 - 7),	x3 ## 1, x3 ## 1;          \
427*4882a593Smuzhiyun	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
428*4882a593Smuzhiyun	vpslld $7,		x1 ## 1, x4 ## 1;          \
429*4882a593Smuzhiyun	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
430*4882a593Smuzhiyun	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
431*4882a593Smuzhiyun	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
432*4882a593Smuzhiyun	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
433*4882a593Smuzhiyun	get_key(i, 0, RK0); \
434*4882a593Smuzhiyun		vpslld $7,		x3 ## 2, x4 ## 2;          \
435*4882a593Smuzhiyun		vpsrld $(32 - 7),	x3 ## 2, x3 ## 2;          \
436*4882a593Smuzhiyun		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
437*4882a593Smuzhiyun		vpslld $7,		x1 ## 2, x4 ## 2;          \
438*4882a593Smuzhiyun		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
439*4882a593Smuzhiyun		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
440*4882a593Smuzhiyun		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
441*4882a593Smuzhiyun		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
442*4882a593Smuzhiyun		get_key(i, 2, RK2); \
443*4882a593Smuzhiyun	vpxor			RK1, x1 ## 1, x1 ## 1;     \
444*4882a593Smuzhiyun	vpxor			RK3, x3 ## 1, x3 ## 1;     \
445*4882a593Smuzhiyun	vpslld $5,		x0 ## 1, x4 ## 1;          \
446*4882a593Smuzhiyun	vpsrld $(32 - 5),	x0 ## 1, x0 ## 1;          \
447*4882a593Smuzhiyun	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
448*4882a593Smuzhiyun	vpslld $22,		x2 ## 1, x4 ## 1;          \
449*4882a593Smuzhiyun	vpsrld $(32 - 22),	x2 ## 1, x2 ## 1;          \
450*4882a593Smuzhiyun	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
451*4882a593Smuzhiyun	vpxor			RK0, x0 ## 1, x0 ## 1;     \
452*4882a593Smuzhiyun	vpxor			RK2, x2 ## 1, x2 ## 1;     \
453*4882a593Smuzhiyun		vpxor			RK1, x1 ## 2, x1 ## 2;     \
454*4882a593Smuzhiyun		vpxor			RK3, x3 ## 2, x3 ## 2;     \
455*4882a593Smuzhiyun		vpslld $5,		x0 ## 2, x4 ## 2;          \
456*4882a593Smuzhiyun		vpsrld $(32 - 5),	x0 ## 2, x0 ## 2;          \
457*4882a593Smuzhiyun		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
458*4882a593Smuzhiyun		vpslld $22,		x2 ## 2, x4 ## 2;          \
459*4882a593Smuzhiyun		vpsrld $(32 - 22),	x2 ## 2, x2 ## 2;          \
460*4882a593Smuzhiyun		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
461*4882a593Smuzhiyun		vpxor			RK0, x0 ## 2, x0 ## 2;     \
462*4882a593Smuzhiyun		vpxor			RK2, x2 ## 2, x2 ## 2;
463*4882a593Smuzhiyun
/*
 * KL2(x0, x1, x2, x3, x4, i): key mixing followed by the reverse of the
 * LK2 linear transformation, applied to both register sets.
 *
 * This macro does NOT load any key: it XORs with whatever RK0..RK3
 * currently hold, and its i argument is not referenced in the body. In
 * __serpent_dec_blk16 the SP() invocation on the same line loads the four
 * key words for that index beforehand.
 *
 * Each rotate is built from vpsrld + vpslld + vpor, with x4 holding the
 * shifted-out part; x4 is pure scratch (written before read).
 * Per 32-bit element, in the order the code computes it:
 *	x0 ^= RK0;  x2 ^= RK2;  x0 = ror(x0, 5);
 *	x3 ^= RK3;  x1 ^= RK1;  x2 = ror(x2, 22);
 *	x2 ^= x3 ^ (x1 << 7);   x0 ^= x3 ^ x1;
 *	x1 = ror(x1, 1);        x3 = ror(x3, 7);
 *	x1 ^= x0;               x3 ^= (x0 << 3);
 *	x0 = ror(x0, 13);       x1 ^= x2;  x3 ^= x2;
 *	x2 = ror(x2, 3);
 * Work for set 1 (one tab) and set 2 (two tabs) is interleaved in stages.
 */
464*4882a593Smuzhiyun#define KL2(x0, x1, x2, x3, x4, i) \
465*4882a593Smuzhiyun	vpxor			RK0, x0 ## 1, x0 ## 1;     \
466*4882a593Smuzhiyun	vpxor			RK2, x2 ## 1, x2 ## 1;     \
467*4882a593Smuzhiyun	vpsrld $5,		x0 ## 1, x4 ## 1;          \
468*4882a593Smuzhiyun	vpslld $(32 - 5),	x0 ## 1, x0 ## 1;          \
469*4882a593Smuzhiyun	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
470*4882a593Smuzhiyun	vpxor			RK3, x3 ## 1, x3 ## 1;     \
471*4882a593Smuzhiyun	vpxor			RK1, x1 ## 1, x1 ## 1;     \
472*4882a593Smuzhiyun	vpsrld $22,		x2 ## 1, x4 ## 1;          \
473*4882a593Smuzhiyun	vpslld $(32 - 22),	x2 ## 1, x2 ## 1;          \
474*4882a593Smuzhiyun	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
475*4882a593Smuzhiyun	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
476*4882a593Smuzhiyun		vpxor			RK0, x0 ## 2, x0 ## 2;     \
477*4882a593Smuzhiyun		vpxor			RK2, x2 ## 2, x2 ## 2;     \
478*4882a593Smuzhiyun		vpsrld $5,		x0 ## 2, x4 ## 2;          \
479*4882a593Smuzhiyun		vpslld $(32 - 5),	x0 ## 2, x0 ## 2;          \
480*4882a593Smuzhiyun		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
481*4882a593Smuzhiyun		vpxor			RK3, x3 ## 2, x3 ## 2;     \
482*4882a593Smuzhiyun		vpxor			RK1, x1 ## 2, x1 ## 2;     \
483*4882a593Smuzhiyun		vpsrld $22,		x2 ## 2, x4 ## 2;          \
484*4882a593Smuzhiyun		vpslld $(32 - 22),	x2 ## 2, x2 ## 2;          \
485*4882a593Smuzhiyun		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
486*4882a593Smuzhiyun		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
487*4882a593Smuzhiyun	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
488*4882a593Smuzhiyun	vpslld $7,		x1 ## 1, x4 ## 1;          \
489*4882a593Smuzhiyun	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
490*4882a593Smuzhiyun	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
491*4882a593Smuzhiyun	vpsrld $1,		x1 ## 1, x4 ## 1;          \
492*4882a593Smuzhiyun	vpslld $(32 - 1),	x1 ## 1, x1 ## 1;          \
493*4882a593Smuzhiyun	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
494*4882a593Smuzhiyun		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
495*4882a593Smuzhiyun		vpslld $7,		x1 ## 2, x4 ## 2;          \
496*4882a593Smuzhiyun		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
497*4882a593Smuzhiyun		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
498*4882a593Smuzhiyun		vpsrld $1,		x1 ## 2, x4 ## 2;          \
499*4882a593Smuzhiyun		vpslld $(32 - 1),	x1 ## 2, x1 ## 2;          \
500*4882a593Smuzhiyun		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
501*4882a593Smuzhiyun	vpsrld $7,		x3 ## 1, x4 ## 1;          \
502*4882a593Smuzhiyun	vpslld $(32 - 7),	x3 ## 1, x3 ## 1;          \
503*4882a593Smuzhiyun	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
504*4882a593Smuzhiyun	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
505*4882a593Smuzhiyun	vpslld $3,		x0 ## 1, x4 ## 1;          \
506*4882a593Smuzhiyun	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
507*4882a593Smuzhiyun		vpsrld $7,		x3 ## 2, x4 ## 2;          \
508*4882a593Smuzhiyun		vpslld $(32 - 7),	x3 ## 2, x3 ## 2;          \
509*4882a593Smuzhiyun		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
510*4882a593Smuzhiyun		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
511*4882a593Smuzhiyun		vpslld $3,		x0 ## 2, x4 ## 2;          \
512*4882a593Smuzhiyun		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
513*4882a593Smuzhiyun	vpsrld $13,		x0 ## 1, x4 ## 1;          \
514*4882a593Smuzhiyun	vpslld $(32 - 13),	x0 ## 1, x0 ## 1;          \
515*4882a593Smuzhiyun	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
516*4882a593Smuzhiyun	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
517*4882a593Smuzhiyun	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
518*4882a593Smuzhiyun	vpsrld $3,		x2 ## 1, x4 ## 1;          \
519*4882a593Smuzhiyun	vpslld $(32 - 3),	x2 ## 1, x2 ## 1;          \
520*4882a593Smuzhiyun	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
521*4882a593Smuzhiyun		vpsrld $13,		x0 ## 2, x4 ## 2;          \
522*4882a593Smuzhiyun		vpslld $(32 - 13),	x0 ## 2, x0 ## 2;          \
523*4882a593Smuzhiyun		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
524*4882a593Smuzhiyun		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
525*4882a593Smuzhiyun		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
526*4882a593Smuzhiyun		vpsrld $3,		x2 ## 2, x4 ## 2;          \
527*4882a593Smuzhiyun		vpslld $(32 - 3),	x2 ## 2, x2 ## 2;          \
528*4882a593Smuzhiyun		vpor			x4 ## 2, x2 ## 2, x2 ## 2;
529*4882a593Smuzhiyun
/*
 * S(SBOX, x0, x1, x2, x3, x4): apply one S-box to both register sets.
 * SBOX is a macro-name prefix (S0..S7 in __serpent_enc_blk16); the two
 * halves SBOX_1 and SBOX_2 are run back to back on set 1 and then on set 2.
 * Because both sets share the single tp register, each set's two halves
 * are kept together rather than interleaved across sets.
 */
530*4882a593Smuzhiyun#define S(SBOX, x0, x1, x2, x3, x4) \
531*4882a593Smuzhiyun	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
532*4882a593Smuzhiyun	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
533*4882a593Smuzhiyun	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
534*4882a593Smuzhiyun	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
535*4882a593Smuzhiyun
/*
 * SP(SBOX, x0, x1, x2, x3, x4, i): like S(), but additionally loads the
 * four key words of index i into RK0..RK3, one get_key() in front of each
 * S-box half (order: word 0, 2, 3, 1). The S-box halves do not touch the
 * RK registers, so the loads are independent of the surrounding code.
 * The loaded keys are consumed by the KL2() that follows on the same line
 * in __serpent_dec_blk16; KL2 itself loads nothing.
 *
 * NOTE(review): the last body line ends with a line-continuation
 * backslash, so the definition formally extends onto the next source line.
 * That is harmless only while that next line stays empty.
 */
536*4882a593Smuzhiyun#define SP(SBOX, x0, x1, x2, x3, x4, i) \
537*4882a593Smuzhiyun	get_key(i, 0, RK0); \
538*4882a593Smuzhiyun	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
539*4882a593Smuzhiyun	get_key(i, 2, RK2); \
540*4882a593Smuzhiyun	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
541*4882a593Smuzhiyun	get_key(i, 3, RK3); \
542*4882a593Smuzhiyun	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
543*4882a593Smuzhiyun	get_key(i, 1, RK1); \
544*4882a593Smuzhiyun	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
545*4882a593Smuzhiyun
/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): in-place transpose of a 4x4
 * matrix of 32-bit elements whose rows are x0..x3. The unpack instructions
 * work on each 128-bit lane of a ymm register separately, so the two lanes
 * are transposed independently.
 * Step 1 interleaves 32-bit elements of row pairs (x0,x1) and (x2,x3);
 * step 2 interleaves the resulting 64-bit pairs to form the columns.
 * t0, t1, t2 are clobbered; x3 doubles as the fourth temporary before it
 * receives its final value in the last instruction.
 */
546*4882a593Smuzhiyun#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
547*4882a593Smuzhiyun	vpunpckldq		x1, x0, t0; \
548*4882a593Smuzhiyun	vpunpckhdq		x1, x0, t2; \
549*4882a593Smuzhiyun	vpunpckldq		x3, x2, t1; \
550*4882a593Smuzhiyun	vpunpckhdq		x3, x2, x3; \
551*4882a593Smuzhiyun	\
552*4882a593Smuzhiyun	vpunpcklqdq		t1, t0, x0; \
553*4882a593Smuzhiyun	vpunpckhqdq		t1, t0, x1; \
554*4882a593Smuzhiyun	vpunpcklqdq		x3, t2, x2; \
555*4882a593Smuzhiyun	vpunpckhqdq		x3, t2, x3;
556*4882a593Smuzhiyun
/*
 * read_blocks / write_blocks: both are plain aliases for transpose_4x4 with
 * the same arguments. The names mark direction at the call sites:
 * read_blocks is applied on entry to the cipher routines and write_blocks
 * before they return. Neither accesses memory; t0..t2 are clobbered.
 */
557*4882a593Smuzhiyun#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
558*4882a593Smuzhiyun	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
559*4882a593Smuzhiyun
560*4882a593Smuzhiyun#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
561*4882a593Smuzhiyun	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
562*4882a593Smuzhiyun
/*
 * __serpent_enc_blk16: file-local routine that encrypts the data held in
 * RA1..RD1 and RA2..RD2 (eight 256-bit registers = 256 bytes, i.e. sixteen
 * 16-byte blocks) in registers; it performs no loads or stores of data,
 * only key loads relative to CTX.
 * Clobbers: RNOT, tp, RE1, RE2, RK0..RK3.
 */
563*4882a593Smuzhiyun.align 8
564*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(__serpent_enc_blk16)
565*4882a593Smuzhiyun	/* input:
566*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
567*4882a593Smuzhiyun	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
568*4882a593Smuzhiyun	 * output:
569*4882a593Smuzhiyun	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
570*4882a593Smuzhiyun	 */
571*4882a593Smuzhiyun
	/* RNOT = all ones; the S-box macros XOR with it to complement. */
572*4882a593Smuzhiyun	vpcmpeqd RNOT, RNOT, RNOT;
573*4882a593Smuzhiyun
	/*
	 * Transpose each register group so that every register holds the
	 * same word position of its blocks. RK0..RK2 are free to serve as
	 * temporaries here: K2 reloads them right below.
	 */
574*4882a593Smuzhiyun	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
575*4882a593Smuzhiyun	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
576*4882a593Smuzhiyun
	/*
	 * Round schedule: key index 0 is XORed in first, then 32 S-box
	 * applications follow (S0..S7, repeated four times). The first 31
	 * are each followed by LK2 (linear transform + key index 1..31); the
	 * last one is followed by a plain key XOR with index 32.
	 * No register moves are emitted between steps: the operand order of
	 * each LK2 names the registers in which the preceding S-box left its
	 * result, and the next S() continues with that same order.
	 */
576*4882a593Smuzhiyun						 K2(RA, RB, RC, RD, RE, 0);
577*4882a593Smuzhiyun	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
578*4882a593Smuzhiyun	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
579*4882a593Smuzhiyun	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
580*4882a593Smuzhiyun	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
581*4882a593Smuzhiyun	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
582*4882a593Smuzhiyun	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
583*4882a593Smuzhiyun	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
584*4882a593Smuzhiyun	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
585*4882a593Smuzhiyun	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
586*4882a593Smuzhiyun	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
587*4882a593Smuzhiyun	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
588*4882a593Smuzhiyun	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
589*4882a593Smuzhiyun	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
590*4882a593Smuzhiyun	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
591*4882a593Smuzhiyun	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
592*4882a593Smuzhiyun	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
593*4882a593Smuzhiyun	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
594*4882a593Smuzhiyun	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
595*4882a593Smuzhiyun	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
596*4882a593Smuzhiyun	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
597*4882a593Smuzhiyun	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
598*4882a593Smuzhiyun	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
599*4882a593Smuzhiyun	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
600*4882a593Smuzhiyun	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
601*4882a593Smuzhiyun	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
602*4882a593Smuzhiyun	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
603*4882a593Smuzhiyun	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
604*4882a593Smuzhiyun	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
605*4882a593Smuzhiyun	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
606*4882a593Smuzhiyun	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
607*4882a593Smuzhiyun	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
608*4882a593Smuzhiyun	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);
609*4882a593Smuzhiyun
	/*
	 * The register rotation has come full circle: the result is back in
	 * RA..RD. Transpose again to undo the layout set up by read_blocks.
	 */
610*4882a593Smuzhiyun	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
611*4882a593Smuzhiyun	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
612*4882a593Smuzhiyun
613*4882a593Smuzhiyun	RET;
614*4882a593SmuzhiyunSYM_FUNC_END(__serpent_enc_blk16)
616*4882a593Smuzhiyun
617*4882a593Smuzhiyun.align 8
618*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(__serpent_dec_blk16)
619*4882a593Smuzhiyun	/* input:
620*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
621*4882a593Smuzhiyun	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
622*4882a593Smuzhiyun	 * output:
623*4882a593Smuzhiyun	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
624*4882a593Smuzhiyun	 */
625*4882a593Smuzhiyun
	/*
	 * Local (non-exported) decryption core: works purely on the ymm
	 * register aliases, so callers load the data registers beforehand
	 * and pick the result up from the output registers listed above.
	 */

	/*
	 * Comparing RNOT with itself for equality sets every bit, leaving an
	 * all-ones constant in RNOT. Presumably the S-box macros use it as a
	 * NOT mask; their bodies are outside this section - confirm there.
	 */
626*4882a593Smuzhiyun	vpcmpeqd RNOT, RNOT, RNOT;
627*4882a593Smuzhiyun
	/*
	 * read_blocks is applied once per register group (the *1 registers,
	 * then the *2 registers) with RK0-RK2 as extra operands.
	 * NOTE(review): presumably a transpose into the word-sliced layout
	 * with RK0-RK2 as scratch; macro not visible here - confirm.
	 */
627*4882a593Smuzhiyun	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
629*4882a593Smuzhiyun	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
630*4882a593Smuzhiyun
	/*
	 * Round table, walked from index 32 down to 0: a bare K2 with index
	 * 32, then SP/KL2 pairs for indices 31..1 cycling through SI7..SI0,
	 * and finally S(SI0, ...) followed by a bare K2 with index 0.
	 *
	 * The register list of every SP/S step is exactly the list of the
	 * KL2 that precedes it, so the permutation is carried by the macro
	 * arguments from line to line; do not reorder or edit one line in
	 * isolation. The last K2 names RC, RD, RB, RE first, which is the
	 * output order documented in the header comment.
	 *
	 * NOTE(review): the numeric argument is presumably the round-key
	 * index into ctx; K2/KL2/SP/S are defined outside this section.
	 */
631*4882a593Smuzhiyun						 K2(RA, RB, RC, RD, RE, 32);
632*4882a593Smuzhiyun	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
633*4882a593Smuzhiyun	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
634*4882a593Smuzhiyun	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
635*4882a593Smuzhiyun	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
636*4882a593Smuzhiyun	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
637*4882a593Smuzhiyun	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
638*4882a593Smuzhiyun	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
639*4882a593Smuzhiyun	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
640*4882a593Smuzhiyun	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
641*4882a593Smuzhiyun	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
642*4882a593Smuzhiyun	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
643*4882a593Smuzhiyun	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
644*4882a593Smuzhiyun	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
645*4882a593Smuzhiyun	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
646*4882a593Smuzhiyun	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
647*4882a593Smuzhiyun	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
648*4882a593Smuzhiyun	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
649*4882a593Smuzhiyun	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
650*4882a593Smuzhiyun	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
651*4882a593Smuzhiyun	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
652*4882a593Smuzhiyun	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
653*4882a593Smuzhiyun	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
654*4882a593Smuzhiyun	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
655*4882a593Smuzhiyun	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
656*4882a593Smuzhiyun	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
657*4882a593Smuzhiyun	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
658*4882a593Smuzhiyun	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
659*4882a593Smuzhiyun	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
660*4882a593Smuzhiyun	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
661*4882a593Smuzhiyun	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
662*4882a593Smuzhiyun	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
663*4882a593Smuzhiyun	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);
664*4882a593Smuzhiyun
	/*
	 * write_blocks is applied to the registers that now hold the result
	 * (RC, RD, RB, RE of each group), again with RK0-RK2 as extras.
	 * NOTE(review): presumably the inverse of read_blocks - confirm.
	 */
665*4882a593Smuzhiyun	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
666*4882a593Smuzhiyun	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
667*4882a593Smuzhiyun
668*4882a593Smuzhiyun	RET;
669*4882a593SmuzhiyunSYM_FUNC_END(__serpent_dec_blk16)
670*4882a593Smuzhiyun
671*4882a593SmuzhiyunSYM_FUNC_START(serpent_ecb_enc_16way)
672*4882a593Smuzhiyun	/* input:
673*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
674*4882a593Smuzhiyun	 *	%rsi: dst
675*4882a593Smuzhiyun	 *	%rdx: src
676*4882a593Smuzhiyun	 */
	/*
	 * Exported ECB encryption entry point: load the eight data registers
	 * from src, run __serpent_enc_blk16, store the same eight registers
	 * to dst. CTX (%rdi) is not touched here and is passed through to
	 * the core unchanged.
	 */
677*4882a593Smuzhiyun	FRAME_BEGIN
678*4882a593Smuzhiyun
	/*
	 * vzeroupper zeroes bits 128 and above of every ymm register; it is
	 * issued on entry and again before returning. Presumably this avoids
	 * AVX/SSE transition penalties in the surrounding code.
	 */
679*4882a593Smuzhiyun	vzeroupper;
680*4882a593Smuzhiyun
	/*
	 * load_16way/store_16way are not defined in this section; presumably
	 * they come from the included glue_helper-asm-avx2.S - confirm.
	 */
681*4882a593Smuzhiyun	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
682*4882a593Smuzhiyun
683*4882a593Smuzhiyun	call __serpent_enc_blk16;
684*4882a593Smuzhiyun
	/* The encryption core finishes on RA, RB, RC, RD of each group. */
685*4882a593Smuzhiyun	store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
686*4882a593Smuzhiyun
687*4882a593Smuzhiyun	vzeroupper;
688*4882a593Smuzhiyun
689*4882a593Smuzhiyun	FRAME_END
690*4882a593Smuzhiyun	RET;
691*4882a593SmuzhiyunSYM_FUNC_END(serpent_ecb_enc_16way)
692*4882a593Smuzhiyun
693*4882a593SmuzhiyunSYM_FUNC_START(serpent_ecb_dec_16way)
694*4882a593Smuzhiyun	/* input:
695*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
696*4882a593Smuzhiyun	 *	%rsi: dst
697*4882a593Smuzhiyun	 *	%rdx: src
698*4882a593Smuzhiyun	 */
	/*
	 * Exported ECB decryption entry point: load the eight data registers
	 * from src, run __serpent_dec_blk16, store the result registers to
	 * dst. CTX (%rdi) is passed through to the core unchanged.
	 */
699*4882a593Smuzhiyun	FRAME_BEGIN
700*4882a593Smuzhiyun
	/*
	 * Zero the upper ymm halves on entry (and again before returning),
	 * presumably to avoid AVX/SSE transition penalties.
	 */
701*4882a593Smuzhiyun	vzeroupper;
702*4882a593Smuzhiyun
	/* Ciphertext goes in as RA, RB, RC, RD of each register group. */
703*4882a593Smuzhiyun	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
704*4882a593Smuzhiyun
705*4882a593Smuzhiyun	call __serpent_dec_blk16;
706*4882a593Smuzhiyun
	/*
	 * The store order differs from the load order on purpose:
	 * __serpent_dec_blk16 leaves the plaintext in RC, RD, RB, RE.
	 */
707*4882a593Smuzhiyun	store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
708*4882a593Smuzhiyun
709*4882a593Smuzhiyun	vzeroupper;
710*4882a593Smuzhiyun
711*4882a593Smuzhiyun	FRAME_END
712*4882a593Smuzhiyun	RET;
713*4882a593SmuzhiyunSYM_FUNC_END(serpent_ecb_dec_16way)
714*4882a593Smuzhiyun
715*4882a593SmuzhiyunSYM_FUNC_START(serpent_cbc_dec_16way)
716*4882a593Smuzhiyun	/* input:
717*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
718*4882a593Smuzhiyun	 *	%rsi: dst
719*4882a593Smuzhiyun	 *	%rdx: src
720*4882a593Smuzhiyun	 */
	/*
	 * Exported CBC decryption entry point: same load and core call as
	 * the ECB decrypt path; only the store step differs.
	 */
721*4882a593Smuzhiyun	FRAME_BEGIN
722*4882a593Smuzhiyun
	/*
	 * Zero the upper ymm halves on entry (and again before returning),
	 * presumably to avoid AVX/SSE transition penalties.
	 */
723*4882a593Smuzhiyun	vzeroupper;
724*4882a593Smuzhiyun
725*4882a593Smuzhiyun	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
726*4882a593Smuzhiyun
727*4882a593Smuzhiyun	call __serpent_dec_blk16;
728*4882a593Smuzhiyun
	/*
	 * store_cbc_16way receives src as well as dst, the decrypted
	 * registers in the core's output order (RC, RD, RB, RE), and RK0.
	 * NOTE(review): presumably it XORs each decrypted block with the
	 * preceding ciphertext block re-read from src, using RK0 as scratch;
	 * the macro is not defined in this section - confirm, including how
	 * the very first block's chaining value is handled.
	 */
729*4882a593Smuzhiyun	store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
730*4882a593Smuzhiyun			RK0);
731*4882a593Smuzhiyun
732*4882a593Smuzhiyun	vzeroupper;
733*4882a593Smuzhiyun
734*4882a593Smuzhiyun	FRAME_END
735*4882a593Smuzhiyun	RET;
736*4882a593SmuzhiyunSYM_FUNC_END(serpent_cbc_dec_16way)
737*4882a593Smuzhiyun
738*4882a593SmuzhiyunSYM_FUNC_START(serpent_ctr_16way)
739*4882a593Smuzhiyun	/* input:
740*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
741*4882a593Smuzhiyun	 *	%rsi: dst (16 blocks)
742*4882a593Smuzhiyun	 *	%rdx: src (16 blocks)
743*4882a593Smuzhiyun	 *	%rcx: iv (little endian, 128bit)
744*4882a593Smuzhiyun	 */
	/*
	 * Exported CTR entry point. The data registers are filled from the
	 * iv rather than from src, and the *encryption* core is run on them;
	 * src is only handed to the store step.
	 */
745*4882a593Smuzhiyun	FRAME_BEGIN
746*4882a593Smuzhiyun
	/*
	 * Zero the upper ymm halves on entry (and again before returning),
	 * presumably to avoid AVX/SSE transition penalties.
	 */
747*4882a593Smuzhiyun	vzeroupper;
748*4882a593Smuzhiyun
	/*
	 * load_ctr_16way takes the iv pointer, the .Lbswap128_mask constant
	 * (bytes 15..0, i.e. a 16-byte reversal pattern), the eight data
	 * registers, and RK0-RK3 (ymm plus their x-suffixed aliases), RNOT
	 * and tp.
	 * NOTE(review): presumably it derives 16 consecutive counter values
	 * from the iv, byte-swaps them via the mask, and uses the trailing
	 * registers as scratch; whether it also writes the advanced counter
	 * back through %rcx is not visible here - confirm in the macro.
	 */
749*4882a593Smuzhiyun	load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
750*4882a593Smuzhiyun		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
751*4882a593Smuzhiyun		       tp);
752*4882a593Smuzhiyun
753*4882a593Smuzhiyun	call __serpent_enc_blk16;
754*4882a593Smuzhiyun
	/*
	 * store_ctr_16way gets both src and dst plus the core's output
	 * registers. NOTE(review): presumably dst = src XOR keystream.
	 */
755*4882a593Smuzhiyun	store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
756*4882a593Smuzhiyun
757*4882a593Smuzhiyun	vzeroupper;
758*4882a593Smuzhiyun
759*4882a593Smuzhiyun	FRAME_END
760*4882a593Smuzhiyun	RET;
761*4882a593SmuzhiyunSYM_FUNC_END(serpent_ctr_16way)
762*4882a593Smuzhiyun
763*4882a593SmuzhiyunSYM_FUNC_START(serpent_xts_enc_16way)
764*4882a593Smuzhiyun	/* input:
765*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
766*4882a593Smuzhiyun	 *	%rsi: dst (16 blocks)
767*4882a593Smuzhiyun	 *	%rdx: src (16 blocks)
768*4882a593Smuzhiyun	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
769*4882a593Smuzhiyun	 */
	/*
	 * Exported XTS encryption entry point: tweak-aware load, the plain
	 * encryption core, then a tweak-aware store.
	 */
770*4882a593Smuzhiyun	FRAME_BEGIN
771*4882a593Smuzhiyun
	/*
	 * Zero the upper ymm halves on entry (and again before returning),
	 * presumably to avoid AVX/SSE transition penalties.
	 */
772*4882a593Smuzhiyun	vzeroupper;
773*4882a593Smuzhiyun
	/*
	 * load_xts_16way is handed iv, src AND dst, the eight data
	 * registers, RK0-RK3 (ymm plus x-suffixed aliases), RNOT, and the
	 * two gf128mul mask constants declared at the top of the file.
	 * NOTE(review): presumably it computes the per-block tweaks, XORs
	 * them into the loaded blocks and parks them in dst, since
	 * store_xts_16way below is given only dst; confirm in the macro
	 * definition (not in this section).
	 */
774*4882a593Smuzhiyun	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
775*4882a593Smuzhiyun		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
776*4882a593Smuzhiyun		       .Lxts_gf128mul_and_shl1_mask_0,
777*4882a593Smuzhiyun		       .Lxts_gf128mul_and_shl1_mask_1);
778*4882a593Smuzhiyun
779*4882a593Smuzhiyun	call __serpent_enc_blk16;
780*4882a593Smuzhiyun
	/* The encryption core finishes on RA, RB, RC, RD of each group. */
781*4882a593Smuzhiyun	store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
782*4882a593Smuzhiyun
783*4882a593Smuzhiyun	vzeroupper;
784*4882a593Smuzhiyun
785*4882a593Smuzhiyun	FRAME_END
786*4882a593Smuzhiyun	RET;
787*4882a593SmuzhiyunSYM_FUNC_END(serpent_xts_enc_16way)
788*4882a593Smuzhiyun
789*4882a593SmuzhiyunSYM_FUNC_START(serpent_xts_dec_16way)
790*4882a593Smuzhiyun	/* input:
791*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
792*4882a593Smuzhiyun	 *	%rsi: dst (16 blocks)
793*4882a593Smuzhiyun	 *	%rdx: src (16 blocks)
794*4882a593Smuzhiyun	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
795*4882a593Smuzhiyun	 */
	/*
	 * Exported XTS decryption entry point: identical load step to the
	 * XTS encrypt path, but runs the decryption core and stores from the
	 * decryption core's output registers.
	 */
796*4882a593Smuzhiyun	FRAME_BEGIN
797*4882a593Smuzhiyun
	/*
	 * Zero the upper ymm halves on entry (and again before returning),
	 * presumably to avoid AVX/SSE transition penalties.
	 */
798*4882a593Smuzhiyun	vzeroupper;
799*4882a593Smuzhiyun
	/*
	 * load_xts_16way is handed iv, src AND dst, the eight data
	 * registers, RK0-RK3 (ymm plus x-suffixed aliases), RNOT, and the
	 * two gf128mul mask constants declared at the top of the file.
	 * NOTE(review): presumably the per-block tweaks are parked in dst
	 * for store_xts_16way, which is given only dst - confirm in the
	 * macro definition (not in this section).
	 */
800*4882a593Smuzhiyun	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
801*4882a593Smuzhiyun		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
802*4882a593Smuzhiyun		       .Lxts_gf128mul_and_shl1_mask_0,
803*4882a593Smuzhiyun		       .Lxts_gf128mul_and_shl1_mask_1);
804*4882a593Smuzhiyun
805*4882a593Smuzhiyun	call __serpent_dec_blk16;
806*4882a593Smuzhiyun
	/*
	 * Store order follows __serpent_dec_blk16's documented output
	 * registers (RC, RD, RB, RE), not the load order.
	 */
807*4882a593Smuzhiyun	store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
808*4882a593Smuzhiyun
809*4882a593Smuzhiyun	vzeroupper;
810*4882a593Smuzhiyun
811*4882a593Smuzhiyun	FRAME_END
812*4882a593Smuzhiyun	RET;
813*4882a593SmuzhiyunSYM_FUNC_END(serpent_xts_dec_16way)
814