xref: /OK3568_Linux_fs/kernel/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6*4882a593Smuzhiyun *
7*4882a593Smuzhiyun * Based on crypto/serpent.c by
8*4882a593Smuzhiyun *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
9*4882a593Smuzhiyun *                2003 Herbert Valerio Riedel <hvr@gnu.org>
10*4882a593Smuzhiyun */
11*4882a593Smuzhiyun
12*4882a593Smuzhiyun#include <linux/linkage.h>
13*4882a593Smuzhiyun
14*4882a593Smuzhiyun.file "serpent-sse2-x86_64-asm_64.S"
15*4882a593Smuzhiyun.text
16*4882a593Smuzhiyun
/* Pointer to the cipher context; get_key() loads round-key words relative to it. */
17*4882a593Smuzhiyun#define CTX %rdi
18*4882a593Smuzhiyun
19*4882a593Smuzhiyun/**********************************************************************
20*4882a593Smuzhiyun  8-way SSE2 serpent
21*4882a593Smuzhiyun **********************************************************************/
/*
 * Working registers for the first group of four blocks. The macros below take
 * five registers per group (x0..x3 plus an extra x4); which of RA..RE plays
 * which role is permuted from round to round by the call sequences in the
 * two functions at the end of this file.
 */
22*4882a593Smuzhiyun#define RA1 %xmm0
23*4882a593Smuzhiyun#define RB1 %xmm1
24*4882a593Smuzhiyun#define RC1 %xmm2
25*4882a593Smuzhiyun#define RD1 %xmm3
26*4882a593Smuzhiyun#define RE1 %xmm4
27*4882a593Smuzhiyun
/* Same five roles for the second group of four blocks. */
28*4882a593Smuzhiyun#define RA2 %xmm5
29*4882a593Smuzhiyun#define RB2 %xmm6
30*4882a593Smuzhiyun#define RC2 %xmm7
31*4882a593Smuzhiyun#define RD2 %xmm8
32*4882a593Smuzhiyun#define RE2 %xmm9
33*4882a593Smuzhiyun
/*
 * All-ones mask (both functions set it with "pcmpeqd RNOT, RNOT" on entry);
 * "pxor RNOT, x" therefore computes the bitwise NOT of x.
 */
34*4882a593Smuzhiyun#define RNOT %xmm10
35*4882a593Smuzhiyun
/*
 * Round-key words 0..3 as filled by get_key(). RK0..RK2 are also passed as
 * the t0..t2 temporaries of the block read/write/xor macros.
 */
36*4882a593Smuzhiyun#define RK0 %xmm11
37*4882a593Smuzhiyun#define RK1 %xmm12
38*4882a593Smuzhiyun#define RK2 %xmm13
39*4882a593Smuzhiyun#define RK3 %xmm14
40*4882a593Smuzhiyun
/*
 * S0_1 / S0_2: first and second half of the S0 substitution step, expressed
 * as a straight-line sequence of bitwise operations on one register group.
 *   x0..x3: input words, modified in place
 *   x4:     extra register; S0_1 overwrites it before reading it
 * S0_2 continues from the state left by S0_1 on the same registers.
 * The call sites in this file take the result from (x2, x1, x3, x0).
 */
41*4882a593Smuzhiyun#define S0_1(x0, x1, x2, x3, x4) \
42*4882a593Smuzhiyun	movdqa x3,		x4; \
43*4882a593Smuzhiyun	por x0,			x3; \
44*4882a593Smuzhiyun	pxor x4,		x0; \
45*4882a593Smuzhiyun	pxor x2,		x4; \
46*4882a593Smuzhiyun	pxor RNOT,		x4; \
47*4882a593Smuzhiyun	pxor x1,		x3; \
48*4882a593Smuzhiyun	pand x0,		x1; \
49*4882a593Smuzhiyun	pxor x4,		x1; \
50*4882a593Smuzhiyun	pxor x0,		x2;
51*4882a593Smuzhiyun#define S0_2(x0, x1, x2, x3, x4) \
52*4882a593Smuzhiyun	pxor x3,		x0; \
53*4882a593Smuzhiyun	por x0,			x4; \
54*4882a593Smuzhiyun	pxor x2,		x0; \
55*4882a593Smuzhiyun	pand x1,		x2; \
56*4882a593Smuzhiyun	pxor x2,		x3; \
57*4882a593Smuzhiyun	pxor RNOT,		x1; \
58*4882a593Smuzhiyun	pxor x4,		x2; \
59*4882a593Smuzhiyun	pxor x2,		x1;
60*4882a593Smuzhiyun
/*
 * S1_1 / S1_2: first and second half of the S1 substitution step.
 *   x0..x3: input words, modified in place
 *   x4:     extra register; S1_1 overwrites it before reading it
 * S1_2 continues from the state left by S1_1 on the same registers.
 * The call sites in this file take the result from (x4, x2, x3, x0).
 */
61*4882a593Smuzhiyun#define S1_1(x0, x1, x2, x3, x4) \
62*4882a593Smuzhiyun	movdqa x1,		x4; \
63*4882a593Smuzhiyun	pxor x0,		x1; \
64*4882a593Smuzhiyun	pxor x3,		x0; \
65*4882a593Smuzhiyun	pxor RNOT,		x3; \
66*4882a593Smuzhiyun	pand x1,		x4; \
67*4882a593Smuzhiyun	por x1,			x0; \
68*4882a593Smuzhiyun	pxor x2,		x3; \
69*4882a593Smuzhiyun	pxor x3,		x0; \
70*4882a593Smuzhiyun	pxor x3,		x1;
71*4882a593Smuzhiyun#define S1_2(x0, x1, x2, x3, x4) \
72*4882a593Smuzhiyun	pxor x4,		x3; \
73*4882a593Smuzhiyun	por x4,			x1; \
74*4882a593Smuzhiyun	pxor x2,		x4; \
75*4882a593Smuzhiyun	pand x0,		x2; \
76*4882a593Smuzhiyun	pxor x1,		x2; \
77*4882a593Smuzhiyun	por x0,			x1; \
78*4882a593Smuzhiyun	pxor RNOT,		x0; \
79*4882a593Smuzhiyun	pxor x2,		x0; \
80*4882a593Smuzhiyun	pxor x1,		x4;
81*4882a593Smuzhiyun
/*
 * S2_1 / S2_2: first and second half of the S2 substitution step.
 *   x0..x3: input words, modified in place
 *   x4:     extra register; S2_1 overwrites it before reading it
 * S2_2 continues from the state left by S2_1 on the same registers.
 * The call sites in this file take the result from (x4, x1, x0, x3).
 */
82*4882a593Smuzhiyun#define S2_1(x0, x1, x2, x3, x4) \
83*4882a593Smuzhiyun	pxor RNOT,		x3; \
84*4882a593Smuzhiyun	pxor x0,		x1; \
85*4882a593Smuzhiyun	movdqa x0,		x4; \
86*4882a593Smuzhiyun	pand x2,		x0; \
87*4882a593Smuzhiyun	pxor x3,		x0; \
88*4882a593Smuzhiyun	por x4,			x3; \
89*4882a593Smuzhiyun	pxor x1,		x2; \
90*4882a593Smuzhiyun	pxor x1,		x3; \
91*4882a593Smuzhiyun	pand x0,		x1;
92*4882a593Smuzhiyun#define S2_2(x0, x1, x2, x3, x4) \
93*4882a593Smuzhiyun	pxor x2,		x0; \
94*4882a593Smuzhiyun	pand x3,		x2; \
95*4882a593Smuzhiyun	por x1,			x3; \
96*4882a593Smuzhiyun	pxor RNOT,		x0; \
97*4882a593Smuzhiyun	pxor x0,		x3; \
98*4882a593Smuzhiyun	pxor x0,		x4; \
99*4882a593Smuzhiyun	pxor x2,		x0; \
100*4882a593Smuzhiyun	por x2,			x1;
101*4882a593Smuzhiyun
/*
 * S3_1 / S3_2: first and second half of the S3 substitution step.
 *   x0..x3: input words, modified in place
 *   x4:     extra register; S3_1 overwrites it before reading it
 * S3_2 continues from the state left by S3_1 on the same registers.
 * The call sites in this file take the result from (x3, x4, x1, x0).
 */
102*4882a593Smuzhiyun#define S3_1(x0, x1, x2, x3, x4) \
103*4882a593Smuzhiyun	movdqa x1,		x4; \
104*4882a593Smuzhiyun	pxor x3,		x1; \
105*4882a593Smuzhiyun	por x0,			x3; \
106*4882a593Smuzhiyun	pand x0,		x4; \
107*4882a593Smuzhiyun	pxor x2,		x0; \
108*4882a593Smuzhiyun	pxor x1,		x2; \
109*4882a593Smuzhiyun	pand x3,		x1; \
110*4882a593Smuzhiyun	pxor x3,		x2; \
111*4882a593Smuzhiyun	por x4,			x0; \
112*4882a593Smuzhiyun	pxor x3,		x4;
113*4882a593Smuzhiyun#define S3_2(x0, x1, x2, x3, x4) \
114*4882a593Smuzhiyun	pxor x0,		x1; \
115*4882a593Smuzhiyun	pand x3,		x0; \
116*4882a593Smuzhiyun	pand x4,		x3; \
117*4882a593Smuzhiyun	pxor x2,		x3; \
118*4882a593Smuzhiyun	por x1,			x4; \
119*4882a593Smuzhiyun	pand x1,		x2; \
120*4882a593Smuzhiyun	pxor x3,		x4; \
121*4882a593Smuzhiyun	pxor x3,		x0; \
122*4882a593Smuzhiyun	pxor x2,		x3;
123*4882a593Smuzhiyun
/*
 * S4_1 / S4_2: first and second half of the S4 substitution step.
 *   x0..x3: input words, modified in place
 *   x4:     extra register; S4_1 overwrites it before reading it
 * S4_2 continues from the state left by S4_1 on the same registers.
 * The call sites in this file take the result from (x1, x2, x3, x4).
 */
124*4882a593Smuzhiyun#define S4_1(x0, x1, x2, x3, x4) \
125*4882a593Smuzhiyun	movdqa x3,		x4; \
126*4882a593Smuzhiyun	pand x0,		x3; \
127*4882a593Smuzhiyun	pxor x4,		x0; \
128*4882a593Smuzhiyun	pxor x2,		x3; \
129*4882a593Smuzhiyun	por x4,			x2; \
130*4882a593Smuzhiyun	pxor x1,		x0; \
131*4882a593Smuzhiyun	pxor x3,		x4; \
132*4882a593Smuzhiyun	por x0,			x2; \
133*4882a593Smuzhiyun	pxor x1,		x2;
134*4882a593Smuzhiyun#define S4_2(x0, x1, x2, x3, x4) \
135*4882a593Smuzhiyun	pand x0,		x1; \
136*4882a593Smuzhiyun	pxor x4,		x1; \
137*4882a593Smuzhiyun	pand x2,		x4; \
138*4882a593Smuzhiyun	pxor x3,		x2; \
139*4882a593Smuzhiyun	pxor x0,		x4; \
140*4882a593Smuzhiyun	por x1,			x3; \
141*4882a593Smuzhiyun	pxor RNOT,		x1; \
142*4882a593Smuzhiyun	pxor x0,		x3;
143*4882a593Smuzhiyun
/*
 * S5_1 / S5_2: first and second half of the S5 substitution step.
 *   x0..x3: input words, modified in place
 *   x4:     extra register; S5_1 overwrites it before reading it
 * S5_2 continues from the state left by S5_1 on the same registers.
 * The call sites in this file take the result from (x4, x0, x1, x3).
 */
144*4882a593Smuzhiyun#define S5_1(x0, x1, x2, x3, x4) \
145*4882a593Smuzhiyun	movdqa x1,		x4; \
146*4882a593Smuzhiyun	por x0,			x1; \
147*4882a593Smuzhiyun	pxor x1,		x2; \
148*4882a593Smuzhiyun	pxor RNOT,		x3; \
149*4882a593Smuzhiyun	pxor x0,		x4; \
150*4882a593Smuzhiyun	pxor x2,		x0; \
151*4882a593Smuzhiyun	pand x4,		x1; \
152*4882a593Smuzhiyun	por x3,			x4; \
153*4882a593Smuzhiyun	pxor x0,		x4;
154*4882a593Smuzhiyun#define S5_2(x0, x1, x2, x3, x4) \
155*4882a593Smuzhiyun	pand x3,		x0; \
156*4882a593Smuzhiyun	pxor x3,		x1; \
157*4882a593Smuzhiyun	pxor x2,		x3; \
158*4882a593Smuzhiyun	pxor x1,		x0; \
159*4882a593Smuzhiyun	pand x4,		x2; \
160*4882a593Smuzhiyun	pxor x2,		x1; \
161*4882a593Smuzhiyun	pand x0,		x2; \
162*4882a593Smuzhiyun	pxor x2,		x3;
163*4882a593Smuzhiyun
/*
 * S6_1 / S6_2: first and second half of the S6 substitution step.
 *   x0..x3: input words, modified in place
 *   x4:     extra register; S6_1 overwrites it before reading it
 * S6_2 continues from the state left by S6_1 on the same registers.
 * The call sites in this file take the result from (x2, x4, x1, x3).
 */
164*4882a593Smuzhiyun#define S6_1(x0, x1, x2, x3, x4) \
165*4882a593Smuzhiyun	movdqa x1,		x4; \
166*4882a593Smuzhiyun	pxor x0,		x3; \
167*4882a593Smuzhiyun	pxor x2,		x1; \
168*4882a593Smuzhiyun	pxor x0,		x2; \
169*4882a593Smuzhiyun	pand x3,		x0; \
170*4882a593Smuzhiyun	por x3,			x1; \
171*4882a593Smuzhiyun	pxor RNOT,		x4; \
172*4882a593Smuzhiyun	pxor x1,		x0; \
173*4882a593Smuzhiyun	pxor x2,		x1;
174*4882a593Smuzhiyun#define S6_2(x0, x1, x2, x3, x4) \
175*4882a593Smuzhiyun	pxor x4,		x3; \
176*4882a593Smuzhiyun	pxor x0,		x4; \
177*4882a593Smuzhiyun	pand x0,		x2; \
178*4882a593Smuzhiyun	pxor x1,		x4; \
179*4882a593Smuzhiyun	pxor x3,		x2; \
180*4882a593Smuzhiyun	pand x1,		x3; \
181*4882a593Smuzhiyun	pxor x0,		x3; \
182*4882a593Smuzhiyun	pxor x2,		x1;
183*4882a593Smuzhiyun
/*
 * S7_1 / S7_2: first and second half of the S7 substitution step.
 *   x0..x3: input words, modified in place
 *   x4:     extra register; S7_1 overwrites it before reading it
 * S7_2 continues from the state left by S7_1 on the same registers.
 * The call sites in this file take the result from (x4, x2, x3, x0).
 */
184*4882a593Smuzhiyun#define S7_1(x0, x1, x2, x3, x4) \
185*4882a593Smuzhiyun	pxor RNOT,		x1; \
186*4882a593Smuzhiyun	movdqa x1,		x4; \
187*4882a593Smuzhiyun	pxor RNOT,		x0; \
188*4882a593Smuzhiyun	pand x2,		x1; \
189*4882a593Smuzhiyun	pxor x3,		x1; \
190*4882a593Smuzhiyun	por x4,			x3; \
191*4882a593Smuzhiyun	pxor x2,		x4; \
192*4882a593Smuzhiyun	pxor x3,		x2; \
193*4882a593Smuzhiyun	pxor x0,		x3; \
194*4882a593Smuzhiyun	por x1,			x0;
195*4882a593Smuzhiyun#define S7_2(x0, x1, x2, x3, x4) \
196*4882a593Smuzhiyun	pand x0,		x2; \
197*4882a593Smuzhiyun	pxor x4,		x0; \
198*4882a593Smuzhiyun	pxor x3,		x4; \
199*4882a593Smuzhiyun	pand x0,		x3; \
200*4882a593Smuzhiyun	pxor x1,		x4; \
201*4882a593Smuzhiyun	pxor x4,		x2; \
202*4882a593Smuzhiyun	pxor x1,		x3; \
203*4882a593Smuzhiyun	por x0,			x4; \
204*4882a593Smuzhiyun	pxor x1,		x4;
205*4882a593Smuzhiyun
/*
 * SI0_1 / SI0_2: first and second half of the SI0 substitution step (the SI*
 * macros are used only by serpent_dec_blk_8way; presumably the inverse of S0).
 *   x0..x3: input words, modified in place
 *   x4:     extra register; SI0_1 overwrites it before reading it
 * SI0_2 continues from the state left by SI0_1 on the same registers.
 * The call sites in this file take the result from (x2, x4, x1, x0).
 */
206*4882a593Smuzhiyun#define SI0_1(x0, x1, x2, x3, x4) \
207*4882a593Smuzhiyun	movdqa x3,		x4; \
208*4882a593Smuzhiyun	pxor x0,		x1; \
209*4882a593Smuzhiyun	por x1,			x3; \
210*4882a593Smuzhiyun	pxor x1,		x4; \
211*4882a593Smuzhiyun	pxor RNOT,		x0; \
212*4882a593Smuzhiyun	pxor x3,		x2; \
213*4882a593Smuzhiyun	pxor x0,		x3; \
214*4882a593Smuzhiyun	pand x1,		x0; \
215*4882a593Smuzhiyun	pxor x2,		x0;
216*4882a593Smuzhiyun#define SI0_2(x0, x1, x2, x3, x4) \
217*4882a593Smuzhiyun	pand x3,		x2; \
218*4882a593Smuzhiyun	pxor x4,		x3; \
219*4882a593Smuzhiyun	pxor x3,		x2; \
220*4882a593Smuzhiyun	pxor x3,		x1; \
221*4882a593Smuzhiyun	pand x0,		x3; \
222*4882a593Smuzhiyun	pxor x0,		x1; \
223*4882a593Smuzhiyun	pxor x2,		x0; \
224*4882a593Smuzhiyun	pxor x3,		x4;
225*4882a593Smuzhiyun
/*
 * SI1_1 / SI1_2: first and second half of the SI1 substitution step
 * (decryption path; presumably the inverse of S1).
 *   x0..x3: input words, modified in place
 *   x4:     extra register; SI1_1 overwrites it before reading it
 * SI1_2 continues from the state left by SI1_1 on the same registers.
 * The call sites in this file take the result from (x4, x1, x2, x3).
 */
226*4882a593Smuzhiyun#define SI1_1(x0, x1, x2, x3, x4) \
227*4882a593Smuzhiyun	pxor x3,		x1; \
228*4882a593Smuzhiyun	movdqa x0,		x4; \
229*4882a593Smuzhiyun	pxor x2,		x0; \
230*4882a593Smuzhiyun	pxor RNOT,		x2; \
231*4882a593Smuzhiyun	por x1,			x4; \
232*4882a593Smuzhiyun	pxor x3,		x4; \
233*4882a593Smuzhiyun	pand x1,		x3; \
234*4882a593Smuzhiyun	pxor x2,		x1; \
235*4882a593Smuzhiyun	pand x4,		x2;
236*4882a593Smuzhiyun#define SI1_2(x0, x1, x2, x3, x4) \
237*4882a593Smuzhiyun	pxor x1,		x4; \
238*4882a593Smuzhiyun	por x3,			x1; \
239*4882a593Smuzhiyun	pxor x0,		x3; \
240*4882a593Smuzhiyun	pxor x0,		x2; \
241*4882a593Smuzhiyun	por x4,			x0; \
242*4882a593Smuzhiyun	pxor x4,		x2; \
243*4882a593Smuzhiyun	pxor x0,		x1; \
244*4882a593Smuzhiyun	pxor x1,		x4;
245*4882a593Smuzhiyun
/*
 * SI2_1 / SI2_2: first and second half of the SI2 substitution step
 * (decryption path; presumably the inverse of S2).
 *   x0..x3: input words, modified in place
 *   x4:     extra register; SI2_1 overwrites it before reading it
 * SI2_2 continues from the state left by SI2_1 on the same registers.
 * The call sites in this file take the result from (x1, x4, x3, x2).
 */
246*4882a593Smuzhiyun#define SI2_1(x0, x1, x2, x3, x4) \
247*4882a593Smuzhiyun	pxor x1,		x2; \
248*4882a593Smuzhiyun	movdqa x3,		x4; \
249*4882a593Smuzhiyun	pxor RNOT,		x3; \
250*4882a593Smuzhiyun	por x2,			x3; \
251*4882a593Smuzhiyun	pxor x4,		x2; \
252*4882a593Smuzhiyun	pxor x0,		x4; \
253*4882a593Smuzhiyun	pxor x1,		x3; \
254*4882a593Smuzhiyun	por x2,			x1; \
255*4882a593Smuzhiyun	pxor x0,		x2;
256*4882a593Smuzhiyun#define SI2_2(x0, x1, x2, x3, x4) \
257*4882a593Smuzhiyun	pxor x4,		x1; \
258*4882a593Smuzhiyun	por x3,			x4; \
259*4882a593Smuzhiyun	pxor x3,		x2; \
260*4882a593Smuzhiyun	pxor x2,		x4; \
261*4882a593Smuzhiyun	pand x1,		x2; \
262*4882a593Smuzhiyun	pxor x3,		x2; \
263*4882a593Smuzhiyun	pxor x4,		x3; \
264*4882a593Smuzhiyun	pxor x0,		x4;
265*4882a593Smuzhiyun
/*
 * SI3_1 / SI3_2: first and second half of the SI3 substitution step
 * (decryption path; presumably the inverse of S3).
 *   x0..x3: input words, modified in place
 *   x4:     extra register; SI3_1 overwrites it before reading it
 * SI3_2 continues from the state left by SI3_1 on the same registers.
 * The call sites in this file take the result from (x2, x0, x4, x3).
 */
266*4882a593Smuzhiyun#define SI3_1(x0, x1, x2, x3, x4) \
267*4882a593Smuzhiyun	pxor x1,		x2; \
268*4882a593Smuzhiyun	movdqa x1,		x4; \
269*4882a593Smuzhiyun	pand x2,		x1; \
270*4882a593Smuzhiyun	pxor x0,		x1; \
271*4882a593Smuzhiyun	por x4,			x0; \
272*4882a593Smuzhiyun	pxor x3,		x4; \
273*4882a593Smuzhiyun	pxor x3,		x0; \
274*4882a593Smuzhiyun	por x1,			x3; \
275*4882a593Smuzhiyun	pxor x2,		x1;
276*4882a593Smuzhiyun#define SI3_2(x0, x1, x2, x3, x4) \
277*4882a593Smuzhiyun	pxor x3,		x1; \
278*4882a593Smuzhiyun	pxor x2,		x0; \
279*4882a593Smuzhiyun	pxor x3,		x2; \
280*4882a593Smuzhiyun	pand x1,		x3; \
281*4882a593Smuzhiyun	pxor x0,		x1; \
282*4882a593Smuzhiyun	pand x2,		x0; \
283*4882a593Smuzhiyun	pxor x3,		x4; \
284*4882a593Smuzhiyun	pxor x0,		x3; \
285*4882a593Smuzhiyun	pxor x1,		x0;
286*4882a593Smuzhiyun
/*
 * SI4_1 / SI4_2: first and second half of the SI4 substitution step
 * (decryption path; presumably the inverse of S4).
 *   x0..x3: input words, modified in place
 *   x4:     extra register; SI4_1 overwrites it before reading it
 * SI4_2 continues from the state left by SI4_1 on the same registers.
 * The call sites in this file take the result from (x0, x2, x4, x3).
 */
287*4882a593Smuzhiyun#define SI4_1(x0, x1, x2, x3, x4) \
288*4882a593Smuzhiyun	pxor x3,		x2; \
289*4882a593Smuzhiyun	movdqa x0,		x4; \
290*4882a593Smuzhiyun	pand x1,		x0; \
291*4882a593Smuzhiyun	pxor x2,		x0; \
292*4882a593Smuzhiyun	por x3,			x2; \
293*4882a593Smuzhiyun	pxor RNOT,		x4; \
294*4882a593Smuzhiyun	pxor x0,		x1; \
295*4882a593Smuzhiyun	pxor x2,		x0; \
296*4882a593Smuzhiyun	pand x4,		x2;
297*4882a593Smuzhiyun#define SI4_2(x0, x1, x2, x3, x4) \
298*4882a593Smuzhiyun	pxor x0,		x2; \
299*4882a593Smuzhiyun	por x4,			x0; \
300*4882a593Smuzhiyun	pxor x3,		x0; \
301*4882a593Smuzhiyun	pand x2,		x3; \
302*4882a593Smuzhiyun	pxor x3,		x4; \
303*4882a593Smuzhiyun	pxor x1,		x3; \
304*4882a593Smuzhiyun	pand x0,		x1; \
305*4882a593Smuzhiyun	pxor x1,		x4; \
306*4882a593Smuzhiyun	pxor x3,		x0;
307*4882a593Smuzhiyun
/*
 * SI5_1 / SI5_2: first and second half of the SI5 substitution step
 * (decryption path; presumably the inverse of S5).
 *   x0..x3: input words, modified in place
 *   x4:     extra register; SI5_1 overwrites it before reading it
 * SI5_2 continues from the state left by SI5_1 on the same registers.
 * The call sites in this file take the result from (x1, x4, x0, x2).
 */
308*4882a593Smuzhiyun#define SI5_1(x0, x1, x2, x3, x4) \
309*4882a593Smuzhiyun	movdqa x1,		x4; \
310*4882a593Smuzhiyun	por x2,			x1; \
311*4882a593Smuzhiyun	pxor x4,		x2; \
312*4882a593Smuzhiyun	pxor x3,		x1; \
313*4882a593Smuzhiyun	pand x4,		x3; \
314*4882a593Smuzhiyun	pxor x3,		x2; \
315*4882a593Smuzhiyun	por x0,			x3; \
316*4882a593Smuzhiyun	pxor RNOT,		x0; \
317*4882a593Smuzhiyun	pxor x2,		x3; \
318*4882a593Smuzhiyun	por x0,			x2;
319*4882a593Smuzhiyun#define SI5_2(x0, x1, x2, x3, x4) \
320*4882a593Smuzhiyun	pxor x1,		x4; \
321*4882a593Smuzhiyun	pxor x4,		x2; \
322*4882a593Smuzhiyun	pand x0,		x4; \
323*4882a593Smuzhiyun	pxor x1,		x0; \
324*4882a593Smuzhiyun	pxor x3,		x1; \
325*4882a593Smuzhiyun	pand x2,		x0; \
326*4882a593Smuzhiyun	pxor x3,		x2; \
327*4882a593Smuzhiyun	pxor x2,		x0; \
328*4882a593Smuzhiyun	pxor x4,		x2; \
329*4882a593Smuzhiyun	pxor x3,		x4;
330*4882a593Smuzhiyun
/*
 * SI6_1 / SI6_2: first and second half of the SI6 substitution step
 * (decryption path; presumably the inverse of S6).
 *   x0..x3: input words, modified in place
 *   x4:     extra register; SI6_1 overwrites it before reading it
 * SI6_2 continues from the state left by SI6_1 on the same registers.
 * The call sites in this file take the result from (x2, x4, x3, x0).
 */
331*4882a593Smuzhiyun#define SI6_1(x0, x1, x2, x3, x4) \
332*4882a593Smuzhiyun	pxor x2,		x0; \
333*4882a593Smuzhiyun	movdqa x0,		x4; \
334*4882a593Smuzhiyun	pand x3,		x0; \
335*4882a593Smuzhiyun	pxor x3,		x2; \
336*4882a593Smuzhiyun	pxor x2,		x0; \
337*4882a593Smuzhiyun	pxor x1,		x3; \
338*4882a593Smuzhiyun	por x4,			x2; \
339*4882a593Smuzhiyun	pxor x3,		x2; \
340*4882a593Smuzhiyun	pand x0,		x3;
341*4882a593Smuzhiyun#define SI6_2(x0, x1, x2, x3, x4) \
342*4882a593Smuzhiyun	pxor RNOT,		x0; \
343*4882a593Smuzhiyun	pxor x1,		x3; \
344*4882a593Smuzhiyun	pand x2,		x1; \
345*4882a593Smuzhiyun	pxor x0,		x4; \
346*4882a593Smuzhiyun	pxor x4,		x3; \
347*4882a593Smuzhiyun	pxor x2,		x4; \
348*4882a593Smuzhiyun	pxor x1,		x0; \
349*4882a593Smuzhiyun	pxor x0,		x2;
350*4882a593Smuzhiyun
/*
 * SI7_1 / SI7_2: first and second half of the SI7 substitution step
 * (decryption path; presumably the inverse of S7).
 *   x0..x3: input words, modified in place
 *   x4:     extra register; SI7_1 overwrites it before reading it
 * SI7_2 continues from the state left by SI7_1 on the same registers.
 * The call sites in this file take the result from (x1, x3, x0, x4).
 */
351*4882a593Smuzhiyun#define SI7_1(x0, x1, x2, x3, x4) \
352*4882a593Smuzhiyun	movdqa x3,		x4; \
353*4882a593Smuzhiyun	pand x0,		x3; \
354*4882a593Smuzhiyun	pxor x2,		x0; \
355*4882a593Smuzhiyun	por x4,			x2; \
356*4882a593Smuzhiyun	pxor x1,		x4; \
357*4882a593Smuzhiyun	pxor RNOT,		x0; \
358*4882a593Smuzhiyun	por x3,			x1; \
359*4882a593Smuzhiyun	pxor x0,		x4; \
360*4882a593Smuzhiyun	pand x2,		x0; \
361*4882a593Smuzhiyun	pxor x1,		x0;
362*4882a593Smuzhiyun#define SI7_2(x0, x1, x2, x3, x4) \
363*4882a593Smuzhiyun	pand x2,		x1; \
364*4882a593Smuzhiyun	pxor x2,		x3; \
365*4882a593Smuzhiyun	pxor x3,		x4; \
366*4882a593Smuzhiyun	pand x3,		x2; \
367*4882a593Smuzhiyun	por x0,			x3; \
368*4882a593Smuzhiyun	pxor x4,		x1; \
369*4882a593Smuzhiyun	pxor x4,		x3; \
370*4882a593Smuzhiyun	pand x0,		x4; \
371*4882a593Smuzhiyun	pxor x2,		x4;
372*4882a593Smuzhiyun
/*
 * get_key(i, j, t): load the 32-bit word at byte offset (4*i + j)*4 from CTX
 * (word j of round key i) into t and replicate it into all four dword lanes
 * (pshufd $0).
 */
373*4882a593Smuzhiyun#define get_key(i, j, t) \
374*4882a593Smuzhiyun	movd (4*(i)+(j))*4(CTX), t; \
375*4882a593Smuzhiyun	pshufd $0, t, t;
376*4882a593Smuzhiyun
/*
 * K2: XOR round key i into both register groups.
 * Loads the four key words into RK0..RK3, then XORs them into x0..x3 of
 * group 1 (names suffixed with 1) and group 2 (names suffixed with 2).
 * The x4 argument is accepted but not used.
 */
377*4882a593Smuzhiyun#define K2(x0, x1, x2, x3, x4, i) \
378*4882a593Smuzhiyun	get_key(i, 0, RK0); \
379*4882a593Smuzhiyun	get_key(i, 1, RK1); \
380*4882a593Smuzhiyun	get_key(i, 2, RK2); \
381*4882a593Smuzhiyun	get_key(i, 3, RK3); \
382*4882a593Smuzhiyun	pxor RK0,		x0 ## 1; \
383*4882a593Smuzhiyun	pxor RK1,		x1 ## 1; \
384*4882a593Smuzhiyun	pxor RK2,		x2 ## 1; \
385*4882a593Smuzhiyun	pxor RK3,		x3 ## 1; \
386*4882a593Smuzhiyun		pxor RK0,		x0 ## 2; \
387*4882a593Smuzhiyun		pxor RK1,		x1 ## 2; \
388*4882a593Smuzhiyun		pxor RK2,		x2 ## 2; \
389*4882a593Smuzhiyun		pxor RK3,		x3 ## 2;
390*4882a593Smuzhiyun
/*
 * LK2: linear mixing of (x0, x1, x2, x3) followed by XOR with round key i,
 * applied to both register groups (suffix 1 and suffix 2, the latter shown
 * with extra indentation), with the two groups interleaved.
 * Per 32-bit lane, in order:
 *   x0 = rol(x0, 13);  x1 ^= x0;  x2 = rol(x2, 3);  x1 ^= x2;
 *   x1 = rol(x1, 1);   x3 ^= x2 ^ (x0 << 3);  x3 = rol(x3, 7);
 *   x0 ^= x1 ^ x3;     x2 ^= x3 ^ (x1 << 7);
 *   x1 ^= k1;  x3 ^= k3;  x0 = rol(x0, 5);  x2 = rol(x2, 22);
 *   x0 ^= k0;  x2 ^= k2;
 * SSE2 has no packed rotate, so each rotate is built from pslld/psrld/por
 * using x4 as the temporary; x4 is clobbered. The key words k0..k3 are loaded
 * into RK0..RK3 by the get_key() calls spread through the sequence.
 */
391*4882a593Smuzhiyun#define LK2(x0, x1, x2, x3, x4, i) \
392*4882a593Smuzhiyun	movdqa x0 ## 1,		x4 ## 1; \
393*4882a593Smuzhiyun	pslld $13,		x0 ## 1; \
394*4882a593Smuzhiyun	psrld $(32 - 13),	x4 ## 1; \
395*4882a593Smuzhiyun	por x4 ## 1,		x0 ## 1; \
396*4882a593Smuzhiyun	pxor x0 ## 1,		x1 ## 1; \
397*4882a593Smuzhiyun	movdqa x2 ## 1,		x4 ## 1; \
398*4882a593Smuzhiyun	pslld $3,		x2 ## 1; \
399*4882a593Smuzhiyun	psrld $(32 - 3),	x4 ## 1; \
400*4882a593Smuzhiyun	por x4 ## 1,		x2 ## 1; \
401*4882a593Smuzhiyun	pxor x2 ## 1,		x1 ## 1; \
402*4882a593Smuzhiyun		movdqa x0 ## 2,		x4 ## 2; \
403*4882a593Smuzhiyun		pslld $13,		x0 ## 2; \
404*4882a593Smuzhiyun		psrld $(32 - 13),	x4 ## 2; \
405*4882a593Smuzhiyun		por x4 ## 2,		x0 ## 2; \
406*4882a593Smuzhiyun		pxor x0 ## 2,		x1 ## 2; \
407*4882a593Smuzhiyun		movdqa x2 ## 2,		x4 ## 2; \
408*4882a593Smuzhiyun		pslld $3,		x2 ## 2; \
409*4882a593Smuzhiyun		psrld $(32 - 3),	x4 ## 2; \
410*4882a593Smuzhiyun		por x4 ## 2,		x2 ## 2; \
411*4882a593Smuzhiyun		pxor x2 ## 2,		x1 ## 2; \
412*4882a593Smuzhiyun	movdqa x1 ## 1,		x4 ## 1; \
413*4882a593Smuzhiyun	pslld $1,		x1 ## 1; \
414*4882a593Smuzhiyun	psrld $(32 - 1),	x4 ## 1; \
415*4882a593Smuzhiyun	por x4 ## 1,		x1 ## 1; \
416*4882a593Smuzhiyun	movdqa x0 ## 1,		x4 ## 1; \
417*4882a593Smuzhiyun	pslld $3,		x4 ## 1; \
418*4882a593Smuzhiyun	pxor x2 ## 1,		x3 ## 1; \
419*4882a593Smuzhiyun	pxor x4 ## 1,		x3 ## 1; \
420*4882a593Smuzhiyun	movdqa x3 ## 1,		x4 ## 1; \
421*4882a593Smuzhiyun	get_key(i, 1, RK1); \
422*4882a593Smuzhiyun		movdqa x1 ## 2,		x4 ## 2; \
423*4882a593Smuzhiyun		pslld $1,		x1 ## 2; \
424*4882a593Smuzhiyun		psrld $(32 - 1),	x4 ## 2; \
425*4882a593Smuzhiyun		por x4 ## 2,		x1 ## 2; \
426*4882a593Smuzhiyun		movdqa x0 ## 2,		x4 ## 2; \
427*4882a593Smuzhiyun		pslld $3,		x4 ## 2; \
428*4882a593Smuzhiyun		pxor x2 ## 2,		x3 ## 2; \
429*4882a593Smuzhiyun		pxor x4 ## 2,		x3 ## 2; \
430*4882a593Smuzhiyun		movdqa x3 ## 2,		x4 ## 2; \
431*4882a593Smuzhiyun		get_key(i, 3, RK3); \
432*4882a593Smuzhiyun	pslld $7,		x3 ## 1; \
433*4882a593Smuzhiyun	psrld $(32 - 7),	x4 ## 1; \
434*4882a593Smuzhiyun	por x4 ## 1,		x3 ## 1; \
435*4882a593Smuzhiyun	movdqa x1 ## 1,		x4 ## 1; \
436*4882a593Smuzhiyun	pslld $7,		x4 ## 1; \
437*4882a593Smuzhiyun	pxor x1 ## 1,		x0 ## 1; \
438*4882a593Smuzhiyun	pxor x3 ## 1,		x0 ## 1; \
439*4882a593Smuzhiyun	pxor x3 ## 1,		x2 ## 1; \
440*4882a593Smuzhiyun	pxor x4 ## 1,		x2 ## 1; \
441*4882a593Smuzhiyun	get_key(i, 0, RK0); \
442*4882a593Smuzhiyun		pslld $7,		x3 ## 2; \
443*4882a593Smuzhiyun		psrld $(32 - 7),	x4 ## 2; \
444*4882a593Smuzhiyun		por x4 ## 2,		x3 ## 2; \
445*4882a593Smuzhiyun		movdqa x1 ## 2,		x4 ## 2; \
446*4882a593Smuzhiyun		pslld $7,		x4 ## 2; \
447*4882a593Smuzhiyun		pxor x1 ## 2,		x0 ## 2; \
448*4882a593Smuzhiyun		pxor x3 ## 2,		x0 ## 2; \
449*4882a593Smuzhiyun		pxor x3 ## 2,		x2 ## 2; \
450*4882a593Smuzhiyun		pxor x4 ## 2,		x2 ## 2; \
451*4882a593Smuzhiyun		get_key(i, 2, RK2); \
452*4882a593Smuzhiyun	pxor RK1,		x1 ## 1; \
453*4882a593Smuzhiyun	pxor RK3,		x3 ## 1; \
454*4882a593Smuzhiyun	movdqa x0 ## 1,		x4 ## 1; \
455*4882a593Smuzhiyun	pslld $5,		x0 ## 1; \
456*4882a593Smuzhiyun	psrld $(32 - 5),	x4 ## 1; \
457*4882a593Smuzhiyun	por x4 ## 1,		x0 ## 1; \
458*4882a593Smuzhiyun	movdqa x2 ## 1,		x4 ## 1; \
459*4882a593Smuzhiyun	pslld $22,		x2 ## 1; \
460*4882a593Smuzhiyun	psrld $(32 - 22),	x4 ## 1; \
461*4882a593Smuzhiyun	por x4 ## 1,		x2 ## 1; \
462*4882a593Smuzhiyun	pxor RK0,		x0 ## 1; \
463*4882a593Smuzhiyun	pxor RK2,		x2 ## 1; \
464*4882a593Smuzhiyun		pxor RK1,		x1 ## 2; \
465*4882a593Smuzhiyun		pxor RK3,		x3 ## 2; \
466*4882a593Smuzhiyun		movdqa x0 ## 2,		x4 ## 2; \
467*4882a593Smuzhiyun		pslld $5,		x0 ## 2; \
468*4882a593Smuzhiyun		psrld $(32 - 5),	x4 ## 2; \
469*4882a593Smuzhiyun		por x4 ## 2,		x0 ## 2; \
470*4882a593Smuzhiyun		movdqa x2 ## 2,		x4 ## 2; \
471*4882a593Smuzhiyun		pslld $22,		x2 ## 2; \
472*4882a593Smuzhiyun		psrld $(32 - 22),	x4 ## 2; \
473*4882a593Smuzhiyun		por x4 ## 2,		x2 ## 2; \
474*4882a593Smuzhiyun		pxor RK0,		x0 ## 2; \
475*4882a593Smuzhiyun		pxor RK2,		x2 ## 2;
476*4882a593Smuzhiyun
/*
 * KL2: XOR with the round key held in RK0..RK3, then the reverse of the LK2
 * mixing steps, applied to both register groups (suffix 1 and suffix 2,
 * interleaved). Per 32-bit lane, in order:
 *   x0 ^= k0;  x2 ^= k2;  x0 = ror(x0, 5);
 *   x3 ^= k3;  x1 ^= k1;  x2 = ror(x2, 22);
 *   x2 ^= x3 ^ (x1 << 7);  x0 ^= x3 ^ x1;  x1 = ror(x1, 1);
 *   x3 = ror(x3, 7);  x1 ^= x0;  x3 ^= (x0 << 3);  x0 = ror(x0, 13);
 *   x1 ^= x2;  x3 ^= x2;  x2 = ror(x2, 3);
 * Unlike LK2 this macro loads no key words: RK0..RK3 must already hold the
 * round key (the callers load it via SP() just before), and the i argument
 * is not referenced in the body. x4 is the rotate temporary and is clobbered.
 */
477*4882a593Smuzhiyun#define KL2(x0, x1, x2, x3, x4, i) \
478*4882a593Smuzhiyun	pxor RK0,		x0 ## 1; \
479*4882a593Smuzhiyun	pxor RK2,		x2 ## 1; \
480*4882a593Smuzhiyun	movdqa x0 ## 1,		x4 ## 1; \
481*4882a593Smuzhiyun	psrld $5,		x0 ## 1; \
482*4882a593Smuzhiyun	pslld $(32 - 5),	x4 ## 1; \
483*4882a593Smuzhiyun	por x4 ## 1,		x0 ## 1; \
484*4882a593Smuzhiyun	pxor RK3,		x3 ## 1; \
485*4882a593Smuzhiyun	pxor RK1,		x1 ## 1; \
486*4882a593Smuzhiyun	movdqa x2 ## 1,		x4 ## 1; \
487*4882a593Smuzhiyun	psrld $22,		x2 ## 1; \
488*4882a593Smuzhiyun	pslld $(32 - 22),	x4 ## 1; \
489*4882a593Smuzhiyun	por x4 ## 1,		x2 ## 1; \
490*4882a593Smuzhiyun	pxor x3 ## 1,		x2 ## 1; \
491*4882a593Smuzhiyun		pxor RK0,		x0 ## 2; \
492*4882a593Smuzhiyun		pxor RK2,		x2 ## 2; \
493*4882a593Smuzhiyun		movdqa x0 ## 2,		x4 ## 2; \
494*4882a593Smuzhiyun		psrld $5,		x0 ## 2; \
495*4882a593Smuzhiyun		pslld $(32 - 5),	x4 ## 2; \
496*4882a593Smuzhiyun		por x4 ## 2,		x0 ## 2; \
497*4882a593Smuzhiyun		pxor RK3,		x3 ## 2; \
498*4882a593Smuzhiyun		pxor RK1,		x1 ## 2; \
499*4882a593Smuzhiyun		movdqa x2 ## 2,		x4 ## 2; \
500*4882a593Smuzhiyun		psrld $22,		x2 ## 2; \
501*4882a593Smuzhiyun		pslld $(32 - 22),	x4 ## 2; \
502*4882a593Smuzhiyun		por x4 ## 2,		x2 ## 2; \
503*4882a593Smuzhiyun		pxor x3 ## 2,		x2 ## 2; \
504*4882a593Smuzhiyun	pxor x3 ## 1,		x0 ## 1; \
505*4882a593Smuzhiyun	movdqa x1 ## 1,		x4 ## 1; \
506*4882a593Smuzhiyun	pslld $7,		x4 ## 1; \
507*4882a593Smuzhiyun	pxor x1 ## 1,		x0 ## 1; \
508*4882a593Smuzhiyun	pxor x4 ## 1,		x2 ## 1; \
509*4882a593Smuzhiyun	movdqa x1 ## 1,		x4 ## 1; \
510*4882a593Smuzhiyun	psrld $1,		x1 ## 1; \
511*4882a593Smuzhiyun	pslld $(32 - 1),	x4 ## 1; \
512*4882a593Smuzhiyun	por x4 ## 1,		x1 ## 1; \
513*4882a593Smuzhiyun		pxor x3 ## 2,		x0 ## 2; \
514*4882a593Smuzhiyun		movdqa x1 ## 2,		x4 ## 2; \
515*4882a593Smuzhiyun		pslld $7,		x4 ## 2; \
516*4882a593Smuzhiyun		pxor x1 ## 2,		x0 ## 2; \
517*4882a593Smuzhiyun		pxor x4 ## 2,		x2 ## 2; \
518*4882a593Smuzhiyun		movdqa x1 ## 2,		x4 ## 2; \
519*4882a593Smuzhiyun		psrld $1,		x1 ## 2; \
520*4882a593Smuzhiyun		pslld $(32 - 1),	x4 ## 2; \
521*4882a593Smuzhiyun		por x4 ## 2,		x1 ## 2; \
522*4882a593Smuzhiyun	movdqa x3 ## 1,		x4 ## 1; \
523*4882a593Smuzhiyun	psrld $7,		x3 ## 1; \
524*4882a593Smuzhiyun	pslld $(32 - 7),	x4 ## 1; \
525*4882a593Smuzhiyun	por x4 ## 1,		x3 ## 1; \
526*4882a593Smuzhiyun	pxor x0 ## 1,		x1 ## 1; \
527*4882a593Smuzhiyun	movdqa x0 ## 1,		x4 ## 1; \
528*4882a593Smuzhiyun	pslld $3,		x4 ## 1; \
529*4882a593Smuzhiyun	pxor x4 ## 1,		x3 ## 1; \
530*4882a593Smuzhiyun	movdqa x0 ## 1,		x4 ## 1; \
531*4882a593Smuzhiyun		movdqa x3 ## 2,		x4 ## 2; \
532*4882a593Smuzhiyun		psrld $7,		x3 ## 2; \
533*4882a593Smuzhiyun		pslld $(32 - 7),	x4 ## 2; \
534*4882a593Smuzhiyun		por x4 ## 2,		x3 ## 2; \
535*4882a593Smuzhiyun		pxor x0 ## 2,		x1 ## 2; \
536*4882a593Smuzhiyun		movdqa x0 ## 2,		x4 ## 2; \
537*4882a593Smuzhiyun		pslld $3,		x4 ## 2; \
538*4882a593Smuzhiyun		pxor x4 ## 2,		x3 ## 2; \
539*4882a593Smuzhiyun		movdqa x0 ## 2,		x4 ## 2; \
540*4882a593Smuzhiyun	psrld $13,		x0 ## 1; \
541*4882a593Smuzhiyun	pslld $(32 - 13),	x4 ## 1; \
542*4882a593Smuzhiyun	por x4 ## 1,		x0 ## 1; \
543*4882a593Smuzhiyun	pxor x2 ## 1,		x1 ## 1; \
544*4882a593Smuzhiyun	pxor x2 ## 1,		x3 ## 1; \
545*4882a593Smuzhiyun	movdqa x2 ## 1,		x4 ## 1; \
546*4882a593Smuzhiyun	psrld $3,		x2 ## 1; \
547*4882a593Smuzhiyun	pslld $(32 - 3),	x4 ## 1; \
548*4882a593Smuzhiyun	por x4 ## 1,		x2 ## 1; \
549*4882a593Smuzhiyun		psrld $13,		x0 ## 2; \
550*4882a593Smuzhiyun		pslld $(32 - 13),	x4 ## 2; \
551*4882a593Smuzhiyun		por x4 ## 2,		x0 ## 2; \
552*4882a593Smuzhiyun		pxor x2 ## 2,		x1 ## 2; \
553*4882a593Smuzhiyun		pxor x2 ## 2,		x3 ## 2; \
554*4882a593Smuzhiyun		movdqa x2 ## 2,		x4 ## 2; \
555*4882a593Smuzhiyun		psrld $3,		x2 ## 2; \
556*4882a593Smuzhiyun		pslld $(32 - 3),	x4 ## 2; \
557*4882a593Smuzhiyun		por x4 ## 2,		x2 ## 2;
558*4882a593Smuzhiyun
/*
 * S(SBOX, ...): apply both halves (SBOX_1 then SBOX_2) of the named
 * substitution macro, first to register group 1, then to register group 2.
 * x0..x4 are register base names (e.g. RA); the group suffix is pasted on.
 */
559*4882a593Smuzhiyun#define S(SBOX, x0, x1, x2, x3, x4) \
560*4882a593Smuzhiyun	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
561*4882a593Smuzhiyun	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
562*4882a593Smuzhiyun	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
563*4882a593Smuzhiyun	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
564*4882a593Smuzhiyun
/*
 * SP(SBOX, ..., i): like S(), but with the loads of round key i into
 * RK0..RK3 interleaved between the substitution halves of the two register
 * groups. On exit both groups have been through SBOX_1 and SBOX_2 and
 * RK0..RK3 hold the four words of round key i, ready for KL2().
 *
 * The definition ends on its last statement: no line continuation follows
 * it, so the text after the macro can never be absorbed into its body.
 */
565*4882a593Smuzhiyun#define SP(SBOX, x0, x1, x2, x3, x4, i) \
566*4882a593Smuzhiyun	get_key(i, 0, RK0); \
567*4882a593Smuzhiyun	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
568*4882a593Smuzhiyun	get_key(i, 2, RK2); \
569*4882a593Smuzhiyun	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
570*4882a593Smuzhiyun	get_key(i, 3, RK3); \
571*4882a593Smuzhiyun	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
572*4882a593Smuzhiyun	get_key(i, 1, RK1); \
573*4882a593Smuzhiyun	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
574*4882a593Smuzhiyun
/*
 * transpose_4x4: transpose, in place, the 4x4 matrix of 32-bit words held in
 * x0..x3 (one row per register). Afterwards register xN holds dword N of
 * each of the four original registers.
 * t1 and t2 are clobbered; t0 is accepted but not used.
 */
575*4882a593Smuzhiyun#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
576*4882a593Smuzhiyun	movdqa x0,		t2; \
577*4882a593Smuzhiyun	punpckldq x1,		x0; \
578*4882a593Smuzhiyun	punpckhdq x1,		t2; \
579*4882a593Smuzhiyun	movdqa x2,		t1; \
580*4882a593Smuzhiyun	punpckhdq x3,		x2; \
581*4882a593Smuzhiyun	punpckldq x3,		t1; \
582*4882a593Smuzhiyun	movdqa x0,		x1; \
583*4882a593Smuzhiyun	punpcklqdq t1,		x0; \
584*4882a593Smuzhiyun	punpckhqdq t1,		x1; \
585*4882a593Smuzhiyun	movdqa t2,		x3; \
586*4882a593Smuzhiyun	punpcklqdq x2,		t2; \
587*4882a593Smuzhiyun	punpckhqdq x2,		x3; \
588*4882a593Smuzhiyun	movdqa t2,		x2;
589*4882a593Smuzhiyun
/*
 * read_blocks: load four consecutive 16-byte blocks from (in) with unaligned
 * loads into x0..x3, then transpose them so that register xN holds 32-bit
 * word N of all four blocks. t1 and t2 are clobbered by the transpose.
 */
590*4882a593Smuzhiyun#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
591*4882a593Smuzhiyun	movdqu (0*4*4)(in),	x0; \
592*4882a593Smuzhiyun	movdqu (1*4*4)(in),	x1; \
593*4882a593Smuzhiyun	movdqu (2*4*4)(in),	x2; \
594*4882a593Smuzhiyun	movdqu (3*4*4)(in),	x3; \
595*4882a593Smuzhiyun	\
596*4882a593Smuzhiyun	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
597*4882a593Smuzhiyun
/*
 * write_blocks: transpose x0..x3 back to one block per register, then store
 * the four 16-byte blocks to (out) with unaligned stores.
 * x0..x3, t1 and t2 are modified by the transpose.
 */
598*4882a593Smuzhiyun#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
599*4882a593Smuzhiyun	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
600*4882a593Smuzhiyun	\
601*4882a593Smuzhiyun	movdqu x0,		(0*4*4)(out); \
602*4882a593Smuzhiyun	movdqu x1,		(1*4*4)(out); \
603*4882a593Smuzhiyun	movdqu x2,		(2*4*4)(out); \
604*4882a593Smuzhiyun	movdqu x3,		(3*4*4)(out);
605*4882a593Smuzhiyun
/*
 * xor_blocks: transpose x0..x3 back to one block per register, then XOR each
 * block with the 16 bytes already at the matching offset of (out) and store
 * the result back there (read-modify-write of the destination).
 * t0 is the load temporary; t1 and t2 are clobbered by the transpose.
 */
606*4882a593Smuzhiyun#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
607*4882a593Smuzhiyun	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
608*4882a593Smuzhiyun	\
609*4882a593Smuzhiyun	movdqu (0*4*4)(out),	t0; \
610*4882a593Smuzhiyun	pxor t0,		x0; \
611*4882a593Smuzhiyun	movdqu x0,		(0*4*4)(out); \
612*4882a593Smuzhiyun	movdqu (1*4*4)(out),	t0; \
613*4882a593Smuzhiyun	pxor t0,		x1; \
614*4882a593Smuzhiyun	movdqu x1,		(1*4*4)(out); \
615*4882a593Smuzhiyun	movdqu (2*4*4)(out),	t0; \
616*4882a593Smuzhiyun	pxor t0,		x2; \
617*4882a593Smuzhiyun	movdqu x2,		(2*4*4)(out); \
618*4882a593Smuzhiyun	movdqu (3*4*4)(out),	t0; \
619*4882a593Smuzhiyun	pxor t0,		x3; \
620*4882a593Smuzhiyun	movdqu x3,		(3*4*4)(out);
621*4882a593Smuzhiyun
/*
 * __serpent_enc_blk_8way: encrypt eight 16-byte blocks in one pass.
 * The blocks are handled as two groups of four (src+0 and src+64); each group
 * is transposed so one register holds the same 32-bit word of four blocks.
 * Sequence: key mixing with round key 0, then 32 substitution steps
 * (S0..S7, four times over), each followed by LK2 with round keys 1..31,
 * and a final key mixing with round key 32 after the last S7.
 * Output is either stored to dst or XORed into dst, selected by %cl.
 * Uses %rax and %xmm0-%xmm14 as scratch; no stack is used.
 */
622*4882a593SmuzhiyunSYM_FUNC_START(__serpent_enc_blk_8way)
623*4882a593Smuzhiyun	/* input:
624*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
625*4882a593Smuzhiyun	 *	%rsi: dst
626*4882a593Smuzhiyun	 *	%rdx: src
627*4882a593Smuzhiyun	 *	%rcx: bool, if true: xor output
628*4882a593Smuzhiyun	 */
629*4882a593Smuzhiyun
	/* RNOT = all ones: the NOT mask used by the substitution macros */
630*4882a593Smuzhiyun	pcmpeqd RNOT, RNOT;
631*4882a593Smuzhiyun
	/* %rax = src + 64: start of the second group of four blocks */
632*4882a593Smuzhiyun	leaq (4*4*4)(%rdx), %rax;
633*4882a593Smuzhiyun	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
634*4882a593Smuzhiyun	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
635*4882a593Smuzhiyun
	/*
	 * Each row's register list is the previous row's list permuted to where
	 * that substitution macro leaves its results; do not reorder.
	 */
636*4882a593Smuzhiyun						 K2(RA, RB, RC, RD, RE, 0);
637*4882a593Smuzhiyun	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
638*4882a593Smuzhiyun	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
639*4882a593Smuzhiyun	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
640*4882a593Smuzhiyun	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
641*4882a593Smuzhiyun	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
642*4882a593Smuzhiyun	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
643*4882a593Smuzhiyun	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
644*4882a593Smuzhiyun	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
645*4882a593Smuzhiyun	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
646*4882a593Smuzhiyun	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
647*4882a593Smuzhiyun	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
648*4882a593Smuzhiyun	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
649*4882a593Smuzhiyun	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
650*4882a593Smuzhiyun	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
651*4882a593Smuzhiyun	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
652*4882a593Smuzhiyun	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
653*4882a593Smuzhiyun	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
654*4882a593Smuzhiyun	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
655*4882a593Smuzhiyun	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
656*4882a593Smuzhiyun	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
657*4882a593Smuzhiyun	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
658*4882a593Smuzhiyun	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
659*4882a593Smuzhiyun	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
660*4882a593Smuzhiyun	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
661*4882a593Smuzhiyun	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
662*4882a593Smuzhiyun	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
663*4882a593Smuzhiyun	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
664*4882a593Smuzhiyun	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
665*4882a593Smuzhiyun	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
666*4882a593Smuzhiyun	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
667*4882a593Smuzhiyun	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
668*4882a593Smuzhiyun	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);
669*4882a593Smuzhiyun
	/* %rax = dst + 64: destination of the second group of four blocks */
670*4882a593Smuzhiyun	leaq (4*4*4)(%rsi), %rax;
671*4882a593Smuzhiyun
	/* low byte of %rcx non-zero: XOR the result into dst instead of overwriting it */
672*4882a593Smuzhiyun	testb %cl, %cl;
673*4882a593Smuzhiyun	jnz .L__enc_xor8;
674*4882a593Smuzhiyun
675*4882a593Smuzhiyun	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
676*4882a593Smuzhiyun	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
677*4882a593Smuzhiyun
678*4882a593Smuzhiyun	RET;
679*4882a593Smuzhiyun
680*4882a593Smuzhiyun.L__enc_xor8:
681*4882a593Smuzhiyun	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
682*4882a593Smuzhiyun	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
683*4882a593Smuzhiyun
684*4882a593Smuzhiyun	RET;
685*4882a593SmuzhiyunSYM_FUNC_END(__serpent_enc_blk_8way)
686*4882a593Smuzhiyun
/*
 * serpent_dec_blk_8way: decrypt eight 16-byte blocks in one pass.
 * The blocks are handled as two groups of four (src+0 and src+64), transposed
 * as in the encryption routine. Sequence: key mixing with round key 32, then
 * SI7..SI0 (four times over) with round keys 31..1; each SP() loads the round
 * key that the following KL2() consumes. The last SI0 uses plain S() and is
 * followed by key mixing with round key 0.
 * The result is always stored to dst (there is no XOR variant here).
 * Uses %rax and %xmm0-%xmm14 as scratch; no stack is used.
 */
687*4882a593SmuzhiyunSYM_FUNC_START(serpent_dec_blk_8way)
688*4882a593Smuzhiyun	/* input:
689*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
690*4882a593Smuzhiyun	 *	%rsi: dst
691*4882a593Smuzhiyun	 *	%rdx: src
692*4882a593Smuzhiyun	 */
693*4882a593Smuzhiyun
	/* RNOT = all ones: the NOT mask used by the substitution macros */
694*4882a593Smuzhiyun	pcmpeqd RNOT, RNOT;
695*4882a593Smuzhiyun
	/* %rax = src + 64: start of the second group of four blocks */
696*4882a593Smuzhiyun	leaq (4*4*4)(%rdx), %rax;
697*4882a593Smuzhiyun	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
698*4882a593Smuzhiyun	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
699*4882a593Smuzhiyun
	/*
	 * Each row's register list is the previous row's list permuted to where
	 * that substitution macro leaves its results; do not reorder.
	 */
700*4882a593Smuzhiyun						 K2(RA, RB, RC, RD, RE, 32);
701*4882a593Smuzhiyun	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
702*4882a593Smuzhiyun	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
703*4882a593Smuzhiyun	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
704*4882a593Smuzhiyun	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
705*4882a593Smuzhiyun	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
706*4882a593Smuzhiyun	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
707*4882a593Smuzhiyun	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
708*4882a593Smuzhiyun	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
709*4882a593Smuzhiyun	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
710*4882a593Smuzhiyun	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
711*4882a593Smuzhiyun	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
712*4882a593Smuzhiyun	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
713*4882a593Smuzhiyun	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
714*4882a593Smuzhiyun	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
715*4882a593Smuzhiyun	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
716*4882a593Smuzhiyun	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
717*4882a593Smuzhiyun	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
718*4882a593Smuzhiyun	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
719*4882a593Smuzhiyun	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
720*4882a593Smuzhiyun	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
721*4882a593Smuzhiyun	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
722*4882a593Smuzhiyun	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
723*4882a593Smuzhiyun	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
724*4882a593Smuzhiyun	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
725*4882a593Smuzhiyun	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
726*4882a593Smuzhiyun	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
727*4882a593Smuzhiyun	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
728*4882a593Smuzhiyun	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
729*4882a593Smuzhiyun	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
730*4882a593Smuzhiyun	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
731*4882a593Smuzhiyun	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
732*4882a593Smuzhiyun	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);
733*4882a593Smuzhiyun
	/* The final state lives in (RC, RD, RB, RE); %rax = dst + 64 for group 2 */
734*4882a593Smuzhiyun	leaq (4*4*4)(%rsi), %rax;
735*4882a593Smuzhiyun	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
736*4882a593Smuzhiyun	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
737*4882a593Smuzhiyun
738*4882a593Smuzhiyun	RET;
739*4882a593SmuzhiyunSYM_FUNC_END(serpent_dec_blk_8way)
740