xref: /OK3568_Linux_fs/kernel/arch/x86/crypto/serpent-sse2-i586-asm_32.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6*4882a593Smuzhiyun *
7*4882a593Smuzhiyun * Based on crypto/serpent.c by
8*4882a593Smuzhiyun *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
9*4882a593Smuzhiyun *                2003 Herbert Valerio Riedel <hvr@gnu.org>
10*4882a593Smuzhiyun */
11*4882a593Smuzhiyun
12*4882a593Smuzhiyun#include <linux/linkage.h>
13*4882a593Smuzhiyun
14*4882a593Smuzhiyun.file "serpent-sse2-i586-asm_32.S"
15*4882a593Smuzhiyun.text
16*4882a593Smuzhiyun
/*
 * Byte offsets of the stack-passed arguments.  Both entry points in this
 * file read them as arg_xxx(%esp) and never adjust %esp beforehand, so the
 * offsets are relative to %esp at function entry.
 * NOTE(review): offset 0 is presumably the return address (32-bit stack
 * calling convention) -- confirm against the C prototypes.
 */
17*4882a593Smuzhiyun#define arg_ctx 4
18*4882a593Smuzhiyun#define arg_dst 8
19*4882a593Smuzhiyun#define arg_src 12
20*4882a593Smuzhiyun#define arg_xor 16
21*4882a593Smuzhiyun
22*4882a593Smuzhiyun/**********************************************************************
23*4882a593Smuzhiyun  4-way SSE2 serpent
24*4882a593Smuzhiyun **********************************************************************/
/*
 * Register roles shared by every macro in this file:
 *   CTX      - pointer to the cipher context; get_key() reads key words
 *              relative to it
 *   RA..RE   - five working registers: four carry cipher state and the
 *              fifth is scratch; which is which rotates from macro to macro
 *   RT0, RT1 - temporaries for broadcast key words and for the transposes
 *   RNOT     - set to all ones (pcmpeqd RNOT, RNOT) at each function entry,
 *              so that "pxor RNOT, x" complements x
 */
25*4882a593Smuzhiyun#define CTX %edx
26*4882a593Smuzhiyun
27*4882a593Smuzhiyun#define RA %xmm0
28*4882a593Smuzhiyun#define RB %xmm1
29*4882a593Smuzhiyun#define RC %xmm2
30*4882a593Smuzhiyun#define RD %xmm3
31*4882a593Smuzhiyun#define RE %xmm4
32*4882a593Smuzhiyun
33*4882a593Smuzhiyun#define RT0 %xmm5
34*4882a593Smuzhiyun#define RT1 %xmm6
35*4882a593Smuzhiyun
36*4882a593Smuzhiyun#define RNOT %xmm7
37*4882a593Smuzhiyun
/*
 * get_key(i, j, t): load the 32-bit word at byte offset (4*i + j)*4 from
 * CTX into t (movd) and replicate it into all four dword lanes (pshufd $0).
 * i selects a group of four consecutive words, j the word inside the group.
 * NOTE(review): assumes the expanded key words start at offset 0 of the
 * context -- confirm against the C-side context structure.
 */
38*4882a593Smuzhiyun#define get_key(i, j, t) \
39*4882a593Smuzhiyun	movd (4*(i)+(j))*4(CTX), t; \
40*4882a593Smuzhiyun	pshufd $0, t, t;
41*4882a593Smuzhiyun
/*
 * K(x0, x1, x2, x3, x4, i): key mixing.  XORs words 0..3 of key group i
 * (each broadcast to all lanes by get_key) into x0..x3 respectively.
 * Clobbers x4, RT0 and RT1, which hold the broadcast key words.
 */
42*4882a593Smuzhiyun#define K(x0, x1, x2, x3, x4, i) \
43*4882a593Smuzhiyun	get_key(i, 0, x4); \
44*4882a593Smuzhiyun	get_key(i, 1, RT0); \
45*4882a593Smuzhiyun	get_key(i, 2, RT1); \
46*4882a593Smuzhiyun	pxor x4,		x0; \
47*4882a593Smuzhiyun	pxor RT0,		x1; \
48*4882a593Smuzhiyun	pxor RT1,		x2; \
49*4882a593Smuzhiyun	get_key(i, 3, x4); \
50*4882a593Smuzhiyun	pxor x4,		x3;
51*4882a593Smuzhiyun
/*
 * LK(x0, x1, x2, x3, x4, i): linear transformation followed by key mixing
 * with key group i.  SSE2 has no packed rotate, so each rotate is built from
 * a movdqa/pslld/psrld/por sequence with x4 as the temporary.
 * Per 32-bit lane, each line using the values produced by the lines above:
 *   x0 = rol(x0, 13);
 *   x2 = rol(x2, 3);
 *   x1 = rol(x1 ^ x0 ^ x2, 1);
 *   x3 = rol(x3 ^ x2 ^ (x0 << 3), 7);
 *   x0 = rol(x0 ^ x1 ^ x3, 5);
 *   x2 = rol(x2 ^ x3 ^ (x1 << 7), 22);
 *   x0..x3 ^= key words 0..3 of group i
 * The key XORs for x1 and x3 are issued once those registers are final,
 * interleaved with the last two rotates.
 * Clobbers x4 and RT0.
 */
52*4882a593Smuzhiyun#define LK(x0, x1, x2, x3, x4, i) \
53*4882a593Smuzhiyun	movdqa x0,		x4; \
54*4882a593Smuzhiyun	pslld $13,		x0; \
55*4882a593Smuzhiyun	psrld $(32 - 13),	x4; \
56*4882a593Smuzhiyun	por x4,			x0; \
57*4882a593Smuzhiyun	pxor x0,		x1; \
58*4882a593Smuzhiyun	movdqa x2,		x4; \
59*4882a593Smuzhiyun	pslld $3,		x2; \
60*4882a593Smuzhiyun	psrld $(32 - 3),	x4; \
61*4882a593Smuzhiyun	por x4,			x2; \
62*4882a593Smuzhiyun	pxor x2,		x1; \
63*4882a593Smuzhiyun	movdqa x1,		x4; \
64*4882a593Smuzhiyun	pslld $1,		x1; \
65*4882a593Smuzhiyun	psrld $(32 - 1),	x4; \
66*4882a593Smuzhiyun	por x4,			x1; \
67*4882a593Smuzhiyun	movdqa x0,		x4; \
68*4882a593Smuzhiyun	pslld $3,		x4; \
69*4882a593Smuzhiyun	pxor x2,		x3; \
70*4882a593Smuzhiyun	pxor x4,		x3; \
71*4882a593Smuzhiyun	movdqa x3,		x4; \
72*4882a593Smuzhiyun	pslld $7,		x3; \
73*4882a593Smuzhiyun	psrld $(32 - 7),	x4; \
74*4882a593Smuzhiyun	por x4,			x3; \
75*4882a593Smuzhiyun	movdqa x1,		x4; \
76*4882a593Smuzhiyun	pslld $7,		x4; \
77*4882a593Smuzhiyun	pxor x1,		x0; \
78*4882a593Smuzhiyun	pxor x3,		x0; \
79*4882a593Smuzhiyun	pxor x3,		x2; \
80*4882a593Smuzhiyun	pxor x4,		x2; \
81*4882a593Smuzhiyun	movdqa x0,		x4; \
82*4882a593Smuzhiyun	get_key(i, 1, RT0); \
83*4882a593Smuzhiyun	pxor RT0,		x1; \
84*4882a593Smuzhiyun	get_key(i, 3, RT0); \
85*4882a593Smuzhiyun	pxor RT0,		x3; \
86*4882a593Smuzhiyun	pslld $5,		x0; \
87*4882a593Smuzhiyun	psrld $(32 - 5),	x4; \
88*4882a593Smuzhiyun	por x4,			x0; \
89*4882a593Smuzhiyun	movdqa x2,		x4; \
90*4882a593Smuzhiyun	pslld $22,		x2; \
91*4882a593Smuzhiyun	psrld $(32 - 22),	x4; \
92*4882a593Smuzhiyun	por x4,			x2; \
93*4882a593Smuzhiyun	get_key(i, 0, RT0); \
94*4882a593Smuzhiyun	pxor RT0,		x0; \
95*4882a593Smuzhiyun	get_key(i, 2, RT0); \
96*4882a593Smuzhiyun	pxor RT0,		x2;
97*4882a593Smuzhiyun
/*
 * KL(x0, x1, x2, x3, x4, i): key mixing with key group i followed by the
 * inverse linear transformation (the LK steps undone in reverse order, with
 * right rotates built from movdqa/psrld/pslld/por).
 * Per 32-bit lane, each line using the values produced by the lines above:
 *   x0..x3 ^= key words 0..3 of group i        (via K)
 *   x0 = ror(x0, 5)  ^ x3 ^ x1;
 *   x2 = ror(x2, 22) ^ x3 ^ (x1 << 7);
 *   x1 = ror(x1, 1)  ^ x0 ^ x2;
 *   x3 = ror(x3, 7)  ^ (x0 << 3) ^ x2;
 *   x0 = ror(x0, 13);
 *   x2 = ror(x2, 3);
 * Clobbers x4, plus RT0 and RT1 through K.
 */
98*4882a593Smuzhiyun#define KL(x0, x1, x2, x3, x4, i) \
99*4882a593Smuzhiyun	K(x0, x1, x2, x3, x4, i); \
100*4882a593Smuzhiyun	movdqa x0,		x4; \
101*4882a593Smuzhiyun	psrld $5,		x0; \
102*4882a593Smuzhiyun	pslld $(32 - 5),	x4; \
103*4882a593Smuzhiyun	por x4,			x0; \
104*4882a593Smuzhiyun	movdqa x2,		x4; \
105*4882a593Smuzhiyun	psrld $22,		x2; \
106*4882a593Smuzhiyun	pslld $(32 - 22),	x4; \
107*4882a593Smuzhiyun	por x4,			x2; \
108*4882a593Smuzhiyun	pxor x3,		x2; \
109*4882a593Smuzhiyun	pxor x3,		x0; \
110*4882a593Smuzhiyun	movdqa x1,		x4; \
111*4882a593Smuzhiyun	pslld $7,		x4; \
112*4882a593Smuzhiyun	pxor x1,		x0; \
113*4882a593Smuzhiyun	pxor x4,		x2; \
114*4882a593Smuzhiyun	movdqa x1,		x4; \
115*4882a593Smuzhiyun	psrld $1,		x1; \
116*4882a593Smuzhiyun	pslld $(32 - 1),	x4; \
117*4882a593Smuzhiyun	por x4,			x1; \
118*4882a593Smuzhiyun	movdqa x3,		x4; \
119*4882a593Smuzhiyun	psrld $7,		x3; \
120*4882a593Smuzhiyun	pslld $(32 - 7),	x4; \
121*4882a593Smuzhiyun	por x4,			x3; \
122*4882a593Smuzhiyun	pxor x0,		x1; \
123*4882a593Smuzhiyun	movdqa x0,		x4; \
124*4882a593Smuzhiyun	pslld $3,		x4; \
125*4882a593Smuzhiyun	pxor x4,		x3; \
126*4882a593Smuzhiyun	movdqa x0,		x4; \
127*4882a593Smuzhiyun	psrld $13,		x0; \
128*4882a593Smuzhiyun	pslld $(32 - 13),	x4; \
129*4882a593Smuzhiyun	por x4,			x0; \
130*4882a593Smuzhiyun	pxor x2,		x1; \
131*4882a593Smuzhiyun	pxor x2,		x3; \
132*4882a593Smuzhiyun	movdqa x2,		x4; \
133*4882a593Smuzhiyun	psrld $3,		x2; \
134*4882a593Smuzhiyun	pslld $(32 - 3),	x4; \
135*4882a593Smuzhiyun	por x4,			x2;
136*4882a593Smuzhiyun
/*
 * S0(x0, x1, x2, x3, x4): S-box step used by the encryption rounds, evaluated
 * as a fixed sequence of pand/por/pxor on whole registers, so every bit
 * position is processed independently.  "pxor RNOT, x" complements x.
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The encryption routine passes (x2, x1, x3, x0) as the state of the
 * following LK and reuses x4 as its scratch register.
 * NOTE(review): presumably Serpent S-box 0 in bitsliced form -- the truth
 * table was not re-derived here.
 */
137*4882a593Smuzhiyun#define S0(x0, x1, x2, x3, x4) \
138*4882a593Smuzhiyun	movdqa x3,		x4; \
139*4882a593Smuzhiyun	por x0,			x3; \
140*4882a593Smuzhiyun	pxor x4,		x0; \
141*4882a593Smuzhiyun	pxor x2,		x4; \
142*4882a593Smuzhiyun	pxor RNOT,		x4; \
143*4882a593Smuzhiyun	pxor x1,		x3; \
144*4882a593Smuzhiyun	pand x0,		x1; \
145*4882a593Smuzhiyun	pxor x4,		x1; \
146*4882a593Smuzhiyun	pxor x0,		x2; \
147*4882a593Smuzhiyun	pxor x3,		x0; \
148*4882a593Smuzhiyun	por x0,			x4; \
149*4882a593Smuzhiyun	pxor x2,		x0; \
150*4882a593Smuzhiyun	pand x1,		x2; \
151*4882a593Smuzhiyun	pxor x2,		x3; \
152*4882a593Smuzhiyun	pxor RNOT,		x1; \
153*4882a593Smuzhiyun	pxor x4,		x2; \
154*4882a593Smuzhiyun	pxor x2,		x1;
155*4882a593Smuzhiyun
/*
 * S1(x0, x1, x2, x3, x4): S-box step used by the encryption rounds, evaluated
 * as a fixed sequence of pand/por/pxor on whole registers.
 * "pxor RNOT, x" complements x.
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The encryption routine passes (x4, x2, x3, x0) as the state of the
 * following LK and reuses x1 as its scratch register.
 * NOTE(review): presumably Serpent S-box 1 in bitsliced form -- the truth
 * table was not re-derived here.
 */
156*4882a593Smuzhiyun#define S1(x0, x1, x2, x3, x4) \
157*4882a593Smuzhiyun	movdqa x1,		x4; \
158*4882a593Smuzhiyun	pxor x0,		x1; \
159*4882a593Smuzhiyun	pxor x3,		x0; \
160*4882a593Smuzhiyun	pxor RNOT,		x3; \
161*4882a593Smuzhiyun	pand x1,		x4; \
162*4882a593Smuzhiyun	por x1,			x0; \
163*4882a593Smuzhiyun	pxor x2,		x3; \
164*4882a593Smuzhiyun	pxor x3,		x0; \
165*4882a593Smuzhiyun	pxor x3,		x1; \
166*4882a593Smuzhiyun	pxor x4,		x3; \
167*4882a593Smuzhiyun	por x4,			x1; \
168*4882a593Smuzhiyun	pxor x2,		x4; \
169*4882a593Smuzhiyun	pand x0,		x2; \
170*4882a593Smuzhiyun	pxor x1,		x2; \
171*4882a593Smuzhiyun	por x0,			x1; \
172*4882a593Smuzhiyun	pxor RNOT,		x0; \
173*4882a593Smuzhiyun	pxor x2,		x0; \
174*4882a593Smuzhiyun	pxor x1,		x4;
175*4882a593Smuzhiyun
/*
 * S2(x0, x1, x2, x3, x4): S-box step used by the encryption rounds, evaluated
 * as a fixed sequence of pand/por/pxor on whole registers.
 * "pxor RNOT, x" complements x.
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The encryption routine passes (x4, x1, x0, x3) as the state of the
 * following LK and reuses x2 as its scratch register.
 * NOTE(review): presumably Serpent S-box 2 in bitsliced form -- the truth
 * table was not re-derived here.
 */
176*4882a593Smuzhiyun#define S2(x0, x1, x2, x3, x4) \
177*4882a593Smuzhiyun	pxor RNOT,		x3; \
178*4882a593Smuzhiyun	pxor x0,		x1; \
179*4882a593Smuzhiyun	movdqa x0,		x4; \
180*4882a593Smuzhiyun	pand x2,		x0; \
181*4882a593Smuzhiyun	pxor x3,		x0; \
182*4882a593Smuzhiyun	por x4,			x3; \
183*4882a593Smuzhiyun	pxor x1,		x2; \
184*4882a593Smuzhiyun	pxor x1,		x3; \
185*4882a593Smuzhiyun	pand x0,		x1; \
186*4882a593Smuzhiyun	pxor x2,		x0; \
187*4882a593Smuzhiyun	pand x3,		x2; \
188*4882a593Smuzhiyun	por x1,			x3; \
189*4882a593Smuzhiyun	pxor RNOT,		x0; \
190*4882a593Smuzhiyun	pxor x0,		x3; \
191*4882a593Smuzhiyun	pxor x0,		x4; \
192*4882a593Smuzhiyun	pxor x2,		x0; \
193*4882a593Smuzhiyun	por x2,			x1;
194*4882a593Smuzhiyun
/*
 * S3(x0, x1, x2, x3, x4): S-box step used by the encryption rounds, evaluated
 * as a fixed sequence of pand/por/pxor on whole registers (this one needs no
 * complement, so RNOT is not used).
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The encryption routine passes (x3, x4, x1, x0) as the state of the
 * following LK and reuses x2 as its scratch register.
 * NOTE(review): presumably Serpent S-box 3 in bitsliced form -- the truth
 * table was not re-derived here.
 */
195*4882a593Smuzhiyun#define S3(x0, x1, x2, x3, x4) \
196*4882a593Smuzhiyun	movdqa x1,		x4; \
197*4882a593Smuzhiyun	pxor x3,		x1; \
198*4882a593Smuzhiyun	por x0,			x3; \
199*4882a593Smuzhiyun	pand x0,		x4; \
200*4882a593Smuzhiyun	pxor x2,		x0; \
201*4882a593Smuzhiyun	pxor x1,		x2; \
202*4882a593Smuzhiyun	pand x3,		x1; \
203*4882a593Smuzhiyun	pxor x3,		x2; \
204*4882a593Smuzhiyun	por x4,			x0; \
205*4882a593Smuzhiyun	pxor x3,		x4; \
206*4882a593Smuzhiyun	pxor x0,		x1; \
207*4882a593Smuzhiyun	pand x3,		x0; \
208*4882a593Smuzhiyun	pand x4,		x3; \
209*4882a593Smuzhiyun	pxor x2,		x3; \
210*4882a593Smuzhiyun	por x1,			x4; \
211*4882a593Smuzhiyun	pand x1,		x2; \
212*4882a593Smuzhiyun	pxor x3,		x4; \
213*4882a593Smuzhiyun	pxor x3,		x0; \
214*4882a593Smuzhiyun	pxor x2,		x3;
215*4882a593Smuzhiyun
/*
 * S4(x0, x1, x2, x3, x4): S-box step used by the encryption rounds, evaluated
 * as a fixed sequence of pand/por/pxor on whole registers.
 * "pxor RNOT, x" complements x.
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The encryption routine passes (x1, x2, x3, x4) as the state of the
 * following LK and reuses x0 as its scratch register.
 * NOTE(review): presumably Serpent S-box 4 in bitsliced form -- the truth
 * table was not re-derived here.
 */
216*4882a593Smuzhiyun#define S4(x0, x1, x2, x3, x4) \
217*4882a593Smuzhiyun	movdqa x3,		x4; \
218*4882a593Smuzhiyun	pand x0,		x3; \
219*4882a593Smuzhiyun	pxor x4,		x0; \
220*4882a593Smuzhiyun	pxor x2,		x3; \
221*4882a593Smuzhiyun	por x4,			x2; \
222*4882a593Smuzhiyun	pxor x1,		x0; \
223*4882a593Smuzhiyun	pxor x3,		x4; \
224*4882a593Smuzhiyun	por x0,			x2; \
225*4882a593Smuzhiyun	pxor x1,		x2; \
226*4882a593Smuzhiyun	pand x0,		x1; \
227*4882a593Smuzhiyun	pxor x4,		x1; \
228*4882a593Smuzhiyun	pand x2,		x4; \
229*4882a593Smuzhiyun	pxor x3,		x2; \
230*4882a593Smuzhiyun	pxor x0,		x4; \
231*4882a593Smuzhiyun	por x1,			x3; \
232*4882a593Smuzhiyun	pxor RNOT,		x1; \
233*4882a593Smuzhiyun	pxor x0,		x3;
234*4882a593Smuzhiyun
/*
 * S5(x0, x1, x2, x3, x4): S-box step used by the encryption rounds, evaluated
 * as a fixed sequence of pand/por/pxor on whole registers.
 * "pxor RNOT, x" complements x.
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The encryption routine passes (x4, x0, x1, x3) as the state of the
 * following LK and reuses x2 as its scratch register.
 * NOTE(review): presumably Serpent S-box 5 in bitsliced form -- the truth
 * table was not re-derived here.
 */
235*4882a593Smuzhiyun#define S5(x0, x1, x2, x3, x4) \
236*4882a593Smuzhiyun	movdqa x1,		x4; \
237*4882a593Smuzhiyun	por x0,			x1; \
238*4882a593Smuzhiyun	pxor x1,		x2; \
239*4882a593Smuzhiyun	pxor RNOT,		x3; \
240*4882a593Smuzhiyun	pxor x0,		x4; \
241*4882a593Smuzhiyun	pxor x2,		x0; \
242*4882a593Smuzhiyun	pand x4,		x1; \
243*4882a593Smuzhiyun	por x3,			x4; \
244*4882a593Smuzhiyun	pxor x0,		x4; \
245*4882a593Smuzhiyun	pand x3,		x0; \
246*4882a593Smuzhiyun	pxor x3,		x1; \
247*4882a593Smuzhiyun	pxor x2,		x3; \
248*4882a593Smuzhiyun	pxor x1,		x0; \
249*4882a593Smuzhiyun	pand x4,		x2; \
250*4882a593Smuzhiyun	pxor x2,		x1; \
251*4882a593Smuzhiyun	pand x0,		x2; \
252*4882a593Smuzhiyun	pxor x2,		x3;
253*4882a593Smuzhiyun
/*
 * S6(x0, x1, x2, x3, x4): S-box step used by the encryption rounds, evaluated
 * as a fixed sequence of pand/por/pxor on whole registers.
 * "pxor RNOT, x" complements x.
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The encryption routine passes (x2, x4, x1, x3) as the state of the
 * following LK and reuses x0 as its scratch register.
 * NOTE(review): presumably Serpent S-box 6 in bitsliced form -- the truth
 * table was not re-derived here.
 */
254*4882a593Smuzhiyun#define S6(x0, x1, x2, x3, x4) \
255*4882a593Smuzhiyun	movdqa x1,		x4; \
256*4882a593Smuzhiyun	pxor x0,		x3; \
257*4882a593Smuzhiyun	pxor x2,		x1; \
258*4882a593Smuzhiyun	pxor x0,		x2; \
259*4882a593Smuzhiyun	pand x3,		x0; \
260*4882a593Smuzhiyun	por x3,			x1; \
261*4882a593Smuzhiyun	pxor RNOT,		x4; \
262*4882a593Smuzhiyun	pxor x1,		x0; \
263*4882a593Smuzhiyun	pxor x2,		x1; \
264*4882a593Smuzhiyun	pxor x4,		x3; \
265*4882a593Smuzhiyun	pxor x0,		x4; \
266*4882a593Smuzhiyun	pand x0,		x2; \
267*4882a593Smuzhiyun	pxor x1,		x4; \
268*4882a593Smuzhiyun	pxor x3,		x2; \
269*4882a593Smuzhiyun	pand x1,		x3; \
270*4882a593Smuzhiyun	pxor x0,		x3; \
271*4882a593Smuzhiyun	pxor x2,		x1;
272*4882a593Smuzhiyun
/*
 * S7(x0, x1, x2, x3, x4): S-box step used by the encryption rounds, evaluated
 * as a fixed sequence of pand/por/pxor on whole registers.
 * "pxor RNOT, x" complements x.
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The encryption routine passes (x4, x2, x3, x0) as the state of the
 * following LK (or of the final K) and reuses x1 as its scratch register.
 * NOTE(review): presumably Serpent S-box 7 in bitsliced form -- the truth
 * table was not re-derived here.
 */
273*4882a593Smuzhiyun#define S7(x0, x1, x2, x3, x4) \
274*4882a593Smuzhiyun	pxor RNOT,		x1; \
275*4882a593Smuzhiyun	movdqa x1,		x4; \
276*4882a593Smuzhiyun	pxor RNOT,		x0; \
277*4882a593Smuzhiyun	pand x2,		x1; \
278*4882a593Smuzhiyun	pxor x3,		x1; \
279*4882a593Smuzhiyun	por x4,			x3; \
280*4882a593Smuzhiyun	pxor x2,		x4; \
281*4882a593Smuzhiyun	pxor x3,		x2; \
282*4882a593Smuzhiyun	pxor x0,		x3; \
283*4882a593Smuzhiyun	por x1,			x0; \
284*4882a593Smuzhiyun	pand x0,		x2; \
285*4882a593Smuzhiyun	pxor x4,		x0; \
286*4882a593Smuzhiyun	pxor x3,		x4; \
287*4882a593Smuzhiyun	pand x0,		x3; \
288*4882a593Smuzhiyun	pxor x1,		x4; \
289*4882a593Smuzhiyun	pxor x4,		x2; \
290*4882a593Smuzhiyun	pxor x1,		x3; \
291*4882a593Smuzhiyun	por x0,			x4; \
292*4882a593Smuzhiyun	pxor x1,		x4;
293*4882a593Smuzhiyun
/*
 * SI0(x0, x1, x2, x3, x4): S-box step used by the decryption rounds,
 * evaluated as a fixed sequence of pand/por/pxor on whole registers.
 * "pxor RNOT, x" complements x.
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The decryption routine passes (x2, x4, x1, x0) as the state of the
 * following KL (or of the final K) and reuses x3 as its scratch register.
 * NOTE(review): presumably the inverse of Serpent S-box 0 in bitsliced
 * form -- the truth table was not re-derived here.
 */
294*4882a593Smuzhiyun#define SI0(x0, x1, x2, x3, x4) \
295*4882a593Smuzhiyun	movdqa x3,		x4; \
296*4882a593Smuzhiyun	pxor x0,		x1; \
297*4882a593Smuzhiyun	por x1,			x3; \
298*4882a593Smuzhiyun	pxor x1,		x4; \
299*4882a593Smuzhiyun	pxor RNOT,		x0; \
300*4882a593Smuzhiyun	pxor x3,		x2; \
301*4882a593Smuzhiyun	pxor x0,		x3; \
302*4882a593Smuzhiyun	pand x1,		x0; \
303*4882a593Smuzhiyun	pxor x2,		x0; \
304*4882a593Smuzhiyun	pand x3,		x2; \
305*4882a593Smuzhiyun	pxor x4,		x3; \
306*4882a593Smuzhiyun	pxor x3,		x2; \
307*4882a593Smuzhiyun	pxor x3,		x1; \
308*4882a593Smuzhiyun	pand x0,		x3; \
309*4882a593Smuzhiyun	pxor x0,		x1; \
310*4882a593Smuzhiyun	pxor x2,		x0; \
311*4882a593Smuzhiyun	pxor x3,		x4;
312*4882a593Smuzhiyun
/*
 * SI1(x0, x1, x2, x3, x4): S-box step used by the decryption rounds,
 * evaluated as a fixed sequence of pand/por/pxor on whole registers.
 * "pxor RNOT, x" complements x.
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The decryption routine passes (x4, x1, x2, x3) as the state of the
 * following KL and reuses x0 as its scratch register.
 * NOTE(review): presumably the inverse of Serpent S-box 1 in bitsliced
 * form -- the truth table was not re-derived here.
 */
313*4882a593Smuzhiyun#define SI1(x0, x1, x2, x3, x4) \
314*4882a593Smuzhiyun	pxor x3,		x1; \
315*4882a593Smuzhiyun	movdqa x0,		x4; \
316*4882a593Smuzhiyun	pxor x2,		x0; \
317*4882a593Smuzhiyun	pxor RNOT,		x2; \
318*4882a593Smuzhiyun	por x1,			x4; \
319*4882a593Smuzhiyun	pxor x3,		x4; \
320*4882a593Smuzhiyun	pand x1,		x3; \
321*4882a593Smuzhiyun	pxor x2,		x1; \
322*4882a593Smuzhiyun	pand x4,		x2; \
323*4882a593Smuzhiyun	pxor x1,		x4; \
324*4882a593Smuzhiyun	por x3,			x1; \
325*4882a593Smuzhiyun	pxor x0,		x3; \
326*4882a593Smuzhiyun	pxor x0,		x2; \
327*4882a593Smuzhiyun	por x4,			x0; \
328*4882a593Smuzhiyun	pxor x4,		x2; \
329*4882a593Smuzhiyun	pxor x0,		x1; \
330*4882a593Smuzhiyun	pxor x1,		x4;
331*4882a593Smuzhiyun
/*
 * SI2(x0, x1, x2, x3, x4): S-box step used by the decryption rounds,
 * evaluated as a fixed sequence of pand/por/pxor on whole registers.
 * "pxor RNOT, x" complements x.
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The decryption routine passes (x1, x4, x3, x2) as the state of the
 * following KL and reuses x0 as its scratch register.
 * NOTE(review): presumably the inverse of Serpent S-box 2 in bitsliced
 * form -- the truth table was not re-derived here.
 */
332*4882a593Smuzhiyun#define SI2(x0, x1, x2, x3, x4) \
333*4882a593Smuzhiyun	pxor x1,		x2; \
334*4882a593Smuzhiyun	movdqa x3,		x4; \
335*4882a593Smuzhiyun	pxor RNOT,		x3; \
336*4882a593Smuzhiyun	por x2,			x3; \
337*4882a593Smuzhiyun	pxor x4,		x2; \
338*4882a593Smuzhiyun	pxor x0,		x4; \
339*4882a593Smuzhiyun	pxor x1,		x3; \
340*4882a593Smuzhiyun	por x2,			x1; \
341*4882a593Smuzhiyun	pxor x0,		x2; \
342*4882a593Smuzhiyun	pxor x4,		x1; \
343*4882a593Smuzhiyun	por x3,			x4; \
344*4882a593Smuzhiyun	pxor x3,		x2; \
345*4882a593Smuzhiyun	pxor x2,		x4; \
346*4882a593Smuzhiyun	pand x1,		x2; \
347*4882a593Smuzhiyun	pxor x3,		x2; \
348*4882a593Smuzhiyun	pxor x4,		x3; \
349*4882a593Smuzhiyun	pxor x0,		x4;
350*4882a593Smuzhiyun
/*
 * SI3(x0, x1, x2, x3, x4): S-box step used by the decryption rounds,
 * evaluated as a fixed sequence of pand/por/pxor on whole registers (this
 * one needs no complement, so RNOT is not used).
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The decryption routine passes (x2, x0, x4, x3) as the state of the
 * following KL and reuses x1 as its scratch register.
 * NOTE(review): presumably the inverse of Serpent S-box 3 in bitsliced
 * form -- the truth table was not re-derived here.
 */
351*4882a593Smuzhiyun#define SI3(x0, x1, x2, x3, x4) \
352*4882a593Smuzhiyun	pxor x1,		x2; \
353*4882a593Smuzhiyun	movdqa x1,		x4; \
354*4882a593Smuzhiyun	pand x2,		x1; \
355*4882a593Smuzhiyun	pxor x0,		x1; \
356*4882a593Smuzhiyun	por x4,			x0; \
357*4882a593Smuzhiyun	pxor x3,		x4; \
358*4882a593Smuzhiyun	pxor x3,		x0; \
359*4882a593Smuzhiyun	por x1,			x3; \
360*4882a593Smuzhiyun	pxor x2,		x1; \
361*4882a593Smuzhiyun	pxor x3,		x1; \
362*4882a593Smuzhiyun	pxor x2,		x0; \
363*4882a593Smuzhiyun	pxor x3,		x2; \
364*4882a593Smuzhiyun	pand x1,		x3; \
365*4882a593Smuzhiyun	pxor x0,		x1; \
366*4882a593Smuzhiyun	pand x2,		x0; \
367*4882a593Smuzhiyun	pxor x3,		x4; \
368*4882a593Smuzhiyun	pxor x0,		x3; \
369*4882a593Smuzhiyun	pxor x1,		x0;
370*4882a593Smuzhiyun
/*
 * SI4(x0, x1, x2, x3, x4): S-box step used by the decryption rounds,
 * evaluated as a fixed sequence of pand/por/pxor on whole registers.
 * "pxor RNOT, x" complements x.
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The decryption routine passes (x0, x2, x4, x3) as the state of the
 * following KL and reuses x1 as its scratch register.
 * NOTE(review): presumably the inverse of Serpent S-box 4 in bitsliced
 * form -- the truth table was not re-derived here.
 */
371*4882a593Smuzhiyun#define SI4(x0, x1, x2, x3, x4) \
372*4882a593Smuzhiyun	pxor x3,		x2; \
373*4882a593Smuzhiyun	movdqa x0,		x4; \
374*4882a593Smuzhiyun	pand x1,		x0; \
375*4882a593Smuzhiyun	pxor x2,		x0; \
376*4882a593Smuzhiyun	por x3,			x2; \
377*4882a593Smuzhiyun	pxor RNOT,		x4; \
378*4882a593Smuzhiyun	pxor x0,		x1; \
379*4882a593Smuzhiyun	pxor x2,		x0; \
380*4882a593Smuzhiyun	pand x4,		x2; \
381*4882a593Smuzhiyun	pxor x0,		x2; \
382*4882a593Smuzhiyun	por x4,			x0; \
383*4882a593Smuzhiyun	pxor x3,		x0; \
384*4882a593Smuzhiyun	pand x2,		x3; \
385*4882a593Smuzhiyun	pxor x3,		x4; \
386*4882a593Smuzhiyun	pxor x1,		x3; \
387*4882a593Smuzhiyun	pand x0,		x1; \
388*4882a593Smuzhiyun	pxor x1,		x4; \
389*4882a593Smuzhiyun	pxor x3,		x0;
390*4882a593Smuzhiyun
/*
 * SI5(x0, x1, x2, x3, x4): S-box step used by the decryption rounds,
 * evaluated as a fixed sequence of pand/por/pxor on whole registers.
 * "pxor RNOT, x" complements x.
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The decryption routine passes (x1, x4, x0, x2) as the state of the
 * following KL and reuses x3 as its scratch register.
 * NOTE(review): presumably the inverse of Serpent S-box 5 in bitsliced
 * form -- the truth table was not re-derived here.
 */
391*4882a593Smuzhiyun#define SI5(x0, x1, x2, x3, x4) \
392*4882a593Smuzhiyun	movdqa x1,		x4; \
393*4882a593Smuzhiyun	por x2,			x1; \
394*4882a593Smuzhiyun	pxor x4,		x2; \
395*4882a593Smuzhiyun	pxor x3,		x1; \
396*4882a593Smuzhiyun	pand x4,		x3; \
397*4882a593Smuzhiyun	pxor x3,		x2; \
398*4882a593Smuzhiyun	por x0,			x3; \
399*4882a593Smuzhiyun	pxor RNOT,		x0; \
400*4882a593Smuzhiyun	pxor x2,		x3; \
401*4882a593Smuzhiyun	por x0,			x2; \
402*4882a593Smuzhiyun	pxor x1,		x4; \
403*4882a593Smuzhiyun	pxor x4,		x2; \
404*4882a593Smuzhiyun	pand x0,		x4; \
405*4882a593Smuzhiyun	pxor x1,		x0; \
406*4882a593Smuzhiyun	pxor x3,		x1; \
407*4882a593Smuzhiyun	pand x2,		x0; \
408*4882a593Smuzhiyun	pxor x3,		x2; \
409*4882a593Smuzhiyun	pxor x2,		x0; \
410*4882a593Smuzhiyun	pxor x4,		x2; \
411*4882a593Smuzhiyun	pxor x3,		x4;
412*4882a593Smuzhiyun
/*
 * SI6(x0, x1, x2, x3, x4): S-box step used by the decryption rounds,
 * evaluated as a fixed sequence of pand/por/pxor on whole registers.
 * "pxor RNOT, x" complements x.
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The decryption routine passes (x2, x4, x3, x0) as the state of the
 * following KL and reuses x1 as its scratch register.
 * NOTE(review): presumably the inverse of Serpent S-box 6 in bitsliced
 * form -- the truth table was not re-derived here.
 */
413*4882a593Smuzhiyun#define SI6(x0, x1, x2, x3, x4) \
414*4882a593Smuzhiyun	pxor x2,		x0; \
415*4882a593Smuzhiyun	movdqa x0,		x4; \
416*4882a593Smuzhiyun	pand x3,		x0; \
417*4882a593Smuzhiyun	pxor x3,		x2; \
418*4882a593Smuzhiyun	pxor x2,		x0; \
419*4882a593Smuzhiyun	pxor x1,		x3; \
420*4882a593Smuzhiyun	por x4,			x2; \
421*4882a593Smuzhiyun	pxor x3,		x2; \
422*4882a593Smuzhiyun	pand x0,		x3; \
423*4882a593Smuzhiyun	pxor RNOT,		x0; \
424*4882a593Smuzhiyun	pxor x1,		x3; \
425*4882a593Smuzhiyun	pand x2,		x1; \
426*4882a593Smuzhiyun	pxor x0,		x4; \
427*4882a593Smuzhiyun	pxor x4,		x3; \
428*4882a593Smuzhiyun	pxor x2,		x4; \
429*4882a593Smuzhiyun	pxor x1,		x0; \
430*4882a593Smuzhiyun	pxor x0,		x2;
431*4882a593Smuzhiyun
/*
 * SI7(x0, x1, x2, x3, x4): S-box step used by the decryption rounds,
 * evaluated as a fixed sequence of pand/por/pxor on whole registers.
 * "pxor RNOT, x" complements x.
 * Inputs are x0..x3; x4 is overwritten before it is read.
 * The decryption routine passes (x1, x3, x0, x4) as the state of the
 * following KL and reuses x2 as its scratch register.
 * NOTE(review): presumably the inverse of Serpent S-box 7 in bitsliced
 * form -- the truth table was not re-derived here.
 */
432*4882a593Smuzhiyun#define SI7(x0, x1, x2, x3, x4) \
433*4882a593Smuzhiyun	movdqa x3,		x4; \
434*4882a593Smuzhiyun	pand x0,		x3; \
435*4882a593Smuzhiyun	pxor x2,		x0; \
436*4882a593Smuzhiyun	por x4,			x2; \
437*4882a593Smuzhiyun	pxor x1,		x4; \
438*4882a593Smuzhiyun	pxor RNOT,		x0; \
439*4882a593Smuzhiyun	por x3,			x1; \
440*4882a593Smuzhiyun	pxor x0,		x4; \
441*4882a593Smuzhiyun	pand x2,		x0; \
442*4882a593Smuzhiyun	pxor x1,		x0; \
443*4882a593Smuzhiyun	pand x2,		x1; \
444*4882a593Smuzhiyun	pxor x2,		x3; \
445*4882a593Smuzhiyun	pxor x3,		x4; \
446*4882a593Smuzhiyun	pand x3,		x2; \
447*4882a593Smuzhiyun	por x0,			x3; \
448*4882a593Smuzhiyun	pxor x4,		x1; \
449*4882a593Smuzhiyun	pxor x4,		x3; \
450*4882a593Smuzhiyun	pand x0,		x4; \
451*4882a593Smuzhiyun	pxor x2,		x4;
452*4882a593Smuzhiyun
/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): transpose, in place, the 4x4
 * matrix of 32-bit words whose rows are x0..x3.
 * With x0 = [a0 a1 a2 a3], x1 = [b0 ..], x2 = [c0 ..], x3 = [d0 ..] on entry,
 * the result is x0 = [a0 b0 c0 d0], x1 = [a1 b1 c1 d1], x2 = [a2 b2 c2 d2],
 * x3 = [a3 b3 c3 d3].  The dword unpacks interleave row pairs, then the
 * qword unpacks combine the halves.
 * Clobbers t1 and t2; t0 is accepted but not used.
 */
453*4882a593Smuzhiyun#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
454*4882a593Smuzhiyun	movdqa x0,		t2; \
455*4882a593Smuzhiyun	punpckldq x1,		x0; \
456*4882a593Smuzhiyun	punpckhdq x1,		t2; \
457*4882a593Smuzhiyun	movdqa x2,		t1; \
458*4882a593Smuzhiyun	punpckhdq x3,		x2; \
459*4882a593Smuzhiyun	punpckldq x3,		t1; \
460*4882a593Smuzhiyun	movdqa x0,		x1; \
461*4882a593Smuzhiyun	punpcklqdq t1,		x0; \
462*4882a593Smuzhiyun	punpckhqdq t1,		x1; \
463*4882a593Smuzhiyun	movdqa t2,		x3; \
464*4882a593Smuzhiyun	punpcklqdq x2,		t2; \
465*4882a593Smuzhiyun	punpckhqdq x2,		x3; \
466*4882a593Smuzhiyun	movdqa t2,		x2;
467*4882a593Smuzhiyun
/*
 * read_blocks(in, x0, x1, x2, x3, t0, t1, t2): load four consecutive
 * 16-byte blocks from the address in `in` (unaligned loads) into x0..x3 and
 * transpose them, so that afterwards xN holds 32-bit word N of each of the
 * four blocks, one block per lane.
 * Clobbers t1 and t2 through transpose_4x4; t0 is not used.
 */
468*4882a593Smuzhiyun#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
469*4882a593Smuzhiyun	movdqu (0*4*4)(in),	x0; \
470*4882a593Smuzhiyun	movdqu (1*4*4)(in),	x1; \
471*4882a593Smuzhiyun	movdqu (2*4*4)(in),	x2; \
472*4882a593Smuzhiyun	movdqu (3*4*4)(in),	x3; \
473*4882a593Smuzhiyun	\
474*4882a593Smuzhiyun	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
475*4882a593Smuzhiyun
/*
 * write_blocks(out, x0, x1, x2, x3, t0, t1, t2): transpose x0..x3 back to
 * one block per register and store them as four consecutive 16-byte blocks
 * at the address in `out` (unaligned stores).
 * Clobbers t1 and t2 through transpose_4x4; t0 is not used.
 */
476*4882a593Smuzhiyun#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
477*4882a593Smuzhiyun	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
478*4882a593Smuzhiyun	\
479*4882a593Smuzhiyun	movdqu x0, (0*4*4)(out); \
480*4882a593Smuzhiyun	movdqu x1, (1*4*4)(out); \
481*4882a593Smuzhiyun	movdqu x2, (2*4*4)(out); \
482*4882a593Smuzhiyun	movdqu x3, (3*4*4)(out);
483*4882a593Smuzhiyun
/*
 * xor_blocks(out, x0, x1, x2, x3, t0, t1, t2): like write_blocks, but each
 * 16-byte block is XORed with the data already stored at `out` before being
 * written back: out[n] = out[n] ^ xN after the transpose.
 * t0 holds the existing contents of the block being processed, so the
 * load/xor/store triples must stay in this order.
 * Clobbers t0, plus t1 and t2 through transpose_4x4.
 */
484*4882a593Smuzhiyun#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
485*4882a593Smuzhiyun	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
486*4882a593Smuzhiyun	\
487*4882a593Smuzhiyun	movdqu (0*4*4)(out),	t0; \
488*4882a593Smuzhiyun	pxor t0,		x0; \
489*4882a593Smuzhiyun	movdqu x0,		(0*4*4)(out); \
490*4882a593Smuzhiyun	movdqu (1*4*4)(out),	t0; \
491*4882a593Smuzhiyun	pxor t0,		x1; \
492*4882a593Smuzhiyun	movdqu x1,		(1*4*4)(out); \
493*4882a593Smuzhiyun	movdqu (2*4*4)(out),	t0; \
494*4882a593Smuzhiyun	pxor t0,		x2; \
495*4882a593Smuzhiyun	movdqu x2,		(2*4*4)(out); \
496*4882a593Smuzhiyun	movdqu (3*4*4)(out),	t0; \
497*4882a593Smuzhiyun	pxor t0,		x3; \
498*4882a593Smuzhiyun	movdqu x3,		(3*4*4)(out);
499*4882a593Smuzhiyun
/*
 * __serpent_enc_blk_4way: encrypt four 16-byte blocks in parallel.
 *
 * Loads 64 bytes from src, mixes in key group 0, then runs 32 S-box steps
 * cycling through S0..S7.  The first 31 are each followed by LK (linear
 * transformation plus the next key group, 1..31); the last S7 is followed
 * by a plain K with key group 32.  Every macro call names its registers in
 * the order the previous macro left its results, so no register moves are
 * needed between rounds.
 *
 * The result is stored to dst, or XORed into the 64 bytes already at dst
 * when the byte at arg_xor(%esp) is non-zero.
 *
 * Writes %eax, %edx (CTX), %xmm0-%xmm7 and the flags; nothing is saved or
 * restored here.
 * NOTE(review): RET is a macro defined outside this file (presumably via
 * <linux/linkage.h>) -- confirm.
 */
500*4882a593SmuzhiyunSYM_FUNC_START(__serpent_enc_blk_4way)
501*4882a593Smuzhiyun	/* input:
502*4882a593Smuzhiyun	 *	arg_ctx(%esp): ctx, CTX
503*4882a593Smuzhiyun	 *	arg_dst(%esp): dst
504*4882a593Smuzhiyun	 *	arg_src(%esp): src
505*4882a593Smuzhiyun	 *	arg_xor(%esp): bool, if true: xor output
506*4882a593Smuzhiyun	 */
507*4882a593Smuzhiyun
	/* RNOT = all ones; the S-box macros use it to complement via pxor */
508*4882a593Smuzhiyun	pcmpeqd RNOT, RNOT;
509*4882a593Smuzhiyun
510*4882a593Smuzhiyun	movl arg_ctx(%esp), CTX;
511*4882a593Smuzhiyun
	/* load the four input blocks, transposed to one word index per register */
512*4882a593Smuzhiyun	movl arg_src(%esp), %eax;
513*4882a593Smuzhiyun	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
514*4882a593Smuzhiyun
	/* initial key mixing, then 32 S-box steps; the trailing number is the key group */
515*4882a593Smuzhiyun					 K(RA, RB, RC, RD, RE, 0);
516*4882a593Smuzhiyun	S0(RA, RB, RC, RD, RE);		LK(RC, RB, RD, RA, RE, 1);
517*4882a593Smuzhiyun	S1(RC, RB, RD, RA, RE);		LK(RE, RD, RA, RC, RB, 2);
518*4882a593Smuzhiyun	S2(RE, RD, RA, RC, RB);		LK(RB, RD, RE, RC, RA, 3);
519*4882a593Smuzhiyun	S3(RB, RD, RE, RC, RA);		LK(RC, RA, RD, RB, RE, 4);
520*4882a593Smuzhiyun	S4(RC, RA, RD, RB, RE);		LK(RA, RD, RB, RE, RC, 5);
521*4882a593Smuzhiyun	S5(RA, RD, RB, RE, RC);		LK(RC, RA, RD, RE, RB, 6);
522*4882a593Smuzhiyun	S6(RC, RA, RD, RE, RB);		LK(RD, RB, RA, RE, RC, 7);
523*4882a593Smuzhiyun	S7(RD, RB, RA, RE, RC);		LK(RC, RA, RE, RD, RB, 8);
524*4882a593Smuzhiyun	S0(RC, RA, RE, RD, RB);		LK(RE, RA, RD, RC, RB, 9);
525*4882a593Smuzhiyun	S1(RE, RA, RD, RC, RB);		LK(RB, RD, RC, RE, RA, 10);
526*4882a593Smuzhiyun	S2(RB, RD, RC, RE, RA);		LK(RA, RD, RB, RE, RC, 11);
527*4882a593Smuzhiyun	S3(RA, RD, RB, RE, RC);		LK(RE, RC, RD, RA, RB, 12);
528*4882a593Smuzhiyun	S4(RE, RC, RD, RA, RB);		LK(RC, RD, RA, RB, RE, 13);
529*4882a593Smuzhiyun	S5(RC, RD, RA, RB, RE);		LK(RE, RC, RD, RB, RA, 14);
530*4882a593Smuzhiyun	S6(RE, RC, RD, RB, RA);		LK(RD, RA, RC, RB, RE, 15);
531*4882a593Smuzhiyun	S7(RD, RA, RC, RB, RE);		LK(RE, RC, RB, RD, RA, 16);
532*4882a593Smuzhiyun	S0(RE, RC, RB, RD, RA);		LK(RB, RC, RD, RE, RA, 17);
533*4882a593Smuzhiyun	S1(RB, RC, RD, RE, RA);		LK(RA, RD, RE, RB, RC, 18);
534*4882a593Smuzhiyun	S2(RA, RD, RE, RB, RC);		LK(RC, RD, RA, RB, RE, 19);
535*4882a593Smuzhiyun	S3(RC, RD, RA, RB, RE);		LK(RB, RE, RD, RC, RA, 20);
536*4882a593Smuzhiyun	S4(RB, RE, RD, RC, RA);		LK(RE, RD, RC, RA, RB, 21);
537*4882a593Smuzhiyun	S5(RE, RD, RC, RA, RB);		LK(RB, RE, RD, RA, RC, 22);
538*4882a593Smuzhiyun	S6(RB, RE, RD, RA, RC);		LK(RD, RC, RE, RA, RB, 23);
539*4882a593Smuzhiyun	S7(RD, RC, RE, RA, RB);		LK(RB, RE, RA, RD, RC, 24);
540*4882a593Smuzhiyun	S0(RB, RE, RA, RD, RC);		LK(RA, RE, RD, RB, RC, 25);
541*4882a593Smuzhiyun	S1(RA, RE, RD, RB, RC);		LK(RC, RD, RB, RA, RE, 26);
542*4882a593Smuzhiyun	S2(RC, RD, RB, RA, RE);		LK(RE, RD, RC, RA, RB, 27);
543*4882a593Smuzhiyun	S3(RE, RD, RC, RA, RB);		LK(RA, RB, RD, RE, RC, 28);
544*4882a593Smuzhiyun	S4(RA, RB, RD, RE, RC);		LK(RB, RD, RE, RC, RA, 29);
545*4882a593Smuzhiyun	S5(RB, RD, RE, RC, RA);		LK(RA, RB, RD, RC, RE, 30);
546*4882a593Smuzhiyun	S6(RA, RB, RD, RC, RE);		LK(RD, RE, RB, RC, RA, 31);
547*4882a593Smuzhiyun	S7(RD, RE, RB, RC, RA);		 K(RA, RB, RC, RD, RE, 32);
548*4882a593Smuzhiyun
	/* final state is in RA, RB, RC, RD; RE is free to serve as transpose scratch */
549*4882a593Smuzhiyun	movl arg_dst(%esp), %eax;
550*4882a593Smuzhiyun
	/* only the low byte of the xor argument is tested */
551*4882a593Smuzhiyun	cmpb $0, arg_xor(%esp);
552*4882a593Smuzhiyun	jnz .L__enc_xor4;
553*4882a593Smuzhiyun
554*4882a593Smuzhiyun	write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
555*4882a593Smuzhiyun
556*4882a593Smuzhiyun	RET;
557*4882a593Smuzhiyun
558*4882a593Smuzhiyun.L__enc_xor4:
	/* xor path: dst ^= result, block by block */
559*4882a593Smuzhiyun	xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
560*4882a593Smuzhiyun
561*4882a593Smuzhiyun	RET;
562*4882a593SmuzhiyunSYM_FUNC_END(__serpent_enc_blk_4way)
563*4882a593Smuzhiyun
/*
 * serpent_dec_blk_4way: decrypt four 16-byte blocks in parallel.
 *
 * Loads 64 bytes from src, mixes in key group 32, then runs 32 inverse
 * S-box steps cycling through SI7..SI0.  The first 31 are each followed by
 * KL (key mixing with groups 31..1 plus the inverse linear transformation);
 * the last SI0 is followed by a plain K with key group 0.  Every macro call
 * names its registers in the order the previous macro left its results.
 *
 * The result is always stored to dst; unlike the encryption routine there
 * is no xor argument.
 *
 * Writes %eax, %edx (CTX) and %xmm0-%xmm7; nothing is saved or restored
 * here.
 * NOTE(review): RET is a macro defined outside this file (presumably via
 * <linux/linkage.h>) -- confirm.
 */
564*4882a593SmuzhiyunSYM_FUNC_START(serpent_dec_blk_4way)
565*4882a593Smuzhiyun	/* input:
566*4882a593Smuzhiyun	 *	arg_ctx(%esp): ctx, CTX
567*4882a593Smuzhiyun	 *	arg_dst(%esp): dst
568*4882a593Smuzhiyun	 *	arg_src(%esp): src
569*4882a593Smuzhiyun	 */
570*4882a593Smuzhiyun
	/* RNOT = all ones; the S-box macros use it to complement via pxor */
571*4882a593Smuzhiyun	pcmpeqd RNOT, RNOT;
572*4882a593Smuzhiyun
573*4882a593Smuzhiyun	movl arg_ctx(%esp), CTX;
574*4882a593Smuzhiyun
	/* load the four input blocks, transposed to one word index per register */
575*4882a593Smuzhiyun	movl arg_src(%esp), %eax;
576*4882a593Smuzhiyun	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
577*4882a593Smuzhiyun
	/* key groups are consumed in descending order, 32 down to 0 */
578*4882a593Smuzhiyun					 K(RA, RB, RC, RD, RE, 32);
579*4882a593Smuzhiyun	SI7(RA, RB, RC, RD, RE);	KL(RB, RD, RA, RE, RC, 31);
580*4882a593Smuzhiyun	SI6(RB, RD, RA, RE, RC);	KL(RA, RC, RE, RB, RD, 30);
581*4882a593Smuzhiyun	SI5(RA, RC, RE, RB, RD);	KL(RC, RD, RA, RE, RB, 29);
582*4882a593Smuzhiyun	SI4(RC, RD, RA, RE, RB);	KL(RC, RA, RB, RE, RD, 28);
583*4882a593Smuzhiyun	SI3(RC, RA, RB, RE, RD);	KL(RB, RC, RD, RE, RA, 27);
584*4882a593Smuzhiyun	SI2(RB, RC, RD, RE, RA);	KL(RC, RA, RE, RD, RB, 26);
585*4882a593Smuzhiyun	SI1(RC, RA, RE, RD, RB);	KL(RB, RA, RE, RD, RC, 25);
586*4882a593Smuzhiyun	SI0(RB, RA, RE, RD, RC);	KL(RE, RC, RA, RB, RD, 24);
587*4882a593Smuzhiyun	SI7(RE, RC, RA, RB, RD);	KL(RC, RB, RE, RD, RA, 23);
588*4882a593Smuzhiyun	SI6(RC, RB, RE, RD, RA);	KL(RE, RA, RD, RC, RB, 22);
589*4882a593Smuzhiyun	SI5(RE, RA, RD, RC, RB);	KL(RA, RB, RE, RD, RC, 21);
590*4882a593Smuzhiyun	SI4(RA, RB, RE, RD, RC);	KL(RA, RE, RC, RD, RB, 20);
591*4882a593Smuzhiyun	SI3(RA, RE, RC, RD, RB);	KL(RC, RA, RB, RD, RE, 19);
592*4882a593Smuzhiyun	SI2(RC, RA, RB, RD, RE);	KL(RA, RE, RD, RB, RC, 18);
593*4882a593Smuzhiyun	SI1(RA, RE, RD, RB, RC);	KL(RC, RE, RD, RB, RA, 17);
594*4882a593Smuzhiyun	SI0(RC, RE, RD, RB, RA);	KL(RD, RA, RE, RC, RB, 16);
595*4882a593Smuzhiyun	SI7(RD, RA, RE, RC, RB);	KL(RA, RC, RD, RB, RE, 15);
596*4882a593Smuzhiyun	SI6(RA, RC, RD, RB, RE);	KL(RD, RE, RB, RA, RC, 14);
597*4882a593Smuzhiyun	SI5(RD, RE, RB, RA, RC);	KL(RE, RC, RD, RB, RA, 13);
598*4882a593Smuzhiyun	SI4(RE, RC, RD, RB, RA);	KL(RE, RD, RA, RB, RC, 12);
599*4882a593Smuzhiyun	SI3(RE, RD, RA, RB, RC);	KL(RA, RE, RC, RB, RD, 11);
600*4882a593Smuzhiyun	SI2(RA, RE, RC, RB, RD);	KL(RE, RD, RB, RC, RA, 10);
601*4882a593Smuzhiyun	SI1(RE, RD, RB, RC, RA);	KL(RA, RD, RB, RC, RE, 9);
602*4882a593Smuzhiyun	SI0(RA, RD, RB, RC, RE);	KL(RB, RE, RD, RA, RC, 8);
603*4882a593Smuzhiyun	SI7(RB, RE, RD, RA, RC);	KL(RE, RA, RB, RC, RD, 7);
604*4882a593Smuzhiyun	SI6(RE, RA, RB, RC, RD);	KL(RB, RD, RC, RE, RA, 6);
605*4882a593Smuzhiyun	SI5(RB, RD, RC, RE, RA);	KL(RD, RA, RB, RC, RE, 5);
606*4882a593Smuzhiyun	SI4(RD, RA, RB, RC, RE);	KL(RD, RB, RE, RC, RA, 4);
607*4882a593Smuzhiyun	SI3(RD, RB, RE, RC, RA);	KL(RE, RD, RA, RC, RB, 3);
608*4882a593Smuzhiyun	SI2(RE, RD, RA, RC, RB);	KL(RD, RB, RC, RA, RE, 2);
609*4882a593Smuzhiyun	SI1(RD, RB, RC, RA, RE);	KL(RE, RB, RC, RA, RD, 1);
610*4882a593Smuzhiyun	SI0(RE, RB, RC, RA, RD);	 K(RC, RD, RB, RE, RA, 0);
611*4882a593Smuzhiyun
	/* final state is in RC, RD, RB, RE (in that order); RA is transpose scratch */
612*4882a593Smuzhiyun	movl arg_dst(%esp), %eax;
613*4882a593Smuzhiyun	write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
614*4882a593Smuzhiyun
615*4882a593Smuzhiyun	RET;
616*4882a593SmuzhiyunSYM_FUNC_END(serpent_dec_blk_4way)
617