/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86_64/AVX2/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
#define RIO %r8

/**********************************************************************
  helper macros
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

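/*
 * Editorial sketch of filter_8bit in C-like pseudocode (not from the
 * original source): an 8-bit -> 8-bit table lookup built from two
 * 16-entry vpshufb lookups by splitting each byte into nibbles:
 *
 *   tmp0 = lo_t[x & 0x0f];         // low-nibble lookup
 *   x    = hi_t[(x >> 4) & 0x0f];  // high-nibble lookup
 *   x   ^= tmp0;                   // combine the halves
 */
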
#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

/**********************************************************************
  32-way camellia
 **********************************************************************/

/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: index for key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vbroadcasti128 .Linv_shift_row, t4; \
	vpbroadcastd .L0f0f0f0f, t7; \
	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	/* prefilter sbox 4 */ \
	filter_8bit(x0, t5, t6, t7, t4); \
	filter_8bit(x7, t5, t6, t7, t4); \
	vextracti128 $1, x0, t0##_x; \
	vextracti128 $1, x7, t1##_x; \
	filter_8bit(x3, t2, t3, t7, t4); \
	filter_8bit(x6, t2, t3, t7, t4); \
	vextracti128 $1, x3, t3##_x; \
	vextracti128 $1, x6, t2##_x; \
	filter_8bit(x2, t5, t6, t7, t4); \
	filter_8bit(x5, t5, t6, t7, t4); \
	filter_8bit(x1, t5, t6, t7, t4); \
	filter_8bit(x4, t5, t6, t7, t4); \
	\
	vpxor t4##_x, t4##_x, t4##_x; \
	\
	/* AES subbytes + AES shift rows */ \
	vextracti128 $1, x2, t6##_x; \
	vextracti128 $1, x5, t5##_x; \
	vaesenclast t4##_x, x0##_x, x0##_x; \
	vaesenclast t4##_x, t0##_x, t0##_x; \
	vinserti128 $1, t0##_x, x0, x0; \
	vaesenclast t4##_x, x7##_x, x7##_x; \
	vaesenclast t4##_x, t1##_x, t1##_x; \
	vinserti128 $1, t1##_x, x7, x7; \
	vaesenclast t4##_x, x3##_x, x3##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x3, x3; \
	vaesenclast t4##_x, x6##_x, x6##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x6, x6; \
	vextracti128 $1, x1, t3##_x; \
	vextracti128 $1, x4, t2##_x; \
	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
	vaesenclast t4##_x, x2##_x, x2##_x; \
	vaesenclast t4##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	vaesenclast t4##_x, x5##_x, x5##_x; \
	vaesenclast t4##_x, t5##_x, t5##_x; \
	vinserti128 $1, t5##_x, x5, x5; \
	vaesenclast t4##_x, x1##_x, x1##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x1, x1; \
	vaesenclast t4##_x, x4##_x, x4##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x4, x4; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
	vbroadcasti128 .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vbroadcasti128 .Lpost_tf_lo_s2, t4; \
	vbroadcasti128 .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	vpxor t7, t7, t7; \
	\
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpshufb t7, t1, t1; \
	vpsrldq $3, t0, t3; \
	\
	/* P-function */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpshufb t7, t2, t2; \
	vpsrldq $4, t0, t4; \
	vpshufb t7, t3, t3; \
	vpsrldq $5, t0, t5; \
	vpshufb t7, t4, t4; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpsrldq $6, t0, t6; \
	vpshufb t7, t5, t5; \
	vpshufb t7, t6, t6; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* Add key material and result to CD (x becomes new CD) */ \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 32(mem_cd), x1, x1; \
	\
	vpsrldq $7, t0, t6; \
	vpshufb t7, t0, t0; \
	vpshufb t7, t6, t7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 32(mem_cd), x0, x0; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 32(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 32(mem_cd), x3, x3; \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 32(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 32(mem_cd), x5, x5; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 32(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 32(mem_cd), x7, x7;
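
/*
 * Editorial note: the vpsrldq/vpshufb pairs interleaved with the P-function
 * above broadcast the individual bytes of the 64-bit subkey (loaded with
 * vpbroadcastq) into full vectors for the byte-sliced key XOR; vpshufb with
 * the zeroed t7 as index replicates byte 0 of its source into every lane,
 * and vpsrldq $n first moves key byte n into position 0.
 */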

/*
 * Size optimization... with inlined roundsm32 the binary would be over 5
 * times larger and only marginally faster.
 */
.align 8
SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		  %rcx, (%r9));
	RET;
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
		  %rax, (%r9));
	RET;
SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
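
/*
 * Editorial note: the two helpers above use an internal calling convention
 * rather than the C ABI: the round-key pointer is passed in %r9, the
 * in-memory half of the state in %rcx or %rax, and all sixteen %ymm
 * registers carry live state across the call.
 */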

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x0, 4 * 32(mem_cd); \
	vmovdqu x1, 5 * 32(mem_cd); \
	vmovdqu x2, 6 * 32(mem_cd); \
	vmovdqu x3, 7 * 32(mem_cd); \
	vmovdqu x4, 0 * 32(mem_cd); \
	vmovdqu x5, 1 * 32(mem_cd); \
	vmovdqu x6, 2 * 32(mem_cd); \
	vmovdqu x7, 3 * 32(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab);

#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
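
/*
 * Editorial note: enc_rounds32/dec_rounds32 expand to three double-rounds
 * each; encryption walks the key table forward (dir = 1) while decryption
 * walks it backward (dir = -1), which is why the decryption indices count
 * down ((i) + 7, (i) + 5, (i) + 3).
 */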

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;
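
/*
 * Editorial sketch of the carry trick above, per byte slice (C-like
 * pseudocode, not from the original source):
 *
 *   t  = (v < 0) ? 0xff : 0x00;  // vpcmpgtb vs. zero: sign-bit mask
 *   v  = v + v;                  // vpaddb: shift every byte left by one
 *   t  = abs(t);                 // vpabsb: 0xff -> 0x01 carry bit
 *   v_next |= t;                 // vpor: carry into the adjacent slice
 */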

/*
 * IN:
 *   r: byte-sliced AB state in memory
 *   l: byte-sliced CD state in memory
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
	vmovdqu l4, 4 * 32(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 32(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 32(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 32(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 32(r), t0, t0; \
	vpor 5 * 32(r), t1, t1; \
	vpor 6 * 32(r), t2, t2; \
	vpor 7 * 32(r), t3, t3; \
	\
	vpxor 0 * 32(r), t0, t0; \
	vpxor 1 * 32(r), t1, t1; \
	vpxor 2 * 32(r), t2, t2; \
	vpxor 3 * 32(r), t3, t3; \
	vmovdqu t0, 0 * 32(r); \
	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 1 * 32(r); \
	vmovdqu t2, 2 * 32(r); \
	vmovdqu t3, 3 * 32(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 32(r), t0, t0; \
	vpand 1 * 32(r), t1, t1; \
	vpand 2 * 32(r), t2, t2; \
	vpand 3 * 32(r), t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 32(r), t0, t0; \
	vpxor 5 * 32(r), t1, t1; \
	vpxor 6 * 32(r), t2, t2; \
	vpxor 7 * 32(r), t3, t3; \
	vmovdqu t0, 4 * 32(r); \
	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 5 * 32(r); \
	vmovdqu t2, 6 * 32(r); \
	vmovdqu t3, 7 * 32(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 32(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 32(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 32(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 32(l);
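
/*
 * Editorial note: the vpsrldq $1 / vpshufb ladders above broadcast each
 * byte of a 32-bit subkey word into a full vector: vpshufb with the
 * all-zero tt0 as index replicates byte 0 of its source into every lane,
 * and each vpsrldq $1 steps the next key byte into position 0, giving the
 * byte-sliced form of the FL/FL^-1 subkeys.
 */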

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
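
/*
 * Editorial note: transpose_4x4 is a standard 4x4 transpose of 32-bit
 * lanes: vpunpckl/hdq interleave the dwords of the two row pairs, then
 * vpunpckl/hqdq interleave the resulting qwords, leaving x0..x3 holding
 * the columns of the original rows.
 */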

#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
			      a3, b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti128 .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */
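
/*
 * Editorial note: byteslice_16x16b_fast is a 16x16 byte-matrix transpose
 * built from 4x4 dword transposes plus the .Lshufb_16x16b byte shuffle;
 * afterwards each register holds one byte position of every block.  st0
 * and st1 are memory spill slots because all sixteen %ymm registers are
 * live during the transpose.
 */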

/* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 32(rio), x0, y7; \
	vpxor 1 * 32(rio), x0, y6; \
	vpxor 2 * 32(rio), x0, y5; \
	vpxor 3 * 32(rio), x0, y4; \
	vpxor 4 * 32(rio), x0, y3; \
	vpxor 5 * 32(rio), x0, y2; \
	vpxor 6 * 32(rio), x0, y1; \
	vpxor 7 * 32(rio), x0, y0; \
	vpxor 8 * 32(rio), x0, x7; \
	vpxor 9 * 32(rio), x0, x6; \
	vpxor 10 * 32(rio), x0, x5; \
	vpxor 11 * 32(rio), x0, x4; \
	vpxor 12 * 32(rio), x0, x3; \
	vpxor 13 * 32(rio), x0, x2; \
	vpxor 14 * 32(rio), x0, x1; \
	vpxor 15 * 32(rio), x0, x0;
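
/*
 * Editorial sketch of the pre-whitening above (C-like pseudocode, not from
 * the original source): the 64-bit whitening key is broadcast to every
 * lane, reordered into the block byte order via .Lpack_bswap, and XORed
 * into all 32 blocks:
 *
 *   kw = shuffle(broadcast(key), .Lpack_bswap);
 *   for (i = 0; i < 16; i++)
 *       reg[i] = load(rio + i * 32) ^ kw;
 */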

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab); \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu y0, 0 * 32(mem_cd); \
	vmovdqu y1, 1 * 32(mem_cd); \
	vmovdqu y2, 2 * 32(mem_cd); \
	vmovdqu y3, 3 * 32(mem_cd); \
	vmovdqu y4, 4 * 32(mem_cd); \
	vmovdqu y5, 5 * 32(mem_cd); \
	vmovdqu y6, 6 * 32(mem_cd); \
	vmovdqu y7, 7 * 32(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 32(rio); \
	vmovdqu x1, 1 * 32(rio); \
	vmovdqu x2, 2 * 32(rio); \
	vmovdqu x3, 3 * 32(rio); \
	vmovdqu x4, 4 * 32(rio); \
	vmovdqu x5, 5 * 32(rio); \
	vmovdqu x6, 6 * 32(rio); \
	vmovdqu x7, 7 * 32(rio); \
	vmovdqu y0, 8 * 32(rio); \
	vmovdqu y1, 9 * 32(rio); \
	vmovdqu y2, 10 * 32(rio); \
	vmovdqu y3, 11 * 32(rio); \
	vmovdqu y4, 12 * 32(rio); \
	vmovdqu y5, 13 * 32(rio); \
	vmovdqu y6, 14 * 32(rio); \
	vmovdqu y7, 15 * 32(rio);


.section	.rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section	.rodata.cst32.pack_bswap, "aM", @progbits, 32
.align 32
.Lpack_bswap:
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode */
.Lxts_gf128mul_and_shl1_mask_0:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
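
/*
 * Editorial note: AESENCLAST performs ShiftRows, SubBytes and a round-key
 * XOR; applying .Linv_shift_row beforehand cancels the ShiftRows step, so
 * with an all-zero round key the instruction yields pure AES SubBytes,
 * from which the Camellia s-boxes are derived via the pre/post filters.
 */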

.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%ymm0..%ymm15: 32 plaintext blocks
	 * output:
	 *	%ymm0..%ymm15: 32 encrypted blocks, order swapped:
	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));

	FRAME_END
	RET;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk32)

.align 8
SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%ymm0..%ymm15: 32 encrypted blocks
	 * output:
	 *	%ymm0..%ymm15: 32 plaintext blocks, order swapped:
	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));

	FRAME_END
	RET;

.align 8
.Ldec_max32:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
SYM_FUNC_END(__camellia_dec_blk32)

SYM_FUNC_START(camellia_ecb_enc_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq	%rsi, %rax;

	call __camellia_enc_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_enc_32way)

SYM_FUNC_START(camellia_ecb_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq	%rsi, %rax;

	call __camellia_dec_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_dec_32way)

SYM_FUNC_START(camellia_cbc_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	movq %rsp, %r10;
	cmpq %rsi, %rdx;
	je .Lcbc_dec_use_stack;

	/* dst can be used as temporary storage, src is not overwritten. */
	movq %rsi, %rax;
	jmp .Lcbc_dec_continue;

.Lcbc_dec_use_stack:
	/*
	 * dst still in-use (because dst == src), so use stack for temporary
	 * storage.
	 */
	subq $(16 * 32), %rsp;
	movq %rsp, %rax;

.Lcbc_dec_continue:
	call __camellia_dec_blk32;

	vmovdqu %ymm7, (%rax);
	vpxor %ymm7, %ymm7, %ymm7;
	vinserti128 $1, (%rdx), %ymm7, %ymm7;
	vpxor (%rax), %ymm7, %ymm7;
	movq %r10, %rsp;
	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
	vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
	vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
	vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
	vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
	vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
	vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
	vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
	vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
	vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
	vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
	vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
	vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_cbc_dec_32way)

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
	vpcmpeqq minus_one, x, tmp1; \
	vpcmpeqq minus_two, x, tmp2; \
	vpsubq minus_two, x, x; \
	vpor tmp2, tmp1, tmp1; \
	vpslldq $8, tmp1, tmp1; \
	vpsubq tmp1, x, x;
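
/*
 * Editorial sketch of the LE128 counter macros above (C-like pseudocode,
 * not from the original source): a 128-bit little-endian add with manual
 * carry, since AVX2 has no 128-bit integer add.  minus_one holds {-1, 0}
 * per 128-bit lane, so vpsubq adds 1 (or 2) to the low qword, and the
 * vpcmpeqq mask, shifted into the high qword by vpslldq $8, supplies the
 * carry when the low qword wraps:
 *
 *   carry = (x.lo == ~0ULL);       // for add2: also x.lo == ~0ULL - 1
 *   x.lo += n;
 *   x.hi += carry;
 */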
1078*4882a593Smuzhiyun
1079*4882a593SmuzhiyunSYM_FUNC_START(camellia_ctr_32way)
1080*4882a593Smuzhiyun	/* input:
1081*4882a593Smuzhiyun	 *	%rdi: ctx, CTX
1082*4882a593Smuzhiyun	 *	%rsi: dst (32 blocks)
1083*4882a593Smuzhiyun	 *	%rdx: src (32 blocks)
1084*4882a593Smuzhiyun	 *	%rcx: iv (little endian, 128bit)
1085*4882a593Smuzhiyun	 */
1086*4882a593Smuzhiyun	FRAME_BEGIN
1087*4882a593Smuzhiyun
1088*4882a593Smuzhiyun	vzeroupper;
1089*4882a593Smuzhiyun
1090*4882a593Smuzhiyun	movq %rsp, %r10;
1091*4882a593Smuzhiyun	cmpq %rsi, %rdx;
1092*4882a593Smuzhiyun	je .Lctr_use_stack;
1093*4882a593Smuzhiyun
1094*4882a593Smuzhiyun	/* dst can be used as temporary storage, src is not overwritten. */
1095*4882a593Smuzhiyun	movq %rsi, %rax;
1096*4882a593Smuzhiyun	jmp .Lctr_continue;
1097*4882a593Smuzhiyun
1098*4882a593Smuzhiyun.Lctr_use_stack:
1099*4882a593Smuzhiyun	subq $(16 * 32), %rsp;
1100*4882a593Smuzhiyun	movq %rsp, %rax;
1101*4882a593Smuzhiyun
1102*4882a593Smuzhiyun.Lctr_continue:
1103*4882a593Smuzhiyun	vpcmpeqd %ymm15, %ymm15, %ymm15;
1104*4882a593Smuzhiyun	vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
1105*4882a593Smuzhiyun	vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
1106*4882a593Smuzhiyun
1107*4882a593Smuzhiyun	/* load IV and byteswap */
1108*4882a593Smuzhiyun	vmovdqu (%rcx), %xmm0;
1109*4882a593Smuzhiyun	vmovdqa %xmm0, %xmm1;
1110*4882a593Smuzhiyun	inc_le128(%xmm0, %xmm15, %xmm14);
1111*4882a593Smuzhiyun	vbroadcasti128 .Lbswap128_mask, %ymm14;
1112*4882a593Smuzhiyun	vinserti128 $1, %xmm0, %ymm1, %ymm0;
1113*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm13;
1114*4882a593Smuzhiyun	vmovdqu %ymm13, 15 * 32(%rax);
1115*4882a593Smuzhiyun
1116*4882a593Smuzhiyun	/* construct IVs */
1117*4882a593Smuzhiyun	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
1118*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm13;
1119*4882a593Smuzhiyun	vmovdqu %ymm13, 14 * 32(%rax);
1120*4882a593Smuzhiyun	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1121*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm13;
1122*4882a593Smuzhiyun	vmovdqu %ymm13, 13 * 32(%rax);
1123*4882a593Smuzhiyun	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1124*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm13;
1125*4882a593Smuzhiyun	vmovdqu %ymm13, 12 * 32(%rax);
1126*4882a593Smuzhiyun	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1127*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm13;
1128*4882a593Smuzhiyun	vmovdqu %ymm13, 11 * 32(%rax);
1129*4882a593Smuzhiyun	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1130*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm10;
1131*4882a593Smuzhiyun	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1132*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm9;
1133*4882a593Smuzhiyun	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1134*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm8;
1135*4882a593Smuzhiyun	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1136*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm7;
1137*4882a593Smuzhiyun	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1138*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm6;
1139*4882a593Smuzhiyun	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1140*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm5;
1141*4882a593Smuzhiyun	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1142*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm4;
1143*4882a593Smuzhiyun	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1144*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm3;
1145*4882a593Smuzhiyun	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1146*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm2;
1147*4882a593Smuzhiyun	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1148*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm1;
1149*4882a593Smuzhiyun	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
1150*4882a593Smuzhiyun	vextracti128 $1, %ymm0, %xmm13;
1151*4882a593Smuzhiyun	vpshufb %ymm14, %ymm0, %ymm0;
1152*4882a593Smuzhiyun	inc_le128(%xmm13, %xmm15, %xmm14);
1153*4882a593Smuzhiyun	vmovdqu %xmm13, (%rcx);
1154*4882a593Smuzhiyun
	/* inpack32_pre: */
	vpbroadcastq (key_table)(CTX), %ymm15;
	vpshufb .Lpack_bswap, %ymm15, %ymm15;
	vpxor %ymm0, %ymm15, %ymm0;
	vpxor %ymm1, %ymm15, %ymm1;
	vpxor %ymm2, %ymm15, %ymm2;
	vpxor %ymm3, %ymm15, %ymm3;
	vpxor %ymm4, %ymm15, %ymm4;
	vpxor %ymm5, %ymm15, %ymm5;
	vpxor %ymm6, %ymm15, %ymm6;
	vpxor %ymm7, %ymm15, %ymm7;
	vpxor %ymm8, %ymm15, %ymm8;
	vpxor %ymm9, %ymm15, %ymm9;
	vpxor %ymm10, %ymm15, %ymm10;
	vpxor 11 * 32(%rax), %ymm15, %ymm11;
	vpxor 12 * 32(%rax), %ymm15, %ymm12;
	vpxor 13 * 32(%rax), %ymm15, %ymm13;
	vpxor 14 * 32(%rax), %ymm15, %ymm14;
	vpxor 15 * 32(%rax), %ymm15, %ymm15;

	call __camellia_enc_blk32;

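	/* restore %rsp from %r10, where it was saved before the stack scratch area was set up */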
	movq %r10, %rsp;

	vpxor 0 * 32(%rdx), %ymm7, %ymm7;
	vpxor 1 * 32(%rdx), %ymm6, %ymm6;
	vpxor 2 * 32(%rdx), %ymm5, %ymm5;
	vpxor 3 * 32(%rdx), %ymm4, %ymm4;
	vpxor 4 * 32(%rdx), %ymm3, %ymm3;
	vpxor 5 * 32(%rdx), %ymm2, %ymm2;
	vpxor 6 * 32(%rdx), %ymm1, %ymm1;
	vpxor 7 * 32(%rdx), %ymm0, %ymm0;
	vpxor 8 * 32(%rdx), %ymm15, %ymm15;
	vpxor 9 * 32(%rdx), %ymm14, %ymm14;
	vpxor 10 * 32(%rdx), %ymm13, %ymm13;
	vpxor 11 * 32(%rdx), %ymm12, %ymm12;
	vpxor 12 * 32(%rdx), %ymm11, %ymm11;
	vpxor 13 * 32(%rdx), %ymm10, %ymm10;
	vpxor 14 * 32(%rdx), %ymm9, %ymm9;
	vpxor 15 * 32(%rdx), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ctr_32way)

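/*
 * XTS tweak stepping: multiply the 128-bit tweak by the primitive element
 * α of GF(2¹²⁸) with reduction polynomial x¹²⁸ + x⁷ + x² + x + 1, on
 * values kept in little-endian ("ble") layout.  A rough scalar C
 * equivalent of gf128mul_x_ble (illustration only; the function name and
 * the t[0]=low/t[1]=high convention are ours):
 *
 *	void gf128mul_x_ble_ref(unsigned long long t[2])
 *	{
 *		unsigned long long carry = t[1] >> 63;	// bit 127
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
 *	}
 *
 * The SIMD macros below compute the same thing branchlessly: vpaddq
 * doubles both 64-bit halves, while vpsrad/vpshufd/vpand build a mask
 * that re-injects the cross-qword carry and the 0x87 reduction term.
 */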
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
	vpsrad $31, iv, tmp0; \
	vpaddq iv, iv, tmp1; \
	vpsllq $2, iv, iv; \
	vpshufd $0x13, tmp0, tmp0; \
	vpsrad $31, tmp1, tmp1; \
	vpand mask2, tmp0, tmp0; \
	vpshufd $0x13, tmp1, tmp1; \
	vpxor tmp0, iv, iv; \
	vpand mask1, tmp1, tmp1; \
	vpxor tmp1, iv, iv;
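
/*
 * gf128mul_x2_ble is the ×α² variant: each ymm register carries two
 * consecutive tweaks, so the XTS loop below advances by α² per step.
 * vpsllq $2 doubles both halves twice, and the two mask/pshufd chains
 * fold back the reduction and cross-qword carry terms for both
 * shifted-out bits.
 */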

.align 8
SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 *	%r8: index for input whitening key
	 *	%r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
	 */
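
	/*
	 * XTS flow, roughly C = E(P ⊕ T) ⊕ T with the tweak T stepped by
	 * α per 16-byte block: the construction phase below stores
	 * P_i ⊕ T_i for the cipher core and parks the tweaks T_i in dst;
	 * after the 32-way core runs, the results are XORed with the
	 * tweaks read back from dst to form the final output.
	 */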
	FRAME_BEGIN

	vzeroupper;

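	/* carve out 16 * 32 bytes of stack scratch for the byte-sliced state */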
	subq $(16 * 32), %rsp;
	movq %rsp, %rax;

	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;

	/* load IV and construct second IV */
	vmovdqu (%rcx), %xmm0;
	vmovdqa %xmm0, %xmm15;
	gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
	vinserti128 $1, %xmm0, %ymm15, %ymm0;
	vpxor 0 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 15 * 32(%rax);
	vmovdqu %ymm0, 0 * 32(%rsi);

	/* construct IVs */
	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 1 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 14 * 32(%rax);
	vmovdqu %ymm0, 1 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 2 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 13 * 32(%rax);
	vmovdqu %ymm0, 2 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 3 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 12 * 32(%rax);
	vmovdqu %ymm0, 3 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 4 * 32(%rdx), %ymm0, %ymm11;
	vmovdqu %ymm0, 4 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 5 * 32(%rdx), %ymm0, %ymm10;
	vmovdqu %ymm0, 5 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 6 * 32(%rdx), %ymm0, %ymm9;
	vmovdqu %ymm0, 6 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 7 * 32(%rdx), %ymm0, %ymm8;
	vmovdqu %ymm0, 7 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 8 * 32(%rdx), %ymm0, %ymm7;
	vmovdqu %ymm0, 8 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 9 * 32(%rdx), %ymm0, %ymm6;
	vmovdqu %ymm0, 9 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 10 * 32(%rdx), %ymm0, %ymm5;
	vmovdqu %ymm0, 10 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 11 * 32(%rdx), %ymm0, %ymm4;
	vmovdqu %ymm0, 11 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 12 * 32(%rdx), %ymm0, %ymm3;
	vmovdqu %ymm0, 12 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 13 * 32(%rdx), %ymm0, %ymm2;
	vmovdqu %ymm0, 13 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 14 * 32(%rdx), %ymm0, %ymm1;
	vmovdqu %ymm0, 14 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 15 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 0 * 32(%rax);
	vmovdqu %ymm0, 15 * 32(%rsi);

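	/* advance the tweak past all 32 blocks and store it back for the next call */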
	vextracti128 $1, %ymm0, %xmm0;
	gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
	vmovdqu %xmm0, (%rcx);

	/* inpack32_pre: */
	vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
	vpshufb .Lpack_bswap, %ymm15, %ymm15;
	vpxor 0 * 32(%rax), %ymm15, %ymm0;
	vpxor %ymm1, %ymm15, %ymm1;
	vpxor %ymm2, %ymm15, %ymm2;
	vpxor %ymm3, %ymm15, %ymm3;
	vpxor %ymm4, %ymm15, %ymm4;
	vpxor %ymm5, %ymm15, %ymm5;
	vpxor %ymm6, %ymm15, %ymm6;
	vpxor %ymm7, %ymm15, %ymm7;
	vpxor %ymm8, %ymm15, %ymm8;
	vpxor %ymm9, %ymm15, %ymm9;
	vpxor %ymm10, %ymm15, %ymm10;
	vpxor %ymm11, %ymm15, %ymm11;
	vpxor 12 * 32(%rax), %ymm15, %ymm12;
	vpxor 13 * 32(%rax), %ymm15, %ymm13;
	vpxor 14 * 32(%rax), %ymm15, %ymm14;
	vpxor 15 * 32(%rax), %ymm15, %ymm15;

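	/* retpoline-safe indirect call into the enc/dec core selected in %r9 */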
	CALL_NOSPEC r9;

	addq $(16 * 32), %rsp;

	vpxor 0 * 32(%rsi), %ymm7, %ymm7;
	vpxor 1 * 32(%rsi), %ymm6, %ymm6;
	vpxor 2 * 32(%rsi), %ymm5, %ymm5;
	vpxor 3 * 32(%rsi), %ymm4, %ymm4;
	vpxor 4 * 32(%rsi), %ymm3, %ymm3;
	vpxor 5 * 32(%rsi), %ymm2, %ymm2;
	vpxor 6 * 32(%rsi), %ymm1, %ymm1;
	vpxor 7 * 32(%rsi), %ymm0, %ymm0;
	vpxor 8 * 32(%rsi), %ymm15, %ymm15;
	vpxor 9 * 32(%rsi), %ymm14, %ymm14;
	vpxor 10 * 32(%rsi), %ymm13, %ymm13;
	vpxor 11 * 32(%rsi), %ymm12, %ymm12;
	vpxor 12 * 32(%rsi), %ymm11, %ymm11;
	vpxor 13 * 32(%rsi), %ymm10, %ymm10;
	vpxor 14 * 32(%rsi), %ymm9, %ymm9;
	vpxor 15 * 32(%rsi), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_xts_crypt_32way)

SYM_FUNC_START(camellia_xts_enc_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	xorl %r8d, %r8d; /* input whitening key, 0 for enc */

	leaq __camellia_enc_blk32, %r9;

	jmp camellia_xts_crypt_32way;
SYM_FUNC_END(camellia_xts_enc_32way)

SYM_FUNC_START(camellia_xts_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d;  /* input whitening key, last for dec */
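	/*
	 * For decryption the whitening keys are applied in reverse, so the
	 * input whitening key is the last one in key_table: qword index 24
	 * for 16-byte keys, 32 otherwise, as selected by the cmp/cmove
	 * above.
	 */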

	leaq __camellia_dec_blk32, %r9;

	jmp camellia_xts_crypt_32way;
SYM_FUNC_END(camellia_xts_dec_32way)
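
/*
 * For reference, C prototypes the glue code would be expected to declare
 * for these entry points (a sketch following the usual convention for
 * this file family; the authoritative declarations live in the C glue
 * source, not here, and the exact ctx type may differ):
 *
 *	asmlinkage void camellia_xts_enc_32way(const void *ctx, u8 *dst,
 *					       const u8 *src, le128 *iv);
 *	asmlinkage void camellia_xts_dec_32way(const void *ctx, u8 *dst,
 *					       const u8 *src, le128 *iv);
 */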