/*
 * x86_64/AVX/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/*
 * Version licensed under 2-clause BSD License is available at:
 *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi

/**********************************************************************
  16-way camellia
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

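/*
 * filter_8bit applies an 8-bit lookup as two 4-bit vpshufb lookups:
 * in effect, x = lo_t[x & 0x0f] ^ hi_t[x >> 4]. This decomposition
 * works because the pre/post filter transforms used below are affine
 * over GF(2), so the low- and high-nibble contributions are
 * independent and can be combined with XOR.
 */
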
/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: index for key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vmovdqa .Linv_shift_row, t4; \
	vbroadcastss .L0f0f0f0f, t7; \
	vmovdqa .Lpre_tf_lo_s1, t0; \
	vmovdqa .Lpre_tf_hi_s1, t1; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	vmovdqa .Lpre_tf_lo_s4, t2; \
	vmovdqa .Lpre_tf_hi_s4, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x1, t0, t1, t7, t6); \
	filter_8bit(x4, t0, t1, t7, t6); \
	filter_8bit(x2, t0, t1, t7, t6); \
	filter_8bit(x5, t0, t1, t7, t6); \
	\
	/* prefilter sbox 4 */ \
	vpxor t4, t4, t4; \
	filter_8bit(x3, t2, t3, t7, t6); \
	filter_8bit(x6, t2, t3, t7, t6); \
	\
	/* AES subbytes + AES shift rows */ \
	vmovdqa .Lpost_tf_lo_s1, t0; \
	vmovdqa .Lpost_tf_hi_s1, t1; \
	vaesenclast t4, x0, x0; \
	vaesenclast t4, x7, x7; \
	vaesenclast t4, x1, x1; \
	vaesenclast t4, x4, x4; \
	vaesenclast t4, x2, x2; \
	vaesenclast t4, x5, x5; \
	vaesenclast t4, x3, x3; \
	vaesenclast t4, x6, x6; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vmovdqa .Lpost_tf_lo_s3, t2; \
	vmovdqa .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vmovdqa .Lpost_tf_lo_s2, t4; \
	vmovdqa .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpxor t6, t6, t6; \
	vmovq key, t0; \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	\
	vpsrldq $5, t0, t5; \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpsrldq $3, t0, t3; \
	vpsrldq $4, t0, t4; \
	vpshufb t6, t0, t0; \
	vpshufb t6, t1, t1; \
	vpshufb t6, t2, t2; \
	vpshufb t6, t3, t3; \
	vpshufb t6, t4, t4; \
	vpsrldq $2, t5, t7; \
	vpshufb t6, t7, t7; \
	\
	/* \
	 * P-function \
	 */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* \
	 * Add key material and result to CD (x becomes new CD) \
	 */ \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 16(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 16(mem_cd), x5, x5; \
	\
	vpsrldq $1, t5, t3; \
	vpshufb t6, t5, t5; \
	vpshufb t6, t3, t6; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 16(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 16(mem_cd), x7, x7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 16(mem_cd), x0, x0; \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 16(mem_cd), x1, x1; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 16(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 16(mem_cd), x3, x3;

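/*
 * Note: the key-handling tail of roundsm16 above extracts each byte of
 * the 64-bit subkey with vpsrldq and splats it across a register with
 * vpshufb using an all-zero control (t6), so every subkey byte can be
 * XORed into its byte slice of the state.
 */
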
/*
 * Size optimization... with inlined roundsm16, binary would be over 5 times
 * larger and would only be 0.5% faster (on Sandy Bridge).
 */
.align 8
SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		  %rcx, (%r9));
	RET;
SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
		  %rax, (%r9));
	RET;
SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x4, 0 * 16(mem_cd); \
	vmovdqu x5, 1 * 16(mem_cd); \
	vmovdqu x6, 2 * 16(mem_cd); \
	vmovdqu x7, 3 * 16(mem_cd); \
	vmovdqu x0, 4 * 16(mem_cd); \
	vmovdqu x1, 5 * 16(mem_cd); \
	vmovdqu x2, 6 * 16(mem_cd); \
	vmovdqu x3, 7 * 16(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab);

#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

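/*
 * enc_rounds16/dec_rounds16 run six Feistel rounds as three pairs of
 * two_roundsm16. Encryption walks the key table forwards (subkey
 * indices (i)+2 .. (i)+7, dir = 1); decryption walks it backwards
 * ((i)+7 down to (i)+2, dir = -1). The last pair uses dummy_store
 * since its AB state is not needed afterwards.
 */
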
/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;

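/*
 * rol32_1_16 rotates byte-sliced 32-bit words left by one: vpcmpgtb
 * against zero captures the top bit of every byte as 0xff, vpabsb
 * turns that into 0x01, vpaddb doubles each byte (a left shift that
 * drops the top bit), and the vpor chain feeds each captured bit into
 * bit 0 of the neighbouring slice (v0 -> v1 -> v2 -> v3 -> v0).
 */
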
/*
 * IN:
 *   l: byte-sliced AB state in memory
 *   r: byte-sliced CD state in memory
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpxor tt0, tt0, tt0; \
	vmovd kll, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vmovdqu l4, 4 * 16(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 16(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 16(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 16(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vmovd krr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 16(r), t0, t0; \
	vpor 5 * 16(r), t1, t1; \
	vpor 6 * 16(r), t2, t2; \
	vpor 7 * 16(r), t3, t3; \
	\
	vpxor 0 * 16(r), t0, t0; \
	vpxor 1 * 16(r), t1, t1; \
	vpxor 2 * 16(r), t2, t2; \
	vpxor 3 * 16(r), t3, t3; \
	vmovdqu t0, 0 * 16(r); \
	vmovdqu t1, 1 * 16(r); \
	vmovdqu t2, 2 * 16(r); \
	vmovdqu t3, 3 * 16(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vmovd krl, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 16(r), t0, t0; \
	vpand 1 * 16(r), t1, t1; \
	vpand 2 * 16(r), t2, t2; \
	vpand 3 * 16(r), t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 16(r), t0, t0; \
	vpxor 5 * 16(r), t1, t1; \
	vpxor 6 * 16(r), t2, t2; \
	vpxor 7 * 16(r), t3, t3; \
	vmovdqu t0, 4 * 16(r); \
	vmovdqu t1, 5 * 16(r); \
	vmovdqu t2, 6 * 16(r); \
	vmovdqu t3, 7 * 16(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vmovd klr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 16(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 16(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 16(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 16(l);

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

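/*
 * transpose_4x4 is a standard 4x4 transpose of 32-bit lanes: the dword
 * unpacks interleave the rows pairwise and the qword unpacks regroup
 * the halves, so lane j of output register i holds lane i of input
 * register j.
 */
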
#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
			 b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 16(rio), x0, y7; \
	vpxor 1 * 16(rio), x0, y6; \
	vpxor 2 * 16(rio), x0, y5; \
	vpxor 3 * 16(rio), x0, y4; \
	vpxor 4 * 16(rio), x0, y3; \
	vpxor 5 * 16(rio), x0, y2; \
	vpxor 6 * 16(rio), x0, y1; \
	vpxor 7 * 16(rio), x0, y0; \
	vpxor 8 * 16(rio), x0, x7; \
	vpxor 9 * 16(rio), x0, x6; \
	vpxor 10 * 16(rio), x0, x5; \
	vpxor 11 * 16(rio), x0, x4; \
	vpxor 12 * 16(rio), x0, x3; \
	vpxor 13 * 16(rio), x0, x2; \
	vpxor 14 * 16(rio), x0, x1; \
	vpxor 15 * 16(rio), x0, x0;

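/*
 * Note: inpack16_pre loads the blocks in reversed register order
 * (block 0 into y7, block 15 into x0); this matches the swapped block
 * order documented for __camellia_enc_blk16/__camellia_dec_blk16 and
 * is undone by the reversed register list passed to write_output.
 */
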
/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
			 y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
			 y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 16(rio); \
	vmovdqu x1, 1 * 16(rio); \
	vmovdqu x2, 2 * 16(rio); \
	vmovdqu x3, 3 * 16(rio); \
	vmovdqu x4, 4 * 16(rio); \
	vmovdqu x5, 5 * 16(rio); \
	vmovdqu x6, 6 * 16(rio); \
	vmovdqu x7, 7 * 16(rio); \
	vmovdqu y0, 8 * 16(rio); \
	vmovdqu y1, 9 * 16(rio); \
	vmovdqu y2, 10 * 16(rio); \
	vmovdqu y3, 11 * 16(rio); \
	vmovdqu y4, 12 * 16(rio); \
	vmovdqu y5, 13 * 16(rio); \
	vmovdqu y6, 14 * 16(rio); \
	vmovdqu y7, 15 * 16(rio);


/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);

.Lpack_bswap:
	.long 0x00010203
	.long 0x04050607
	.long 0x80808080
	.long 0x80808080

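/*
 * .Lpack_bswap is a vpshufb control mask: it byte-swaps each 32-bit
 * half of the 64-bit whitening key loaded into the low quadword, and
 * the 0x80 control bytes zero the upper quadword.
 */
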
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode IV generation */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

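/*
 * The zero-key vaesenclast trick: AESENCLAST performs ShiftRows,
 * SubBytes and a round-key XOR. With an all-zero round key (t4 is
 * cleared in roundsm16) only SubBytes+ShiftRows remains, and shuffling
 * the state with .Linv_shift_row beforehand cancels the ShiftRows,
 * leaving a pure per-byte SubBytes that the pre/post filters map
 * between the Camellia and AES s-box domains.
 */
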
/* 4-bit mask */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%xmm0..%xmm15: 16 plaintext blocks
	 * output:
	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

	FRAME_END
	RET;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk16)

.align 8
SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%xmm0..%xmm15: 16 encrypted blocks
	 * output:
	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

	FRAME_END
	RET;

.align 8
.Ldec_max32:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
SYM_FUNC_END(__camellia_dec_blk16)

SYM_FUNC_START(camellia_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq	%rsi, %rax;

	call __camellia_enc_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_enc_16way)

SYM_FUNC_START(camellia_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq	%rsi, %rax;

	call __camellia_dec_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_dec_16way)

SYM_FUNC_START(camellia_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/*
	 * dst might still be in-use (in case dst == src), so use stack for
	 * temporary storage.
	 */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	call __camellia_dec_blk16;

	addq $(16 * 16), %rsp;

	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_cbc_dec_16way)

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

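/*
 * inc_le128 increments a 128-bit little-endian counter without a
 * 128-bit adder: minus_one holds -1 in its low qword and 0 in its high
 * qword. vpcmpeqq flags the wrap case (low qword all-ones) before the
 * increment, vpsubq adds 1 to the low qword (x - (-1)), vpslldq moves
 * the wrap flag into the high qword, and the final vpsubq adds the
 * carry there.
 */
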
SYM_FUNC_START(camellia_ctr_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lbswap128_mask, %xmm14;

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vpshufb %xmm14, %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);

	vpcmpeqd %xmm15, %xmm15, %xmm15;
	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */

	/* construct IVs */
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 14 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 13 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm12;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm11;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm10;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm9;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm8;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm7;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm6;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm5;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm4;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm3;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm2;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vmovdqa %xmm0, %xmm13;
	vpshufb %xmm14, %xmm0, %xmm0;
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);

	/* inpack16_pre: */
	vmovq (key_table)(CTX), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor %xmm0, %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor 13 * 16(%rax), %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	call __camellia_enc_blk16;

	addq $(16 * 16), %rsp;

	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ctr_16way)

#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

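/*
 * gf128mul_x_ble multiplies the XTS tweak by x in GF(2^128): vpaddq
 * doubles each 64-bit half (dropping bits 63 and 127), vpsrad/vpshufd
 * rebuild those lost bits as dword masks, and vpand with the
 * .Lxts_gf128mul_and_shl1_mask constant re-injects bit 63 as bit 64
 * and folds bit 127 back in as the reduction polynomial 0x87.
 */
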
.align 8
SYM_FUNC_START_LOCAL(camellia_xts_crypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 *	%r8: index for input whitening key
	 *	%r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
	 */
	FRAME_BEGIN

	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;

	/* load IV */
	vmovdqu (%rcx), %xmm0;
	vpxor 0 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);
	vmovdqu %xmm0, 0 * 16(%rsi);

	/* construct IVs */
	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 1 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 14 * 16(%rax);
	vmovdqu %xmm0, 1 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 2 * 16(%rdx), %xmm0, %xmm13;
	vmovdqu %xmm0, 2 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 3 * 16(%rdx), %xmm0, %xmm12;
	vmovdqu %xmm0, 3 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 4 * 16(%rdx), %xmm0, %xmm11;
	vmovdqu %xmm0, 4 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 5 * 16(%rdx), %xmm0, %xmm10;
	vmovdqu %xmm0, 5 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 6 * 16(%rdx), %xmm0, %xmm9;
	vmovdqu %xmm0, 6 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 7 * 16(%rdx), %xmm0, %xmm8;
	vmovdqu %xmm0, 7 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 8 * 16(%rdx), %xmm0, %xmm7;
	vmovdqu %xmm0, 8 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 9 * 16(%rdx), %xmm0, %xmm6;
	vmovdqu %xmm0, 9 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 10 * 16(%rdx), %xmm0, %xmm5;
	vmovdqu %xmm0, 10 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 11 * 16(%rdx), %xmm0, %xmm4;
	vmovdqu %xmm0, 11 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 12 * 16(%rdx), %xmm0, %xmm3;
	vmovdqu %xmm0, 12 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 13 * 16(%rdx), %xmm0, %xmm2;
	vmovdqu %xmm0, 13 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 14 * 16(%rdx), %xmm0, %xmm1;
	vmovdqu %xmm0, 14 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 15 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 0 * 16(%rax);
	vmovdqu %xmm0, 15 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vmovdqu %xmm0, (%rcx);

	/* inpack16_pre: */
	vmovq (key_table)(CTX, %r8, 8), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor 0 * 16(%rax), %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor %xmm13, %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	CALL_NOSPEC r9;

	addq $(16 * 16), %rsp;

	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_xts_crypt_16way)

SYM_FUNC_START(camellia_xts_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	xorl %r8d, %r8d; /* input whitening key, 0 for enc */

	leaq __camellia_enc_blk16, %r9;

	jmp camellia_xts_crypt_16way;
SYM_FUNC_END(camellia_xts_enc_16way)

SYM_FUNC_START(camellia_xts_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* input whitening key, last for dec */

	leaq __camellia_dec_blk16, %r9;

	jmp camellia_xts_crypt_16way;
SYM_FUNC_END(camellia_xts_dec_16way)