xref: /OK3568_Linux_fs/kernel/arch/x86/crypto/camellia-x86_64-asm_64.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * Camellia Cipher Algorithm (x86_64)
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6*4882a593Smuzhiyun */
7*4882a593Smuzhiyun
8*4882a593Smuzhiyun#include <linux/linkage.h>
9*4882a593Smuzhiyun
10*4882a593Smuzhiyun.file "camellia-x86_64-asm_64.S"
11*4882a593Smuzhiyun.text
12*4882a593Smuzhiyun
/*
 * Lookup tables defined outside this file.  xor2ror16 below indexes each of
 * them with a zero-extended byte scaled by 8, i.e. as arrays of 8-byte
 * entries.
 */
.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

/* Short local aliases for the table symbols declared above. */
#define sp10011110 camellia_sp10011110
#define sp22000222 camellia_sp22000222
#define sp03303033 camellia_sp03303033
#define sp00444404 camellia_sp00444404
#define sp02220222 camellia_sp02220222
#define sp30333033 camellia_sp30333033
#define sp44044404 camellia_sp44044404
#define sp11101110 camellia_sp11101110

/* Byte offset at which key_length follows the subkey table in the context. */
#define CAMELLIA_TABLE_BYTE_LEN 272

/*
 * struct camellia_ctx:
 * key_table starts at offset 0 and is addressed in this file as 8-byte
 * subkeys; key_length is compared against 16 by the entry points below
 * (presumably the key size in bytes -- confirm against the C definition).
 */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN
36*4882a593Smuzhiyun
/*
 * register macros
 *
 * Suffix convention: the bare name is the 64-bit register, 'd' its low
 * 32 bits, 'bl' its low byte and 'bh' bits 8..15.
 */

/* CTX: context pointer (first argument).  RIO: current source/destination
 * pointer; note that it shares %rsi with RT0 below. */
#define CTX %rdi
#define RIO %rsi
#define RIOd %esi

/* Cipher state: RAB0/RCD0 hold the two 64-bit halves of block 0, RAB1/RCD1
 * those of block 1 in the 2-way code.  %rbx is saved and restored by the
 * 2-way entry points. */
#define RAB0 %rax
#define RCD0 %rcx
#define RAB1 %rbx
#define RCD1 %rdx

#define RAB0d %eax
#define RCD0d %ecx
#define RAB1d %ebx
#define RCD1d %edx

#define RAB0bl %al
#define RCD0bl %cl
#define RAB1bl %bl
#define RCD1bl %dl

#define RAB0bh %ah
#define RCD0bh %ch
#define RAB1bh %bh
#define RCD1bh %dh

/* Scratch registers.  RT0 is %rsi, so RIO is destroyed by any macro that
 * uses RT0 and is reloaded from RDST before output.  RT1 is %r12, whose
 * caller value is parked in RR12 by every entry point. */
#define RT0 %rsi
#define RT1 %r12
#define RT2 %r8

#define RT0d %esi
#define RT1d %r12d
#define RT2d %r8d

#define RT2bl %r8b

/* RXOR: the "xor" flag in the encrypt entry points; the decrypt entry points
 * reuse it as a temporary (constant 24, and the saved %rbx in the 2-way case).
 * RR12: saved %r12.  RDST: saved destination pointer. */
#define RXOR %r9
#define RR12 %r10
#define RDST %r11

#define RXORd %r9d
#define RXORbl %r9b
78*4882a593Smuzhiyun
/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
 *
 * dst ^= T0[byte 0 of ab] ^ T1[byte 1 of ab], where T0/T1 are tables of
 * 8-byte entries, then ab is rotated right by 16 bits so that the next
 * invocation sees the following two bytes.
 *
 * The table bases are loaded with RIP-relative leaq and then indexed, rather
 * than using the table symbol as an absolute displacement: an absolute
 * "sym(, idx, 8)" operand needs a 32-bit absolute relocation and cannot be
 * used in position-independent code, and a RIP-relative operand cannot carry
 * an index register.
 *
 * ab must provide both a 'bl' and a 'bh' byte view.  tmp1 receives the 'bh'
 * byte and therefore must be a register that is encodable without a REX
 * prefix; tmp2 may be any register.
 *
 * Clobbers: tmp1, tmp2, flags.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	leaq T0(%rip),			tmp1; \
	movzbl ab ## bl,		tmp2 ## d; \
	xorq (tmp1, tmp2, 8),		dst; \
	leaq T1(%rip),			tmp2; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $16,			ab; \
	xorq (tmp2, tmp1, 8),		dst;
85*4882a593Smuzhiyun
86*4882a593Smuzhiyun/**********************************************************************
87*4882a593Smuzhiyun  1-way camellia
88*4882a593Smuzhiyun **********************************************************************/
/*
 * roundsm(ab, subkey, cd)
 *
 * One round on block 0: cd0 ^= f(ab0) ^ K[subkey], where K[n] is the 8-byte
 * entry n of key_table and f() is the XOR of eight table lookups, one per
 * byte of ab0.
 *
 * RT2 starts out as the subkey.  The four xor2ror16 invocations consume ab0
 * two bytes at a time and rotate it by 16 each time, so ab0 is back to its
 * original value afterwards (4 * 16 = 64).  Lookups are accumulated
 * alternately into cd0 and RT2, and RT2 is folded into cd0 at the end.
 *
 * Clobbers: RT0, RT1, RT2, flags.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2,					cd ## 0;
98*4882a593Smuzhiyun
/*
 * fls(l, r, kl, kr)
 *
 * Key-dependent mixing of block 0, applied between groups of rounds.  With
 * lo()/hi() the low/high 32 bits of a 64-bit value and K[n] the 8-byte
 * subkey n, the four steps are, in order (each sees the earlier results):
 *
 *	hi(l0) ^= rol32(lo(l0) & lo(K[kl]), 1)
 *	lo(r0) ^= hi(r0) | hi(K[kr])
 *	lo(l0) ^= hi(l0) | hi(K[kl])
 *	hi(r0) ^= rol32(lo(r0) & lo(K[kr]), 1)
 *
 * Clobbers: RT0 (and therefore RIO), RT1, RT2, flags.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT2; \
	orq l ## 0,					RT2; \
	shrq $32,					RT2; \
	xorq RT2,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT0d; \
	andl r ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					r ## 0;
119*4882a593Smuzhiyun
/*
 * enc_rounds(i): six rounds on block 0 with subkeys i+2 .. i+7 in ascending
 * order, alternating which half (RAB0 / RCD0) is the input and which is
 * updated.
 */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);
127*4882a593Smuzhiyun
/* enc_fls(i): fls on block 0 with subkey i for RAB0 and subkey i+1 for RCD0. */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);
130*4882a593Smuzhiyun
/*
 * enc_inpack(): load the 16-byte block at (RIO) into RAB0 (bytes 0..7) and
 * RCD0 (bytes 8..15).  bswapq followed by a 32-bit rotate byte-reverses each
 * 32-bit word in place while keeping the word order.  RAB0 is then XORed
 * with subkey 0.
 */
#define enc_inpack() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX),		RAB0;
139*4882a593Smuzhiyun
/*
 * enc_outunpack(op, max): XOR RCD0 with subkey 'max' (a register holding the
 * subkey index), undo the word-wise byte reversal done on input and write
 * the result to (RIO) with the halves exchanged: RCD0 goes to bytes 0..7
 * and RAB0 to bytes 8..15.  'op' is pasted in front of 'q' to form the store
 * instruction: "mov" overwrites the destination, "xor" XORs into it.
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO);
148*4882a593Smuzhiyun
/*
 * dec_rounds(i): six rounds on block 0 with subkeys i+7 .. i+2 in descending
 * order, i.e. the subkey order of enc_rounds(i) reversed.
 */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);
156*4882a593Smuzhiyun
/*
 * dec_fls(i): fls on block 0 with subkey i+1 for RAB0 and subkey i for RCD0
 * (the subkey pair of enc_fls(i), swapped).
 */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);
159*4882a593Smuzhiyun
/*
 * dec_inpack(max): same load and word-wise byte reversal as enc_inpack(),
 * but RAB0 is XORed with subkey 'max' (a register holding the subkey index)
 * instead of subkey 0.
 */
#define dec_inpack(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0;
168*4882a593Smuzhiyun
/*
 * dec_outunpack(): XOR RCD0 with subkey 0, undo the word-wise byte reversal
 * and store RCD0 to bytes 0..7 and RAB0 to bytes 8..15 of (RIO).  The
 * destination is always overwritten; there is no xor variant.
 */
#define dec_outunpack() \
	xorq key_table(CTX),		RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO);
177*4882a593Smuzhiyun
SYM_FUNC_START(__camellia_enc_blk)
	/*
	 * Encrypt one 16-byte block.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor (only the low byte is tested; non-zero means
	 *	      the result is XORed into dst instead of stored)
	 *
	 * Uses %rax, %rcx, %rsi, %r8-%r11 and the flags as scratch.  %r12 is
	 * used as RT1 and restored from RR12 before returning.
	 */
	movq %r12, RR12;

	/* Move the arguments out of the registers the macros reuse:
	 * %rcx becomes RCD0 and %rsi doubles as RIO/RT0. */
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	movl $24, RT1d; /* max */

	/*
	 * key_length is read as a 32-bit field, matching the cmpl used by
	 * camellia_dec_blk and camellia_dec_blk_2way.
	 */
	cmpl $16, key_length(CTX);
	je .L__enc_done;

	/* Longer keys: one more fls + six more rounds, final subkey is 32. */
	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;	/* movq leaves the flags from testb intact */

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RR12, %r12;
	RET;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RR12, %r12;
	RET;
SYM_FUNC_END(__camellia_enc_blk)
224*4882a593Smuzhiyun
SYM_FUNC_START(camellia_dec_blk)
	/*
	 * Decrypt one 16-byte block.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Uses %rax, %rcx, %rsi, %r8-%r11 and the flags as scratch.  %r12 is
	 * used as RT1 and restored from RR12 before returning.
	 */
	movq %r12, RR12;
	movq %rsi, RDST;	/* must precede the write to RIO (%rsi) */

	/* RT2 = index of the subkey XORed into the input:
	 * 24 when key_length == 16, otherwise 32. */
	movl $24, RXORd;
	movl $32, RT2d;
	cmpl $16, key_length(CTX);
	cmovel RXORd, RT2d;

	movq %rdx, RIO;

	dec_inpack(RT2);

	/* RT2 is clobbered by the rounds, so decide the entry point now. */
	cmpb $24, RT2bl;
	je .L__dec_from_round16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_from_round16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	/* RIO was used as RT0 scratch; point it at the destination. */
	movq RDST, RIO;

	dec_outunpack();

	movq RR12, %r12;
	RET;
SYM_FUNC_END(camellia_dec_blk)
262*4882a593Smuzhiyun
263*4882a593Smuzhiyun/**********************************************************************
264*4882a593Smuzhiyun  2-way camellia
265*4882a593Smuzhiyun **********************************************************************/
/*
 * roundsm2(ab, subkey, cd)
 *
 * One round on both blocks with the same subkey:
 *	cd0 ^= f(ab0) ^ K[subkey]
 *	cd1 ^= f(ab1) ^ K[subkey]
 * where f() is the XOR of eight table lookups, one per byte of its input.
 *
 * RT2 is loaded with the subkey and XORed into cd1 straight away.  Block 0
 * then accumulates its lookups alternately into cd0 and RT2 (as roundsm
 * does); RT2 is folded into cd0 in the middle of the block 1 lookups, which
 * all go directly into cd1.  ab0 and ab1 are each rotated by 4 * 16 = 64
 * bits in total and so end up unchanged.
 *
 * Clobbers: RT0, RT1, RT2, flags.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	xorq RT2,					cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
		xorq RT2,					cd ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
280*4882a593Smuzhiyun
/*
 * fls2(l, r, kl, kr)
 *
 * The fls transformation applied to both blocks.  With lo()/hi() the
 * low/high 32 bits and K[n] the 8-byte subkey n, for each block b in {0, 1}:
 *
 *	hi(l_b) ^= rol32(lo(l_b) & lo(K[kl]), 1)
 *	lo(r_b) ^= hi(r_b) | hi(K[kr])
 *	lo(l_b) ^= hi(l_b) | hi(K[kl])
 *	hi(r_b) ^= rol32(lo(r_b) & lo(K[kr]), 1)
 *
 * The first two steps are done for block 0, then block 1 (indented), then
 * the last two steps for block 0, then block 1.  The scratch register used
 * rotates through RT0/RT1/RT2 from step to step.
 *
 * Clobbers: RT0 (and therefore RIO), RT1, RT2, flags.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
		movl (key_table + ((kl) * 2) * 4)(CTX),		RT2d; \
		andl l ## 1d,					RT2d; \
		roll $1,					RT2d; \
		shlq $32,					RT2; \
		xorq RT2,					l ## 1; \
		movq (key_table + ((kr) * 2) * 4)(CTX),		RT0; \
		orq r ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT1; \
	orq l ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT2d; \
	andl r ## 0d,					RT2d; \
	roll $1,					RT2d; \
	shlq $32,					RT2; \
	xorq RT2,					r ## 0; \
	\
		movq (key_table + ((kl) * 2) * 4)(CTX),		RT0; \
		orq l ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					l ## 1; \
		movl (key_table + ((kr) * 2) * 4)(CTX),		RT1d; \
		andl r ## 1d,					RT1d; \
		roll $1,					RT1d; \
		shlq $32,					RT1; \
		xorq RT1,					r ## 1;
321*4882a593Smuzhiyun
/*
 * enc_rounds2(i): six two-block rounds with subkeys i+2 .. i+7 in ascending
 * order, alternating which half (RAB / RCD) is the input and which is
 * updated.
 */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);
329*4882a593Smuzhiyun
/* enc_fls2(i): fls2 with subkey i for the RAB halves and i+1 for the RCD halves. */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);
332*4882a593Smuzhiyun
/*
 * enc_inpack2(): load two consecutive 16-byte blocks from (RIO): bytes 0..7
 * into RAB0, 8..15 into RCD0, 16..23 into RAB1 and 24..31 into RCD1.  Each
 * register gets bswapq plus a 32-bit rotate, which byte-reverses each 32-bit
 * word in place.  RAB0 and RAB1 are then XORed with subkey 0.
 */
#define enc_inpack2() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX),		RAB0; \
	\
		movq 8*2(RIO),			RAB1; \
		bswapq				RAB1; \
		rorq $32,			RAB1; \
		movq 12*2(RIO),			RCD1; \
		bswapq				RCD1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX),		RAB1;
349*4882a593Smuzhiyun
/*
 * enc_outunpack2(op, max): for each block, XOR the RCD half with subkey
 * 'max' (a register holding the subkey index), undo the word-wise byte
 * reversal and write the block to (RIO) with the halves exchanged: RCD0 to
 * bytes 0..7, RAB0 to 8..15, RCD1 to 16..23 and RAB1 to 24..31.  'op' is
 * pasted in front of 'q' to form the store instruction: "mov" overwrites
 * the destination, "xor" XORs into it.
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO); \
	\
		xorq key_table(CTX, max, 8),	RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		op ## q RCD1,			8*2(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		op ## q RAB1,			12*2(RIO);
366*4882a593Smuzhiyun
/*
 * dec_rounds2(i): six two-block rounds with subkeys i+7 .. i+2 in descending
 * order, i.e. the subkey order of enc_rounds2(i) reversed.
 */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);
374*4882a593Smuzhiyun
/*
 * dec_fls2(i): fls2 with subkey i+1 for the RAB halves and i for the RCD
 * halves (the subkey pair of enc_fls2(i), swapped).
 */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);
377*4882a593Smuzhiyun
/*
 * dec_inpack2(max): same two-block load and word-wise byte reversal as
 * enc_inpack2(), but RAB0 and RAB1 are XORed with subkey 'max' (a register
 * holding the subkey index) instead of subkey 0.
 */
#define dec_inpack2(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0; \
	\
		movq 8*2(RIO),			RAB1; \
		bswapq				RAB1; \
		rorq $32,			RAB1; \
		movq 12*2(RIO),			RCD1; \
		bswapq				RCD1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX, max, 8),	RAB1;
394*4882a593Smuzhiyun
/*
 * dec_outunpack2(): for each block, XOR the RCD half with subkey 0, undo the
 * word-wise byte reversal and store RCD0 to bytes 0..7, RAB0 to 8..15, RCD1
 * to 16..23 and RAB1 to 24..31 of (RIO).  The destination is always
 * overwritten; there is no xor variant.
 */
#define dec_outunpack2() \
	xorq key_table(CTX),		RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO); \
	\
		xorq key_table(CTX),		RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		movq RCD1,			8*2(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		movq RAB1,			12*2(RIO);
411*4882a593Smuzhiyun
SYM_FUNC_START(__camellia_enc_blk_2way)
	/*
	 * Encrypt two consecutive 16-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor (only the low byte is tested; non-zero means
	 *	      the result is XORed into dst instead of stored)
	 *
	 * Uses %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags as scratch.
	 * %rbx (RAB1) is preserved on the stack and %r12 (RT1) in RR12.
	 */
	pushq %rbx;

	/* Move the arguments out of the registers the macros reuse:
	 * %rcx becomes RCD0, %rdx becomes RCD1, %rsi doubles as RIO/RT0. */
	movq %r12, RR12;
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	movl $24, RT2d; /* max */

	/*
	 * key_length is read as a 32-bit field, matching the cmpl used by
	 * camellia_dec_blk and camellia_dec_blk_2way.
	 */
	cmpl $16, key_length(CTX);
	je .L__enc2_done;

	/* Longer keys: one more fls2 + six more rounds, final subkey is 32. */
	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;	/* movq leaves the flags from testb intact */
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;
SYM_FUNC_END(__camellia_enc_blk_2way)
460*4882a593Smuzhiyun
SYM_FUNC_START(camellia_dec_blk_2way)
	/*
	 * Decrypt two consecutive 16-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Uses %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags as scratch.
	 * %rbx (RAB1) is kept in RXOR and %r12 (RT1) in RR12 for the
	 * duration of the call; no stack is used.
	 */
	movq %r12, RR12;
	movq %rsi, RDST;	/* must precede the write to RIO (%rsi) */

	/* RT2 = index of the subkey XORed into the input:
	 * 24 when key_length == 16, otherwise 32. */
	movl $24, RXORd;
	movl $32, RT2d;
	cmpl $16, key_length(CTX);
	cmovel RXORd, RT2d;

	/* RXOR is free again now that the cmov is done; park %rbx in it. */
	movq %rbx, RXOR;
	movq %rdx, RIO;

	dec_inpack2(RT2);

	/* RT2 is clobbered by the rounds, so decide the entry point now. */
	cmpb $24, RT2bl;
	je .L__dec2_from_round16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_from_round16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	/* RIO was used as RT0 scratch; point it at the destination. */
	movq RDST, RIO;

	dec_outunpack2();

	movq RXOR, %rbx;
	movq RR12, %r12;
	RET;
SYM_FUNC_END(camellia_dec_blk_2way)
500