/* xref: /OK3568_Linux_fs/kernel/arch/x86/crypto/blowfish-x86_64-asm_64.S (revision 4882a59341e53eb6f0b4789bf948001014eff981) */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

/*
 * The only header this file includes.  SYM_FUNC_START(), SYM_FUNC_END()
 * and RET are used by this file but not defined in it, so they are
 * expected to come from here.
 */
#include <linux/linkage.h>

.file "blowfish-x86_64-asm.S"
.text

/*
 * Structure of the crypto context: byte offsets from the context pointer.
 *
 * The arithmetic implies the following layout of 32-bit words:
 *	p	16 + 2 words starting at offset 0 (the round key words)
 *	s0..s3	four consecutive tables of 256 words each
 *
 * NOTE(review): presumably this mirrors the context structure used by
 * the C callers field for field -- confirm against that definition
 * before changing either side.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)

/*
 * Register macros.
 *
 * CTX (%r12) and RX1 (%rbx) are callee-saved in the System V AMD64 ABI;
 * the functions in this file that use them save and restore them.
 *
 * WARNING: RIO and RT1 are the same register (%rsi).  Any macro that
 * uses RT1 as a temporary destroys the I/O pointer, so RIO must be
 * (re)loaded after the last such macro and before the next memory access
 * through it.
 */
#define CTX %r12
#define RIO %rsi

/*
 * Block registers.  These are the four registers that have an
 * addressable second byte (%ah/%bh/%ch/%dh), which the F macros rely on.
 * The 1-way code uses RX0 only; the 4-way code uses all four.
 */
#define RX0 %rax
#define RX1 %rbx
#define RX2 %rcx
#define RX3 %rdx

/* 32-bit views of the block registers */
#define RX0d %eax
#define RX1d %ebx
#define RX2d %ecx
#define RX3d %edx

/* bits 7..0 of the block registers */
#define RX0bl %al
#define RX1bl %bl
#define RX2bl %cl
#define RX3bl %dl

/* bits 15..8 of the block registers */
#define RX0bh %ah
#define RX1bh %bh
#define RX2bh %ch
#define RX3bh %dh

/* temporaries (RT1 aliases RIO, see above) */
#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9

/* 32-bit views of the temporaries */
#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

/* round key pair preloaded by the 4-way code */
#define RKEY %r10

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(): one Feistel step on the single block held in RX0.
 *
 * With a, b, c, d the four bytes of the low 32 bits of RX0 (a being the
 * most significant), compute in 32-bit arithmetic
 *
 *	t = ((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 *
 * then swap the two 32-bit halves of RX0 and XOR t into the new low
 * half.  The 32-bit writes to RT0d leave the upper half of RT0 zero, so
 * the final 64-bit XOR does not disturb the high half of RX0.
 *
 * Clobbers RT0, RT1, RT2 and the flags.  RT1 is the same register as
 * RIO, so the I/O pointer is lost.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;

/*
 * add_roundkey_enc(n): XOR the 64 bits stored at key word n into RX0,
 * i.e. key word n into the low half and key word n+1 into the high half.
 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/*
 * round_enc(n): apply key words n and n+1, then perform two F() steps.
 * Clobbers whatever F() clobbers (including RIO).
 */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * add_roundkey_dec(n): load the 64 bits stored at key word n-1, swap the
 * two halves and XOR the result into RX0, i.e. key word n goes into the
 * low half and key word n-1 into the high half.
 *
 * Clobbers RT0.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/*
 * round_dec(n): apply key words n and n-1, then perform two F() steps.
 * Clobbers whatever F() clobbers (including RIO).
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();

/*
 * read_block(): load the 8 bytes at (RIO) into RX0.
 *
 * Afterwards the low half of RX0 holds bytes 0..3 and the high half
 * holds bytes 4..7, each interpreted as a big-endian 32-bit word.
 */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/*
 * write_block(): store RX0 to the 8 bytes at (RIO) in big-endian byte
 * order, so the high half lands in bytes 0..3 and the low half in
 * bytes 4..7.  Note that this is the opposite half order to the one
 * read_block() produces.  RX0 is left byte-swapped.
 */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

/*
 * xor_block(): like write_block(), but XOR the value into the 8 bytes at
 * (RIO) instead of overwriting them.
 */
#define xor_block() \
	bswapq			RX0; \
	xorq RX0,		(RIO);

SYM_FUNC_START(__blowfish_enc_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 *
	 * Processes the 8-byte block at src and either stores the result
	 * at dst or, when %cl is non-zero, XORs it into dst.
	 *
	 * No stack is used: the caller's %r12 is parked in %r11 and dst
	 * in %r10.  %rcx is not touched by any of the 1-way macros, so the
	 * xor flag is still intact when it is tested at the end.
	 */
	movq %r12, %r11;	/* CTX is %r12, which must be preserved */

	movq %rdi, CTX;
	movq %rsi, %r10;	/* F() clobbers %rsi, keep dst here */
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	/* last use of CTX was above; give the caller its %r12 back */
	movq %r11, %r12;

	movq %r10, RIO;
	test %cl, %cl;
	jnz .L__enc_xor;

	write_block();
	RET;
.L__enc_xor:
	xor_block();
	RET;
SYM_FUNC_END(__blowfish_enc_blk)

SYM_FUNC_START(blowfish_dec_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Processes the 8-byte block at src with the key words applied in
	 * descending order (17 down to 0) and stores the result at dst.
	 *
	 * No stack is used: the caller's %r12 is parked in %r11 and dst
	 * in %r10.
	 */
	movq %r12, %r11;	/* CTX is %r12, which must be preserved */

	movq %rdi, CTX;
	movq %rsi, %r10;	/* F() clobbers %rsi, keep dst here */
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;
	write_block();

	movq %r11, %r12;	/* give the caller its %r12 back */

	RET;
SYM_FUNC_END(blowfish_dec_blk)

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * x must be one of RX0..RX3: the macro pastes "bh"/"bl" onto the
 * argument to name its byte registers.
 *
 * Computes the same value as F() from the four bytes of the low half of
 * x, swaps the two 32-bit halves of x (two 16-bit rotations) and XORs
 * the value into the new low half.
 *
 * Clobbers RT0, RT1, RT2, RT3 and the flags.  RT1 is the same register
 * as RIO, so the I/O pointer is lost.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;

/* XOR the round key pair currently held in RKEY into all four blocks. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* Load the 64 bits stored at key word n (words n and n+1) into RKEY. */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/*
 * add_roundkey_enc4(n): apply the key pair that is already in RKEY, then
 * preload the pair for the next round (key word n+2) into RKEY.  The
 * caller must therefore have preloaded the pair for round n beforehand.
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

/*
 * round_enc4(n): apply the preloaded key pair to all four blocks, preload
 * the next pair, then perform two F4() steps on each block.
 */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/*
 * Load the 64 bits stored at key word n-1 into RKEY and swap the halves,
 * so that key word n ends up in the low half and word n-1 in the high
 * half.
 */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/*
 * add_roundkey_dec4(n): apply the key pair that is already in RKEY, then
 * preload the pair for the next round (key word n-2) into RKEY.  The
 * caller must therefore have preloaded the pair for round n beforehand.
 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

/*
 * round_dec4(n): apply the preloaded key pair to all four blocks, preload
 * the next pair, then perform two F4() steps on each block.
 */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/*
 * read_block4(): load four consecutive 8-byte blocks from (RIO),
 * 8(RIO), 16(RIO) and 24(RIO) into RX0..RX3.
 *
 * For each block the low half of the register ends up holding bytes
 * 0..3 and the high half bytes 4..7, each as a big-endian 32-bit word.
 * All of RX0..RX3 are overwritten.
 */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq			RX3;

/*
 * write_block4(): store RX0..RX3 to the 32 bytes at (RIO), each register
 * in big-endian byte order (high half first).  The registers are left
 * byte-swapped.
 */
#define write_block4() \
	bswapq			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	movq RX3,		24(RIO);

/*
 * xor_block4(): like write_block4(), but XOR each value into the
 * destination instead of overwriting it.
 */
#define xor_block4() \
	bswapq			RX0; \
	xorq RX0,		(RIO); \
	\
	bswapq			RX1; \
	xorq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	xorq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	xorq RX3,		24(RIO);

SYM_FUNC_START(__blowfish_enc_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 *
	 * Processes the four consecutive 8-byte blocks at src (32 bytes)
	 * and either stores the results at dst or, when %cl is non-zero,
	 * XORs them into dst.
	 */
	pushq %r12;		/* callee-saved, used as CTX */
	pushq %rbx;		/* callee-saved, used as RX1 */
	pushq %rcx;		/* xor flag; %rcx is reused as RX2 */

	movq %rdi, CTX;
	movq %rsi, %r11;	/* F4() clobbers %rsi, keep dst here */
	movq %rdx, RIO;		/* %rdx is reused as RX3 */

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	/*
	 * CTX is not needed any more, so the saved xor flag is popped
	 * into %r12; the caller's %r12 is restored by the final pop.
	 */
	popq %r12;
	movq %r11, RIO;

	test %r12b, %r12b;
	jnz .L__enc_xor4;

	write_block4();

	popq %rbx;
	popq %r12;
	RET;

.L__enc_xor4:
	xor_block4();

	popq %rbx;
	popq %r12;
	RET;
SYM_FUNC_END(__blowfish_enc_blk_4way)

SYM_FUNC_START(blowfish_dec_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Processes the four consecutive 8-byte blocks at src (32 bytes)
	 * with the key words applied in descending order (17 down to 0)
	 * and stores the results at dst.
	 */
	pushq %r12;		/* callee-saved, used as CTX */
	pushq %rbx;		/* callee-saved, used as RX1 */

	movq %rdi, CTX;
	movq %rsi, %r11;	/* F4() clobbers %rsi, keep dst here */
	movq %rdx, RIO;		/* %rdx is reused as RX3 */

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;

	RET;
SYM_FUNC_END(blowfish_dec_blk_4way)