xref: /OK3568_Linux_fs/kernel/arch/x86/crypto/twofish-x86_64-asm_64.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun/***************************************************************************
3*4882a593Smuzhiyun*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
4*4882a593Smuzhiyun*                                                                         *
5*4882a593Smuzhiyun***************************************************************************/
6*4882a593Smuzhiyun
7*4882a593Smuzhiyun.file "twofish-x86_64-asm.S"
8*4882a593Smuzhiyun.text
9*4882a593Smuzhiyun
10*4882a593Smuzhiyun#include <linux/linkage.h>
11*4882a593Smuzhiyun#include <asm/asm-offsets.h>
12*4882a593Smuzhiyun
/*
 * Byte offsets of the four 32-bit words (a, b, c, d) of a 16-byte
 * Twofish block, used with the whitening macros below.
 */
13*4882a593Smuzhiyun#define a_offset	0
14*4882a593Smuzhiyun#define b_offset	4
15*4882a593Smuzhiyun#define c_offset	8
16*4882a593Smuzhiyun#define d_offset	12
17*4882a593Smuzhiyun
18*4882a593Smuzhiyun/* Structure of the crypto context struct*/
19*4882a593Smuzhiyun
/*
 * Byte offsets into the key-schedule context (base kept in %r11 by the
 * entry points): four 1 KiB key-dependent s-box tables, then the
 * whitening keys at +4096 and the round subkeys at +4128.
 * NOTE(review): these must match the C-side struct twofish_ctx layout --
 * confirm against the generic twofish implementation's context struct.
 */
20*4882a593Smuzhiyun#define s0	0	/* S0 Array 256 Words each */
21*4882a593Smuzhiyun#define s1	1024	/* S1 Array */
22*4882a593Smuzhiyun#define s2	2048	/* S2 Array */
23*4882a593Smuzhiyun#define s3	3072	/* S3 Array */
24*4882a593Smuzhiyun#define w	4096	/* 8 whitening keys (word) */
25*4882a593Smuzhiyun#define k	4128	/* key 1-32 ( word ) */
27*4882a593Smuzhiyun/* define a few register aliases to allow macro substitution */
/*
 * The round macros build register names by token pasting: Rn ## D is the
 * 32-bit view, Rn ## B the low byte, Rn ## H the high byte (bits 8-15).
 * Only the legacy rax/rbx/rcx/rdx registers have an "H" byte form, which
 * is why the block words live in R0-R3 and the ctx pointer is parked in
 * %r11 by the entry points.
 */
28*4882a593Smuzhiyun
29*4882a593Smuzhiyun#define R0     %rax
30*4882a593Smuzhiyun#define R0D    %eax
31*4882a593Smuzhiyun#define R0B    %al
32*4882a593Smuzhiyun#define R0H    %ah
33*4882a593Smuzhiyun
34*4882a593Smuzhiyun#define R1     %rbx
35*4882a593Smuzhiyun#define R1D    %ebx
36*4882a593Smuzhiyun#define R1B    %bl
37*4882a593Smuzhiyun#define R1H    %bh
38*4882a593Smuzhiyun
39*4882a593Smuzhiyun#define R2     %rcx
40*4882a593Smuzhiyun#define R2D    %ecx
41*4882a593Smuzhiyun#define R2B    %cl
42*4882a593Smuzhiyun#define R2H    %ch
43*4882a593Smuzhiyun
44*4882a593Smuzhiyun#define R3     %rdx
45*4882a593Smuzhiyun#define R3D    %edx
46*4882a593Smuzhiyun#define R3B    %dl
47*4882a593Smuzhiyun#define R3H    %dh
48*4882a593Smuzhiyun
49*4882a593Smuzhiyun
50*4882a593Smuzhiyun/* performs input whitening: src ^= ctx->w[offset/4] (two words per xor) */
51*4882a593Smuzhiyun#define input_whitening(src,context,offset)\
52*4882a593Smuzhiyun	xor	w+offset(context),	src;
53*4882a593Smuzhiyun
54*4882a593Smuzhiyun/* performs output whitening: the output whitening keys start 16 bytes
   after the input whitening keys, hence w+16 */
55*4882a593Smuzhiyun#define output_whitening(src,context,offset)\
56*4882a593Smuzhiyun	xor	w+16+offset(context),	src;
57*4882a593Smuzhiyun
58*4882a593Smuzhiyun
59*4882a593Smuzhiyun/*
60*4882a593Smuzhiyun * a input register containing a (rotated 16)
61*4882a593Smuzhiyun * b input register containing b
62*4882a593Smuzhiyun * c input register containing c
63*4882a593Smuzhiyun * d input register containing d (already rol $1)
64*4882a593Smuzhiyun * operations on a and b are interleaved to increase performance
65*4882a593Smuzhiyun */
/*
 * One encryption Feistel round.  The four s-box lookups for a are
 * accumulated in %r9d and those for b in %r8d; each byte of a/b is
 * extracted into %edi with movzx and used to index the 4-byte-wide
 * tables at s0..s3(%r11).  The two adds are the PHT; the round subkeys
 * at k+round and k+4+round are folded in before xor-ing into c and d.
 * %r11 = ctx base; clobbers %edi, %r8d, %r9d and rotates a/b in place.
 * NOTE(review): the rol $15 on c (and ror $15 on b) appear to implement
 * the cipher's 1-bit rotations relative to the "rotated 16" register
 * convention described above -- confirm against the C reference.
 */
66*4882a593Smuzhiyun#define encrypt_round(a,b,c,d,round)\
67*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
68*4882a593Smuzhiyun	mov	s1(%r11,%rdi,4),%r8d;\
69*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
70*4882a593Smuzhiyun	mov	s2(%r11,%rdi,4),%r9d;\
71*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
72*4882a593Smuzhiyun	ror	$16,		b ## D;\
73*4882a593Smuzhiyun	xor	s2(%r11,%rdi,4),%r8d;\
74*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
75*4882a593Smuzhiyun	ror	$16,		a ## D;\
76*4882a593Smuzhiyun	xor	s3(%r11,%rdi,4),%r9d;\
77*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
78*4882a593Smuzhiyun	xor	s3(%r11,%rdi,4),%r8d;\
79*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
80*4882a593Smuzhiyun	xor	(%r11,%rdi,4),	%r9d;\
81*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
82*4882a593Smuzhiyun	ror	$15,		b ## D;\
83*4882a593Smuzhiyun	xor	(%r11,%rdi,4),	%r8d;\
84*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
85*4882a593Smuzhiyun	xor	s1(%r11,%rdi,4),%r9d;\
86*4882a593Smuzhiyun	add	%r8d,		%r9d;\
87*4882a593Smuzhiyun	add	%r9d,		%r8d;\
88*4882a593Smuzhiyun	add	k+round(%r11),	%r9d;\
89*4882a593Smuzhiyun	xor	%r9d,		c ## D;\
90*4882a593Smuzhiyun	rol	$15,		c ## D;\
91*4882a593Smuzhiyun	add	k+4+round(%r11),%r8d;\
92*4882a593Smuzhiyun	xor	%r8d,		d ## D;
93*4882a593Smuzhiyun
94*4882a593Smuzhiyun/*
95*4882a593Smuzhiyun * a input register containing a(rotated 16)
96*4882a593Smuzhiyun * b input register containing b
97*4882a593Smuzhiyun * c input register containing c
98*4882a593Smuzhiyun * d input register containing d (already rol $1)
99*4882a593Smuzhiyun * operations on a and b are interleaved to increase performance
100*4882a593Smuzhiyun * during the round a and b are prepared for the output whitening
101*4882a593Smuzhiyun */
/*
 * Same data path as encrypt_round, with two differences:
 *  - b (high dword) and a (low qword xor) are packed into %r10, which
 *    the caller output-whitens and stores as the first output qword;
 *  - c gets a plain ror $1 (no "rotated 16" prep, since no round follows).
 * Clobbers %edi, %r8d, %r9d, %r10.
 */
102*4882a593Smuzhiyun#define encrypt_last_round(a,b,c,d,round)\
103*4882a593Smuzhiyun	mov	b ## D,		%r10d;\
104*4882a593Smuzhiyun	shl	$32,		%r10;\
105*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
106*4882a593Smuzhiyun	mov	s1(%r11,%rdi,4),%r8d;\
107*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
108*4882a593Smuzhiyun	mov	s2(%r11,%rdi,4),%r9d;\
109*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
110*4882a593Smuzhiyun	ror	$16,		b ## D;\
111*4882a593Smuzhiyun	xor	s2(%r11,%rdi,4),%r8d;\
112*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
113*4882a593Smuzhiyun	ror	$16,		a ## D;\
114*4882a593Smuzhiyun	xor	s3(%r11,%rdi,4),%r9d;\
115*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
116*4882a593Smuzhiyun	xor	s3(%r11,%rdi,4),%r8d;\
117*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
118*4882a593Smuzhiyun	xor	(%r11,%rdi,4),	%r9d;\
119*4882a593Smuzhiyun	xor	a,		%r10;\
120*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
121*4882a593Smuzhiyun	xor	(%r11,%rdi,4),	%r8d;\
122*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
123*4882a593Smuzhiyun	xor	s1(%r11,%rdi,4),%r9d;\
124*4882a593Smuzhiyun	add	%r8d,		%r9d;\
125*4882a593Smuzhiyun	add	%r9d,		%r8d;\
126*4882a593Smuzhiyun	add	k+round(%r11),	%r9d;\
127*4882a593Smuzhiyun	xor	%r9d,		c ## D;\
128*4882a593Smuzhiyun	ror	$1,		c ## D;\
129*4882a593Smuzhiyun	add	k+4+round(%r11),%r8d;\
130*4882a593Smuzhiyun	xor	%r8d,		d ## D
131*4882a593Smuzhiyun
132*4882a593Smuzhiyun/*
133*4882a593Smuzhiyun * a input register containing a
134*4882a593Smuzhiyun * b input register containing b (rotated 16)
135*4882a593Smuzhiyun * c input register containing c (already rol $1)
136*4882a593Smuzhiyun * d input register containing d
137*4882a593Smuzhiyun * operations on a and b are interleaved to increase performance
138*4882a593Smuzhiyun */
/*
 * One decryption Feistel round: mirror image of encrypt_round.  S-box
 * lookups for a accumulate in %r9d (s0, s1, s2, s3 order) and for b in
 * %r8d (s3, s0, s1, s2 order), followed by the PHT adds and the round
 * subkeys k+round / k+4+round; the result is xor-ed into c and d, with
 * the 1-bit un-rotation applied to d (rol $15 under the "rotated 16"
 * convention).  %r11 = ctx base; clobbers %edi, %r8d, %r9d.
 */
139*4882a593Smuzhiyun#define decrypt_round(a,b,c,d,round)\
140*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
141*4882a593Smuzhiyun	mov	(%r11,%rdi,4),	%r9d;\
142*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
143*4882a593Smuzhiyun	mov	s3(%r11,%rdi,4),%r8d;\
144*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
145*4882a593Smuzhiyun	ror	$16,		a ## D;\
146*4882a593Smuzhiyun	xor	s1(%r11,%rdi,4),%r9d;\
147*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
148*4882a593Smuzhiyun	ror	$16,		b ## D;\
149*4882a593Smuzhiyun	xor	(%r11,%rdi,4),	%r8d;\
150*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
151*4882a593Smuzhiyun	xor	s2(%r11,%rdi,4),%r9d;\
152*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
153*4882a593Smuzhiyun	xor	s1(%r11,%rdi,4),%r8d;\
154*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
155*4882a593Smuzhiyun	ror	$15,		a ## D;\
156*4882a593Smuzhiyun	xor	s3(%r11,%rdi,4),%r9d;\
157*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
158*4882a593Smuzhiyun	xor	s2(%r11,%rdi,4),%r8d;\
159*4882a593Smuzhiyun	add	%r8d,		%r9d;\
160*4882a593Smuzhiyun	add	%r9d,		%r8d;\
161*4882a593Smuzhiyun	add	k+round(%r11),	%r9d;\
162*4882a593Smuzhiyun	xor	%r9d,		c ## D;\
163*4882a593Smuzhiyun	add	k+4+round(%r11),%r8d;\
164*4882a593Smuzhiyun	xor	%r8d,		d ## D;\
165*4882a593Smuzhiyun	rol	$15,		d ## D;
166*4882a593Smuzhiyun
167*4882a593Smuzhiyun/*
168*4882a593Smuzhiyun * a input register containing a
169*4882a593Smuzhiyun * b input register containing b
170*4882a593Smuzhiyun * c input register containing c (already rol $1)
171*4882a593Smuzhiyun * d input register containing d
172*4882a593Smuzhiyun * operations on a and b are interleaved to increase performance
173*4882a593Smuzhiyun * during the round a and b are prepared for the output whitening
174*4882a593Smuzhiyun */
/*
 * Final decryption round: same data path as decrypt_round, but b:a are
 * packed into %r10 mid-round (b in the high dword, a xor-ed into the
 * low qword) for the caller's whitening/store, and d gets a plain
 * ror $1 since no further round follows.  Clobbers %edi, %r8-%r10.
 */
175*4882a593Smuzhiyun#define decrypt_last_round(a,b,c,d,round)\
176*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
177*4882a593Smuzhiyun	mov	(%r11,%rdi,4),	%r9d;\
178*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
179*4882a593Smuzhiyun	mov	s3(%r11,%rdi,4),%r8d;\
180*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
181*4882a593Smuzhiyun	ror	$16,		b ## D;\
182*4882a593Smuzhiyun	xor	(%r11,%rdi,4),	%r8d;\
183*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
184*4882a593Smuzhiyun	mov	b ## D,		%r10d;\
185*4882a593Smuzhiyun	shl	$32,		%r10;\
186*4882a593Smuzhiyun	xor	a,		%r10;\
187*4882a593Smuzhiyun	ror	$16,		a ## D;\
188*4882a593Smuzhiyun	xor	s1(%r11,%rdi,4),%r9d;\
189*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
190*4882a593Smuzhiyun	xor	s1(%r11,%rdi,4),%r8d;\
191*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
192*4882a593Smuzhiyun	xor	s2(%r11,%rdi,4),%r9d;\
193*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
194*4882a593Smuzhiyun	xor	s2(%r11,%rdi,4),%r8d;\
195*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
196*4882a593Smuzhiyun	xor	s3(%r11,%rdi,4),%r9d;\
197*4882a593Smuzhiyun	add	%r8d,		%r9d;\
198*4882a593Smuzhiyun	add	%r9d,		%r8d;\
199*4882a593Smuzhiyun	add	k+round(%r11),	%r9d;\
200*4882a593Smuzhiyun	xor	%r9d,		c ## D;\
201*4882a593Smuzhiyun	add	k+4+round(%r11),%r8d;\
202*4882a593Smuzhiyun	xor	%r8d,		d ## D;\
203*4882a593Smuzhiyun	ror	$1,		d ## D;
204*4882a593Smuzhiyun
/*
 * twofish_enc_blk - encrypt one 16-byte block
 * In:   %rdi = ctx (key schedule), %rsi = dst, %rdx = src
 * Out:  %eax = 1; 16 encrypted bytes written to (%rsi)
 * R1 (%rbx) is callee-saved per the SysV ABI, hence the push/pop; the
 * round macros additionally clobber %rdi and %r8-%r11.
 */
205*4882a593SmuzhiyunSYM_FUNC_START(twofish_enc_blk)
206*4882a593Smuzhiyun	pushq    R1
207*4882a593Smuzhiyun
208*4882a593Smuzhiyun	/* %rdi contains the ctx address */
209*4882a593Smuzhiyun	/* %rsi contains the output address */
210*4882a593Smuzhiyun	/* %rdx contains the input address */
211*4882a593Smuzhiyun	/* ctx address is moved to free one non-rex register
212*4882a593Smuzhiyun	as target for the 8bit high operations */
213*4882a593Smuzhiyun	mov	%rdi,		%r11
214*4882a593Smuzhiyun
	/* load the two input qwords via R3 (%rdx = src), whiten, then split
	   into the four dwords R0-R3, pre-rotated as the round macros expect
	   (a rotated 16, d rotated 1) */
215*4882a593Smuzhiyun	movq	(R3),	R1
216*4882a593Smuzhiyun	movq	8(R3),	R3
217*4882a593Smuzhiyun	input_whitening(R1,%r11,a_offset)
218*4882a593Smuzhiyun	input_whitening(R3,%r11,c_offset)
219*4882a593Smuzhiyun	mov	R1D,	R0D
220*4882a593Smuzhiyun	rol	$16,	R0D
221*4882a593Smuzhiyun	shr	$32,	R1
222*4882a593Smuzhiyun	mov	R3D,	R2D
223*4882a593Smuzhiyun	shr	$32,	R3
224*4882a593Smuzhiyun	rol	$1,	R3D
225*4882a593Smuzhiyun
	/* 16 rounds; the word roles swap each round, so the subkey index
	   advances by 8 bytes (two 4-byte subkeys) per round */
226*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,0);
227*4882a593Smuzhiyun	encrypt_round(R2,R3,R0,R1,8);
228*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,2*8);
229*4882a593Smuzhiyun	encrypt_round(R2,R3,R0,R1,3*8);
230*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,4*8);
231*4882a593Smuzhiyun	encrypt_round(R2,R3,R0,R1,5*8);
232*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,6*8);
233*4882a593Smuzhiyun	encrypt_round(R2,R3,R0,R1,7*8);
234*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,8*8);
235*4882a593Smuzhiyun	encrypt_round(R2,R3,R0,R1,9*8);
236*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,10*8);
237*4882a593Smuzhiyun	encrypt_round(R2,R3,R0,R1,11*8);
238*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,12*8);
239*4882a593Smuzhiyun	encrypt_round(R2,R3,R0,R1,13*8);
240*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,14*8);
241*4882a593Smuzhiyun	encrypt_last_round(R2,R3,R0,R1,15*8);
242*4882a593Smuzhiyun
243*4882a593Smuzhiyun
	/* %r10 was packed by encrypt_last_round; whiten and store both halves */
244*4882a593Smuzhiyun	output_whitening(%r10,%r11,a_offset)
245*4882a593Smuzhiyun	movq	%r10,	(%rsi)
246*4882a593Smuzhiyun
247*4882a593Smuzhiyun	shl	$32,	R1
248*4882a593Smuzhiyun	xor	R0,	R1
249*4882a593Smuzhiyun
250*4882a593Smuzhiyun	output_whitening(R1,%r11,c_offset)
251*4882a593Smuzhiyun	movq	R1,	8(%rsi)
252*4882a593Smuzhiyun
253*4882a593Smuzhiyun	popq	R1
254*4882a593Smuzhiyun	movl	$1,%eax
255*4882a593Smuzhiyun	RET
256*4882a593SmuzhiyunSYM_FUNC_END(twofish_enc_blk)
257*4882a593Smuzhiyun
/*
 * twofish_dec_blk - decrypt one 16-byte block
 * In:   %rdi = ctx (key schedule), %rsi = dst, %rdx = src
 * Out:  %eax = 1; 16 decrypted bytes written to (%rsi)
 * Inverse of twofish_enc_blk: output whitening is removed on load,
 * subkeys are consumed in reverse order (15*8 down to 0), and input
 * whitening is removed on store.  R1 (%rbx) is callee-saved, hence the
 * push/pop; the round macros clobber %rdi and %r8-%r11.
 */
258*4882a593SmuzhiyunSYM_FUNC_START(twofish_dec_blk)
259*4882a593Smuzhiyun	pushq    R1
260*4882a593Smuzhiyun
261*4882a593Smuzhiyun	/* %rdi contains the ctx address */
262*4882a593Smuzhiyun	/* %rsi contains the output address */
263*4882a593Smuzhiyun	/* %rdx contains the input address */
264*4882a593Smuzhiyun	/* ctx address is moved to free one non-rex register
265*4882a593Smuzhiyun	as target for the 8bit high operations */
266*4882a593Smuzhiyun	mov	%rdi,		%r11
267*4882a593Smuzhiyun
	/* load via R3 (%rdx = src), strip output whitening, split into four
	   dwords pre-rotated as decrypt_round expects (b rotated 16, c rol 1) */
268*4882a593Smuzhiyun	movq	(R3),	R1
269*4882a593Smuzhiyun	movq	8(R3),	R3
270*4882a593Smuzhiyun	output_whitening(R1,%r11,a_offset)
271*4882a593Smuzhiyun	output_whitening(R3,%r11,c_offset)
272*4882a593Smuzhiyun	mov	R1D,	R0D
273*4882a593Smuzhiyun	shr	$32,	R1
274*4882a593Smuzhiyun	rol	$16,	R1D
275*4882a593Smuzhiyun	mov	R3D,	R2D
276*4882a593Smuzhiyun	shr	$32,	R3
277*4882a593Smuzhiyun	rol	$1,	R2D
278*4882a593Smuzhiyun
	/* 16 rounds with the subkey index walking backwards */
279*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,15*8);
280*4882a593Smuzhiyun	decrypt_round(R2,R3,R0,R1,14*8);
281*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,13*8);
282*4882a593Smuzhiyun	decrypt_round(R2,R3,R0,R1,12*8);
283*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,11*8);
284*4882a593Smuzhiyun	decrypt_round(R2,R3,R0,R1,10*8);
285*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,9*8);
286*4882a593Smuzhiyun	decrypt_round(R2,R3,R0,R1,8*8);
287*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,7*8);
288*4882a593Smuzhiyun	decrypt_round(R2,R3,R0,R1,6*8);
289*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,5*8);
290*4882a593Smuzhiyun	decrypt_round(R2,R3,R0,R1,4*8);
291*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,3*8);
292*4882a593Smuzhiyun	decrypt_round(R2,R3,R0,R1,2*8);
293*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,1*8);
294*4882a593Smuzhiyun	decrypt_last_round(R2,R3,R0,R1,0);
295*4882a593Smuzhiyun
	/* %r10 was packed by decrypt_last_round; strip input whitening, store */
296*4882a593Smuzhiyun	input_whitening(%r10,%r11,a_offset)
297*4882a593Smuzhiyun	movq	%r10,	(%rsi)
298*4882a593Smuzhiyun
299*4882a593Smuzhiyun	shl	$32,	R1
300*4882a593Smuzhiyun	xor	R0,	R1
301*4882a593Smuzhiyun
302*4882a593Smuzhiyun	input_whitening(R1,%r11,c_offset)
303*4882a593Smuzhiyun	movq	R1,	8(%rsi)
304*4882a593Smuzhiyun
305*4882a593Smuzhiyun	popq	R1
306*4882a593Smuzhiyun	movl	$1,%eax
307*4882a593Smuzhiyun	RET
308*4882a593SmuzhiyunSYM_FUNC_END(twofish_dec_blk)
309
309