xref: /OK3568_Linux_fs/kernel/arch/x86/crypto/twofish-i586-asm_32.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun/***************************************************************************
3*4882a593Smuzhiyun*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
4*4882a593Smuzhiyun*                                                                         *
5*4882a593Smuzhiyun***************************************************************************/
6*4882a593Smuzhiyun
7*4882a593Smuzhiyun.file "twofish-i586-asm.S"
8*4882a593Smuzhiyun.text
9*4882a593Smuzhiyun
10*4882a593Smuzhiyun#include <linux/linkage.h>
11*4882a593Smuzhiyun#include <asm/asm-offsets.h>
12*4882a593Smuzhiyun
13*4882a593Smuzhiyun/* return address at 0 */
14*4882a593Smuzhiyun
15*4882a593Smuzhiyun#define in_blk    12  /* input byte array address parameter*/
16*4882a593Smuzhiyun#define out_blk   8  /* output byte array address parameter*/
17*4882a593Smuzhiyun#define ctx       4  /* Twofish context structure */
18*4882a593Smuzhiyun
19*4882a593Smuzhiyun#define a_offset	0
20*4882a593Smuzhiyun#define b_offset	4
21*4882a593Smuzhiyun#define c_offset	8
22*4882a593Smuzhiyun#define d_offset	12
23*4882a593Smuzhiyun
24*4882a593Smuzhiyun/* Structure of the crypto context struct*/
25*4882a593Smuzhiyun
26*4882a593Smuzhiyun#define s0	0	/* S0 Array 256 Words each */
27*4882a593Smuzhiyun#define s1	1024	/* S1 Array */
28*4882a593Smuzhiyun#define s2	2048	/* S2 Array */
29*4882a593Smuzhiyun#define s3	3072	/* S3 Array */
30*4882a593Smuzhiyun#define w	4096	/* 8 whitening keys (word) */
31*4882a593Smuzhiyun#define k	4128	/* key 1-32 ( word ) */
32*4882a593Smuzhiyun
33*4882a593Smuzhiyun/* define a few register aliases to allow macro substitution */
34*4882a593Smuzhiyun
35*4882a593Smuzhiyun#define R0D    %eax
36*4882a593Smuzhiyun#define R0B    %al
37*4882a593Smuzhiyun#define R0H    %ah
38*4882a593Smuzhiyun
39*4882a593Smuzhiyun#define R1D    %ebx
40*4882a593Smuzhiyun#define R1B    %bl
41*4882a593Smuzhiyun#define R1H    %bh
42*4882a593Smuzhiyun
43*4882a593Smuzhiyun#define R2D    %ecx
44*4882a593Smuzhiyun#define R2B    %cl
45*4882a593Smuzhiyun#define R2H    %ch
46*4882a593Smuzhiyun
47*4882a593Smuzhiyun#define R3D    %edx
48*4882a593Smuzhiyun#define R3B    %dl
49*4882a593Smuzhiyun#define R3H    %dh
50*4882a593Smuzhiyun
51*4882a593Smuzhiyun
52*4882a593Smuzhiyun/* performs input whitening */
53*4882a593Smuzhiyun#define input_whitening(src,context,offset)\
54*4882a593Smuzhiyun	xor	w+offset(context),	src;
55*4882a593Smuzhiyun
56*4882a593Smuzhiyun/* performs input whitening */
57*4882a593Smuzhiyun#define output_whitening(src,context,offset)\
58*4882a593Smuzhiyun	xor	w+16+offset(context),	src;
59*4882a593Smuzhiyun
60*4882a593Smuzhiyun/*
61*4882a593Smuzhiyun * a input register containing a (rotated 16)
62*4882a593Smuzhiyun * b input register containing b
63*4882a593Smuzhiyun * c input register containing c
64*4882a593Smuzhiyun * d input register containing d (already rol $1)
65*4882a593Smuzhiyun * operations on a and b are interleaved to increase performance
66*4882a593Smuzhiyun */
67*4882a593Smuzhiyun#define encrypt_round(a,b,c,d,round)\
68*4882a593Smuzhiyun	push	d ## D;\
69*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
70*4882a593Smuzhiyun	mov	s1(%ebp,%edi,4),d ## D;\
71*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
72*4882a593Smuzhiyun	mov	s2(%ebp,%edi,4),%esi;\
73*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
74*4882a593Smuzhiyun	ror	$16,		b ## D;\
75*4882a593Smuzhiyun	xor	s2(%ebp,%edi,4),d ## D;\
76*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
77*4882a593Smuzhiyun	ror	$16,		a ## D;\
78*4882a593Smuzhiyun	xor	s3(%ebp,%edi,4),%esi;\
79*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
80*4882a593Smuzhiyun	xor	s3(%ebp,%edi,4),d ## D;\
81*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
82*4882a593Smuzhiyun	xor	(%ebp,%edi,4),	%esi;\
83*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
84*4882a593Smuzhiyun	ror	$15,		b ## D;\
85*4882a593Smuzhiyun	xor	(%ebp,%edi,4),	d ## D;\
86*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
87*4882a593Smuzhiyun	xor	s1(%ebp,%edi,4),%esi;\
88*4882a593Smuzhiyun	pop	%edi;\
89*4882a593Smuzhiyun	add	d ## D,		%esi;\
90*4882a593Smuzhiyun	add	%esi,		d ## D;\
91*4882a593Smuzhiyun	add	k+round(%ebp),	%esi;\
92*4882a593Smuzhiyun	xor	%esi,		c ## D;\
93*4882a593Smuzhiyun	rol	$15,		c ## D;\
94*4882a593Smuzhiyun	add	k+4+round(%ebp),d ## D;\
95*4882a593Smuzhiyun	xor	%edi,		d ## D;
96*4882a593Smuzhiyun
97*4882a593Smuzhiyun/*
98*4882a593Smuzhiyun * a input register containing a (rotated 16)
99*4882a593Smuzhiyun * b input register containing b
100*4882a593Smuzhiyun * c input register containing c
101*4882a593Smuzhiyun * d input register containing d (already rol $1)
102*4882a593Smuzhiyun * operations on a and b are interleaved to increase performance
103*4882a593Smuzhiyun * last round has different rotations for the output preparation
104*4882a593Smuzhiyun */
105*4882a593Smuzhiyun#define encrypt_last_round(a,b,c,d,round)\
106*4882a593Smuzhiyun	push	d ## D;\
107*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
108*4882a593Smuzhiyun	mov	s1(%ebp,%edi,4),d ## D;\
109*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
110*4882a593Smuzhiyun	mov	s2(%ebp,%edi,4),%esi;\
111*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
112*4882a593Smuzhiyun	ror	$16,		b ## D;\
113*4882a593Smuzhiyun	xor	s2(%ebp,%edi,4),d ## D;\
114*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
115*4882a593Smuzhiyun	ror	$16,		a ## D;\
116*4882a593Smuzhiyun	xor	s3(%ebp,%edi,4),%esi;\
117*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
118*4882a593Smuzhiyun	xor	s3(%ebp,%edi,4),d ## D;\
119*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
120*4882a593Smuzhiyun	xor	(%ebp,%edi,4),	%esi;\
121*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
122*4882a593Smuzhiyun	ror	$16,		b ## D;\
123*4882a593Smuzhiyun	xor	(%ebp,%edi,4),	d ## D;\
124*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
125*4882a593Smuzhiyun	xor	s1(%ebp,%edi,4),%esi;\
126*4882a593Smuzhiyun	pop	%edi;\
127*4882a593Smuzhiyun	add	d ## D,		%esi;\
128*4882a593Smuzhiyun	add	%esi,		d ## D;\
129*4882a593Smuzhiyun	add	k+round(%ebp),	%esi;\
130*4882a593Smuzhiyun	xor	%esi,		c ## D;\
131*4882a593Smuzhiyun	ror	$1,		c ## D;\
132*4882a593Smuzhiyun	add	k+4+round(%ebp),d ## D;\
133*4882a593Smuzhiyun	xor	%edi,		d ## D;
134*4882a593Smuzhiyun
135*4882a593Smuzhiyun/*
136*4882a593Smuzhiyun * a input register containing a
137*4882a593Smuzhiyun * b input register containing b (rotated 16)
138*4882a593Smuzhiyun * c input register containing c
139*4882a593Smuzhiyun * d input register containing d (already rol $1)
140*4882a593Smuzhiyun * operations on a and b are interleaved to increase performance
141*4882a593Smuzhiyun */
142*4882a593Smuzhiyun#define decrypt_round(a,b,c,d,round)\
143*4882a593Smuzhiyun	push	c ## D;\
144*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
145*4882a593Smuzhiyun	mov	(%ebp,%edi,4),	c ## D;\
146*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
147*4882a593Smuzhiyun	mov	s3(%ebp,%edi,4),%esi;\
148*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
149*4882a593Smuzhiyun	ror	$16,		a ## D;\
150*4882a593Smuzhiyun	xor	s1(%ebp,%edi,4),c ## D;\
151*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
152*4882a593Smuzhiyun	ror	$16,		b ## D;\
153*4882a593Smuzhiyun	xor	(%ebp,%edi,4),	%esi;\
154*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
155*4882a593Smuzhiyun	xor	s2(%ebp,%edi,4),c ## D;\
156*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
157*4882a593Smuzhiyun	xor	s1(%ebp,%edi,4),%esi;\
158*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
159*4882a593Smuzhiyun	ror	$15,		a ## D;\
160*4882a593Smuzhiyun	xor	s3(%ebp,%edi,4),c ## D;\
161*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
162*4882a593Smuzhiyun	xor	s2(%ebp,%edi,4),%esi;\
163*4882a593Smuzhiyun	pop	%edi;\
164*4882a593Smuzhiyun	add	%esi,		c ## D;\
165*4882a593Smuzhiyun	add	c ## D,		%esi;\
166*4882a593Smuzhiyun	add	k+round(%ebp),	c ## D;\
167*4882a593Smuzhiyun	xor	%edi,		c ## D;\
168*4882a593Smuzhiyun	add	k+4+round(%ebp),%esi;\
169*4882a593Smuzhiyun	xor	%esi,		d ## D;\
170*4882a593Smuzhiyun	rol	$15,		d ## D;
171*4882a593Smuzhiyun
172*4882a593Smuzhiyun/*
173*4882a593Smuzhiyun * a input register containing a
174*4882a593Smuzhiyun * b input register containing b (rotated 16)
175*4882a593Smuzhiyun * c input register containing c
176*4882a593Smuzhiyun * d input register containing d (already rol $1)
177*4882a593Smuzhiyun * operations on a and b are interleaved to increase performance
178*4882a593Smuzhiyun * last round has different rotations for the output preparation
179*4882a593Smuzhiyun */
180*4882a593Smuzhiyun#define decrypt_last_round(a,b,c,d,round)\
181*4882a593Smuzhiyun	push	c ## D;\
182*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
183*4882a593Smuzhiyun	mov	(%ebp,%edi,4),	c ## D;\
184*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
185*4882a593Smuzhiyun	mov	s3(%ebp,%edi,4),%esi;\
186*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
187*4882a593Smuzhiyun	ror	$16,		a ## D;\
188*4882a593Smuzhiyun	xor	s1(%ebp,%edi,4),c ## D;\
189*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
190*4882a593Smuzhiyun	ror	$16,		b ## D;\
191*4882a593Smuzhiyun	xor	(%ebp,%edi,4),	%esi;\
192*4882a593Smuzhiyun	movzx	a ## B,		%edi;\
193*4882a593Smuzhiyun	xor	s2(%ebp,%edi,4),c ## D;\
194*4882a593Smuzhiyun	movzx	b ## B,		%edi;\
195*4882a593Smuzhiyun	xor	s1(%ebp,%edi,4),%esi;\
196*4882a593Smuzhiyun	movzx	a ## H,		%edi;\
197*4882a593Smuzhiyun	ror	$16,		a ## D;\
198*4882a593Smuzhiyun	xor	s3(%ebp,%edi,4),c ## D;\
199*4882a593Smuzhiyun	movzx	b ## H,		%edi;\
200*4882a593Smuzhiyun	xor	s2(%ebp,%edi,4),%esi;\
201*4882a593Smuzhiyun	pop	%edi;\
202*4882a593Smuzhiyun	add	%esi,		c ## D;\
203*4882a593Smuzhiyun	add	c ## D,		%esi;\
204*4882a593Smuzhiyun	add	k+round(%ebp),	c ## D;\
205*4882a593Smuzhiyun	xor	%edi,		c ## D;\
206*4882a593Smuzhiyun	add	k+4+round(%ebp),%esi;\
207*4882a593Smuzhiyun	xor	%esi,		d ## D;\
208*4882a593Smuzhiyun	ror	$1,		d ## D;
209*4882a593Smuzhiyun
210*4882a593SmuzhiyunSYM_FUNC_START(twofish_enc_blk)
211*4882a593Smuzhiyun	push	%ebp			/* save registers according to calling convention*/
212*4882a593Smuzhiyun	push    %ebx
213*4882a593Smuzhiyun	push    %esi
214*4882a593Smuzhiyun	push    %edi
215*4882a593Smuzhiyun
216*4882a593Smuzhiyun	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: set new base
217*4882a593Smuzhiyun					 * pointer to the ctx address */
218*4882a593Smuzhiyun	mov     in_blk+16(%esp),%edi	/* input address in edi */
219*4882a593Smuzhiyun
220*4882a593Smuzhiyun	mov	(%edi),		%eax
221*4882a593Smuzhiyun	mov	b_offset(%edi),	%ebx
222*4882a593Smuzhiyun	mov	c_offset(%edi),	%ecx
223*4882a593Smuzhiyun	mov	d_offset(%edi),	%edx
224*4882a593Smuzhiyun	input_whitening(%eax,%ebp,a_offset)
225*4882a593Smuzhiyun	ror	$16,	%eax
226*4882a593Smuzhiyun	input_whitening(%ebx,%ebp,b_offset)
227*4882a593Smuzhiyun	input_whitening(%ecx,%ebp,c_offset)
228*4882a593Smuzhiyun	input_whitening(%edx,%ebp,d_offset)
229*4882a593Smuzhiyun	rol	$1,	%edx
230*4882a593Smuzhiyun
231*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,0);
232*4882a593Smuzhiyun	encrypt_round(R2,R3,R0,R1,8);
233*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,2*8);
234*4882a593Smuzhiyun	encrypt_round(R2,R3,R0,R1,3*8);
235*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,4*8);
236*4882a593Smuzhiyun	encrypt_round(R2,R3,R0,R1,5*8);
237*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,6*8);
238*4882a593Smuzhiyun	encrypt_round(R2,R3,R0,R1,7*8);
239*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,8*8);
240*4882a593Smuzhiyun	encrypt_round(R2,R3,R0,R1,9*8);
241*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,10*8);
242*4882a593Smuzhiyun	encrypt_round(R2,R3,R0,R1,11*8);
243*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,12*8);
244*4882a593Smuzhiyun	encrypt_round(R2,R3,R0,R1,13*8);
245*4882a593Smuzhiyun	encrypt_round(R0,R1,R2,R3,14*8);
246*4882a593Smuzhiyun	encrypt_last_round(R2,R3,R0,R1,15*8);
247*4882a593Smuzhiyun
248*4882a593Smuzhiyun	output_whitening(%eax,%ebp,c_offset)
249*4882a593Smuzhiyun	output_whitening(%ebx,%ebp,d_offset)
250*4882a593Smuzhiyun	output_whitening(%ecx,%ebp,a_offset)
251*4882a593Smuzhiyun	output_whitening(%edx,%ebp,b_offset)
252*4882a593Smuzhiyun	mov	out_blk+16(%esp),%edi;
253*4882a593Smuzhiyun	mov	%eax,		c_offset(%edi)
254*4882a593Smuzhiyun	mov	%ebx,		d_offset(%edi)
255*4882a593Smuzhiyun	mov	%ecx,		(%edi)
256*4882a593Smuzhiyun	mov	%edx,		b_offset(%edi)
257*4882a593Smuzhiyun
258*4882a593Smuzhiyun	pop	%edi
259*4882a593Smuzhiyun	pop	%esi
260*4882a593Smuzhiyun	pop	%ebx
261*4882a593Smuzhiyun	pop	%ebp
262*4882a593Smuzhiyun	mov	$1,	%eax
263*4882a593Smuzhiyun	RET
264*4882a593SmuzhiyunSYM_FUNC_END(twofish_enc_blk)
265*4882a593Smuzhiyun
266*4882a593SmuzhiyunSYM_FUNC_START(twofish_dec_blk)
267*4882a593Smuzhiyun	push	%ebp			/* save registers according to calling convention*/
268*4882a593Smuzhiyun	push    %ebx
269*4882a593Smuzhiyun	push    %esi
270*4882a593Smuzhiyun	push    %edi
271*4882a593Smuzhiyun
272*4882a593Smuzhiyun
273*4882a593Smuzhiyun	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: set new base
274*4882a593Smuzhiyun					 * pointer to the ctx address */
275*4882a593Smuzhiyun	mov     in_blk+16(%esp),%edi	/* input address in edi */
276*4882a593Smuzhiyun
277*4882a593Smuzhiyun	mov	(%edi),		%eax
278*4882a593Smuzhiyun	mov	b_offset(%edi),	%ebx
279*4882a593Smuzhiyun	mov	c_offset(%edi),	%ecx
280*4882a593Smuzhiyun	mov	d_offset(%edi),	%edx
281*4882a593Smuzhiyun	output_whitening(%eax,%ebp,a_offset)
282*4882a593Smuzhiyun	output_whitening(%ebx,%ebp,b_offset)
283*4882a593Smuzhiyun	ror	$16,	%ebx
284*4882a593Smuzhiyun	output_whitening(%ecx,%ebp,c_offset)
285*4882a593Smuzhiyun	output_whitening(%edx,%ebp,d_offset)
286*4882a593Smuzhiyun	rol	$1,	%ecx
287*4882a593Smuzhiyun
288*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,15*8);
289*4882a593Smuzhiyun	decrypt_round(R2,R3,R0,R1,14*8);
290*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,13*8);
291*4882a593Smuzhiyun	decrypt_round(R2,R3,R0,R1,12*8);
292*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,11*8);
293*4882a593Smuzhiyun	decrypt_round(R2,R3,R0,R1,10*8);
294*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,9*8);
295*4882a593Smuzhiyun	decrypt_round(R2,R3,R0,R1,8*8);
296*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,7*8);
297*4882a593Smuzhiyun	decrypt_round(R2,R3,R0,R1,6*8);
298*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,5*8);
299*4882a593Smuzhiyun	decrypt_round(R2,R3,R0,R1,4*8);
300*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,3*8);
301*4882a593Smuzhiyun	decrypt_round(R2,R3,R0,R1,2*8);
302*4882a593Smuzhiyun	decrypt_round(R0,R1,R2,R3,1*8);
303*4882a593Smuzhiyun	decrypt_last_round(R2,R3,R0,R1,0);
304*4882a593Smuzhiyun
305*4882a593Smuzhiyun	input_whitening(%eax,%ebp,c_offset)
306*4882a593Smuzhiyun	input_whitening(%ebx,%ebp,d_offset)
307*4882a593Smuzhiyun	input_whitening(%ecx,%ebp,a_offset)
308*4882a593Smuzhiyun	input_whitening(%edx,%ebp,b_offset)
309*4882a593Smuzhiyun	mov	out_blk+16(%esp),%edi;
310*4882a593Smuzhiyun	mov	%eax,		c_offset(%edi)
311*4882a593Smuzhiyun	mov	%ebx,		d_offset(%edi)
312*4882a593Smuzhiyun	mov	%ecx,		(%edi)
313*4882a593Smuzhiyun	mov	%edx,		b_offset(%edi)
314*4882a593Smuzhiyun
315*4882a593Smuzhiyun	pop	%edi
316*4882a593Smuzhiyun	pop	%esi
317*4882a593Smuzhiyun	pop	%ebx
318*4882a593Smuzhiyun	pop	%ebp
319*4882a593Smuzhiyun	mov	$1,	%eax
320*4882a593Smuzhiyun	RET
321*4882a593SmuzhiyunSYM_FUNC_END(twofish_dec_blk)
322