xref: /OK3568_Linux_fs/kernel/arch/mips/crypto/chacha-core.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0 OR MIT */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
4*4882a593Smuzhiyun * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
5*4882a593Smuzhiyun */
6*4882a593Smuzhiyun
/* Mask selecting the whole-word part of a byte count (BYTES & 0x3c). */
#define MASK_U32		0x3c
#define CHACHA20_BLOCK_SIZE	64
#define STACK_SIZE		32

/* The 16-word ChaCha state is kept entirely in registers X0-X15. */
#define X0	$t0
#define X1	$t1
#define X2	$t2
#define X3	$t3
#define X4	$t4
#define X5	$t5
#define X6	$t6
#define X7	$t7
#define X8	$t8
#define X9	$t9
#define X10	$v1
#define X11	$s6
#define X12	$s5
#define X13	$s4
#define X14	$s3
#define X15	$s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0	$s1
#define T1	$s0
#define T(n)	T ## n
#define X(n)	X ## n

/* Input arguments */
#define STATE		$a0
#define OUT		$a1
#define IN		$a2
#define BYTES		$a3

/* Output argument */
/* NONCE[0] (the block counter, state word 12) is kept in a register and
 * not in memory.  We don't want to touch the original value in memory.
 * Must be incremented every loop iteration.
 */
#define NONCE_0		$v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used for handling the last bytes which are not a multiple of 4.
 */
#define SAVED_X		X15
#define SAVED_CA	$s7

/* Shares $s7 with SAVED_CA: IS_UNALIGNED is last read just before the
 * final-block jump tables write SAVED_CA, so the two are never live at
 * the same time.
 */
#define IS_UNALIGNED	$s7

/* Endianness helpers: MSB/LSB give the byte offsets used by lwl/lwr and
 * swl/swr; CPU_TO_LE32 byte-swaps a word on big-endian CPUs (no-op on
 * little-endian); ROTx/ROTR are used by the byte-wise tail to walk the
 * keystream word in output-stream byte order.
 */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define	CPU_TO_LE32(n) \
	wsbh	n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif

/* Expand x(n) for every state word index, ascending order. */
#define FOR_EACH_WORD(x) \
	x( 0); \
	x( 1); \
	x( 2); \
	x( 3); \
	x( 4); \
	x( 5); \
	x( 6); \
	x( 7); \
	x( 8); \
	x( 9); \
	x(10); \
	x(11); \
	x(12); \
	x(13); \
	x(14); \
	x(15);

/* Expand x(n) for every state word index, descending order. */
#define FOR_EACH_WORD_REV(x) \
	x(15); \
	x(14); \
	x(13); \
	x(12); \
	x(11); \
	x(10); \
	x( 9); \
	x( 8); \
	x( 7); \
	x( 6); \
	x( 5); \
	x( 4); \
	x( 3); \
	x( 2); \
	x( 1); \
	x( 0);

/* PLUS_ONE(x) expands to x+1 at preprocessing time (cpp cannot do
 * arithmetic inside token pasting).  It names the store labels so that
 * label _N_b begins the sequence which stores words N-1 .. 0, i.e. the
 * jump-table entry for N remaining full words targets _N_b.
 */
#define PLUS_ONE_0	 1
#define PLUS_ONE_1	 2
#define PLUS_ONE_2	 3
#define PLUS_ONE_3	 4
#define PLUS_ONE_4	 5
#define PLUS_ONE_5	 6
#define PLUS_ONE_6	 7
#define PLUS_ONE_7	 8
#define PLUS_ONE_8	 9
#define PLUS_ONE_9	10
#define PLUS_ONE_10	11
#define PLUS_ONE_11	12
#define PLUS_ONE_12	13
#define PLUS_ONE_13	14
#define PLUS_ONE_14	15
#define PLUS_ONE_15	16
#define PLUS_ONE(x)	PLUS_ONE_ ## x
#define _CONCAT3(a,b,c)	a ## b ## c
#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)

/* Finish keystream word x (add the saved state word — NONCE_0 for the
 * block counter, word 12 — then convert to LE), XOR it with the input
 * word read via lwl/lwr, and store via swl/swr (unaligned-safe).
 */
#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);

/* Same as STORE_UNALIGNED but with plain word loads/stores, used when
 * both IN and OUT are 4-byte aligned.
 */
#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);

/* Jump table macro.
 * Used for setup and handling the last bytes, which are not multiple of 4.
 * X15 is free to store Xn
 * Every jumptable entry must be equal in size: one branch plus one
 * delay-slot instruction = 8 bytes (the dispatch code scales the word
 * count by this entry size).  The delay slot precomputes SAVED_X, the
 * finished keystream word for the partial trailing word.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

/* Unaligned variant of JMPTBL_ALIGNED; same 8-byte entry layout. */
#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

/* Four ChaCha quarter-round steps in parallel (one step for each of the
 * four columns/diagonals):  X[a] += X[k];  X[v] ^= X[a];
 * X[v] = rotl(X[v], S).  A full quarter round is four AXR invocations
 * with S = 16, 12, 8, 7.
 */
#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
	addu	X(A), X(K); \
	addu	X(B), X(L); \
	addu	X(C), X(M); \
	addu	X(D), X(N); \
	xor	X(V), X(A); \
	xor	X(W), X(B); \
	xor	X(Y), X(C); \
	xor	X(Z), X(D); \
	rotl	X(V), S;    \
	rotl	X(W), S;    \
	rotl	X(Y), S;    \
	rotl	X(Z), S;
199*4882a593Smuzhiyun
.text
.set	reorder
.set	noat

/*
 * void chacha_crypt_arch(u32 *state, u8 *out, const u8 *in,
 *			  unsigned int bytes, int nrounds)
 *
 * In:   STATE ($a0) = 16-word ChaCha state
 *       OUT   ($a1) = destination buffer
 *       IN    ($a2) = source buffer
 *       BYTES ($a3) = number of bytes to encrypt/decrypt
 *       nrounds     = 5th argument, read from the caller's stack slot
 *			at 16($sp) (o32 calling convention)
 * Out:  OUT[0..bytes) = IN XOR keystream; state word 12 (block counter)
 *       is advanced in memory by the number of blocks consumed.
 * Uses: saves/restores $s0-$s7 on a STACK_SIZE-byte frame.
 */
.globl	chacha_crypt_arch
.ent	chacha_crypt_arch
chacha_crypt_arch:
	.frame	$sp, STACK_SIZE, $ra

	/* Load number of rounds (5th arg, passed on the stack). */
	lw	$at, 16($sp)

	addiu	$sp, -STACK_SIZE

	/* Nothing to do if bytes == 0. */
	beqz	BYTES, .Lchacha_mips_end

	/* Block counter lives in NONCE_0; memory copy updated on exit. */
	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0,  0($sp)
	sw	$s1,  4($sp)
	sw	$s2,  8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test IN or OUT is unaligned.
	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	b	.Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
	/* Advance to the next 64-byte block; bump block counter. */
	addiu	IN,  CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha_rounds_start:
	/* Reload the working state X0-X15 from memory; word 12 comes
	 * from the register-held counter NONCE_0.
	 */
	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)

	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_chacha_xor_rounds:
	/* Two rounds per iteration: a column round (first four AXR) and
	 * a diagonal round (last four); $at counts rounds down by 2.
	 */
	addiu	$at, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha_xor_rounds

	/* BYTES goes negative when less than a full block remains. */
	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* Is data src/dst unaligned? Jump */
	bnez	IS_UNALIGNED, .Loop_chacha_unaligned

	/* Set number rounds here to fill delayslot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_aligned

	/* Full block: add state, XOR and store all 16 words (aligned). */
	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Place this here to fill delay slot */
	addiu	NONCE_0, 1

	/* BYTES < 0? Handle last bytes */
	bltz	BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
	/* Restore used registers */
	lw	$s0,  0($sp)
	lw	$s1,  4($sp)
	lw	$s2,  8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to right location in state */
	sw	NONCE_0, 48(STATE)

.Lchacha_mips_end:
	addiu	$sp, STACK_SIZE
	jr	$ra

.Lchacha_mips_no_full_block_aligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS ($at = BYTES & 0x3c, bytes in whole words) */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

	/* Calculate lower half jump table offset: each table entry is
	 * 8 bytes, so the offset is ($at/4)*8 = $at*2.  T0 currently
	 * holds %hi << 16 (low bits zero), so inserting $at[5:0] at bit 1
	 * adds $at*2 without carries; %lo is added below.
	 */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

	/* Read value from STATE: the state word the trailing partial
	 * keystream word must be added to (consumed as SAVED_CA in the
	 * jump-table delay slot).
	 */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
	/* Set number rounds here to fill delayslot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned

	/* Full block: add state, XOR and store via lwl/lwr + swl/swr. */
	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Write NONCE_0 back to right location in state */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* Fall through to byte handling; the _0_b labels are the jump
	 * targets for "zero full words remaining".
	 */
	bgez	BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
	/* Place this here to fill delay slot */
	addiu	NONCE_0, 1
	.set reorder

.Lchacha_mips_xor_bytes:
	/* Handle the 1-3 trailing bytes.  $at = bytes already consumed
	 * in full words; BYTES is the negative remainder; SAVED_X holds
	 * the finished keystream word for this position.
	 */
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	CPU_TO_LE32(SAVED_X)
	ROTR(SAVED_X)
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS ($at = BYTES & 0x3c, bytes in whole words) */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

	/* Calculate lower half jump table offset (entry size 8 bytes,
	 * so offset = $at*2; see the aligned path for details).
	 */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

	/* Read value from STATE (state word for the trailing partial word) */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at
425*4882a593Smuzhiyun
/* Input arguments
 * STATE	$a0
 * OUT		$a1
 * NROUND	$a2
 */

/* Remap the upper state words to argument/temp registers so the whole
 * 16-word state still fits in registers without extra callee-saved
 * spills.  NOTE: X15 aliases STATE, so STATE must not be dereferenced
 * after X15 is loaded (the 60(STATE) load below is the last use).
 */
#undef X12
#undef X13
#undef X14
#undef X15

#define X12	$a3
#define X13	$at
#define X14	$v0
#define X15	STATE

.set noat

/*
 * void hchacha_block_arch(const u32 *state, u32 *out, int nrounds)
 *
 * HChaCha core: run nrounds ($a2, decremented by 2 per loop) of the
 * ChaCha permutation over the 16-word input state and write words
 * 0-3 and 12-15 of the result to OUT.  Unlike the stream cipher there
 * is no feed-forward addition of the input state — the raw permutation
 * words are stored.
 */
.globl	hchacha_block_arch
.ent	hchacha_block_arch
hchacha_block_arch:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Save X11(s6) */
	sw	X11, 0($sp)

	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)
	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)
	lw	X12, 48(STATE)
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_hchacha_xor_rounds:
	/* Column round + diagonal round per iteration (see AXR). */
	addiu	$a2, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$a2, .Loop_hchacha_xor_rounds

	/* Restore used register */
	lw	X11, 0($sp)

	/* Store words 0-3 and 12-15 (the HChaCha output). */
	sw	X0,  0(OUT)
	sw	X1,  4(OUT)
	sw	X2,  8(OUT)
	sw	X3,  12(OUT)
	sw	X12, 16(OUT)
	sw	X13, 20(OUT)
	sw	X14, 24(OUT)
	sw	X15, 28(OUT)

	addiu	$sp, STACK_SIZE
	jr	$ra
.end hchacha_block_arch
.set at
498