/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

 /*
  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
  *
  * (a)  vshl.u32 + vsri.u32		(needs temporary register)
  * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
  * (c)  vrev32.16			(16-bit rotations only)
  * (d)  vtbl.8 + vtbl.8		(rotations by multiples of 8 bits only,
  *					 needs index vector)
  *
  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
  * cycles of (b) on both Cortex-A7 and Cortex-A53.
  *
  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
  * and doesn't need a temporary register.
  *
  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
  * is twice as fast as (a), even when doing (a) on multiple registers
  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
  * parallelizes better when temporary registers are scarce.
  *
  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
  * (a), so the need to load the rotation table actually makes the vtbl method
  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
  * seems to be a good compromise to get a more significant speed boost on some
  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
  */
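
 /*
  * For illustration only: with the little-endian byte order used here, a
  * left rotation of each 32-bit word by 8 bits is just the byte permutation
  *
  *	(b0 b1 b2 b3)  ->  (b3 b0 b1 b2)
  *
  * so one vtbl.8 per d register, indexed by .Lrol8_table below
  * (3, 0, 1, 2, 7, 4, 5, 6), rotates both words of the register at once
  * without needing a temporary register or a shift/insert pair.
  */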

#include <linux/linkage.h>

	.text
	.fpu		neon
	.align		5

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers q0-q3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in r3.
 *
 * Clobbers: r3, ip, q4-q5
 */
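/*
 * For reference, each pass of .Ldoubleround below corresponds to the scalar
 * ChaCha double round (a C-style sketch, shown only as documentation):
 *
 *	#define QR(a, b, c, d) (		\
 *		a += b, d = rotl32(d ^ a, 16),	\
 *		c += d, b = rotl32(b ^ c, 12),	\
 *		a += b, d = rotl32(d ^ a,  8),	\
 *		c += d, b = rotl32(b ^ c,  7))
 *
 *	QR(x[0], x[4], x[ 8], x[12]);	// column round: q0..q3 hold rows
 *	QR(x[1], x[5], x[ 9], x[13]);	// 0..3, so all four columns are
 *	QR(x[2], x[6], x[10], x[14]);	// handled by one set of vector ops
 *	QR(x[3], x[7], x[11], x[15]);
 *
 *	QR(x[0], x[5], x[10], x[15]);	// diagonal round: reached by rotating
 *	QR(x[1], x[6], x[11], x[12]);	// the words of q1, q2 and q3 with
 *	QR(x[2], x[7], x[ 8], x[13]);	// vext and undoing the rotation
 *	QR(x[3], x[4], x[ 9], x[14]);	// afterwards
 */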
chacha_permute:

	adr		ip, .Lrol8_table
	vld1.8		{d10}, [ip, :64]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vtbl.8		d6, {d6}, d10
	vtbl.8		d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vext.8		q1, q1, q1, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vext.8		q3, q3, q3, #12
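	// The three rotations above move the matrix diagonals into the
	// columns, so the second half of the double round below can reuse the
	// same column-wise quarter-round sequence.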

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vtbl.8		d6, {d6}, d10
	vtbl.8		d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vext.8		q1, q1, q1, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vext.8		q3, q3, q3, #4

	subs		r3, r3, #2
	bne		.Ldoubleround

	bx		lr
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
	// r0: Input state matrix, s
	// r1: 1 data block output, o
	// r2: 1 data block input, i
	// r3: nrounds
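	//
	// Computes o = i ^ (permute(s) + s) for a single 64-byte block.  The
	// state at [r0] is only read, never written back; advancing the block
	// counter is left to the caller.
	//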
	push		{lr}

	// x0..3 = s0..3
	add		ip, r0, #0x20
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	vmov		q8, q0
	vmov		q9, q1
	vmov		q10, q2
	vmov		q11, q3

	bl		chacha_permute

	add		ip, r2, #0x20
	vld1.8		{q4-q5}, [r2]
	vld1.8		{q6-q7}, [ip]

	// o0 = i0 ^ (x0 + s0)
	vadd.i32	q0, q0, q8
	veor		q0, q0, q4

	// o1 = i1 ^ (x1 + s1)
	vadd.i32	q1, q1, q9
	veor		q1, q1, q5

	// o2 = i2 ^ (x2 + s2)
	vadd.i32	q2, q2, q10
	veor		q2, q2, q6

	// o3 = i3 ^ (x3 + s3)
	vadd.i32	q3, q3, q11
	veor		q3, q3, q7

	add		ip, r1, #0x20
	vst1.8		{q0-q1}, [r1]
	vst1.8		{q2-q3}, [ip]

	pop		{pc}
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// r0: Input state matrix, s
	// r1: output (8 32-bit words)
	// r2: nrounds
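	//
	// HChaCha: run the permutation over the state and return only words
	// 0..3 and 12..15 of the result.  Unlike the full block function, the
	// original state is not added back in (no feed-forward).
	//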
	push		{lr}

	vld1.32		{q0-q1}, [r0]!
	vld1.32		{q2-q3}, [r0]

	mov		r3, r2
	bl		chacha_permute

	vst1.32		{q0}, [r1]!
	vst1.32		{q3}, [r1]

	pop		{pc}
ENDPROC(hchacha_block_neon)

	.align		4
.Lctrinc:	.word	0, 1, 2, 3			// per-block counter offsets
.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6		// vtbl indices for rotl32(x, 8)

	.align		5
ENTRY(chacha_4block_xor_neon)
	push		{r4-r5}
	mov		r4, sp			// preserve the stack pointer
	sub		ip, sp, #0x20		// allocate a 32 byte buffer
	bic		ip, ip, #0x1f		// aligned to 32 bytes
	mov		sp, ip
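	// The 32-byte alignment allows the x8/x9 spill slots below to be
	// accessed with vld1/vst1 using the :256 alignment hint.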

	// r0: Input state matrix, s
	// r1: 4 data blocks output, o
	// r2: 4 data blocks input, i
	// r3: nrounds

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix in NEON registers four times. The algorithm performs
	// each operation on the corresponding word of each state matrix, hence
	// requires no word shuffling. The words are re-interleaved before the
	// final addition of the original state and the XORing step.
	//
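	// Register use: lane j (j = 0..3) of q0-q15 holds word 0-15 of block
	// j.  q8 and q9 are also needed as temporaries, so x8 and x9 are kept
	// in the 32-byte stack buffer for most of the round loop.
	//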

	// x0..15[0-3] = s0..15[0-3]
	add		ip, r0, #0x20
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	adr		r5, .Lctrinc
	vdup.32		q15, d7[1]
	vdup.32		q14, d7[0]
	vld1.32		{q4}, [r5, :128]
	vdup.32		q13, d6[1]
	vdup.32		q12, d6[0]
	vdup.32		q11, d5[1]
	vdup.32		q10, d5[0]
	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
	vdup.32		q9, d4[1]
	vdup.32		q8, d4[0]
	vdup.32		q7, d3[1]
	vdup.32		q6, d3[0]
	vdup.32		q5, d2[1]
	vdup.32		q4, d2[0]
	vdup.32		q3, d1[1]
	vdup.32		q2, d1[0]
	vdup.32		q1, d0[1]
	vdup.32		q0, d0[0]

	adr		ip, .Lrol8_table
	b		1f
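	// On the first iteration x8 and x9 are still live in q8/q9 and the
	// stack buffer has not been written yet, so skip the reload.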

.Ldoubleround4:
	vld1.32		{q8-q9}, [sp, :256]
1:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor		q12, q12, q0
	veor		q13, q13, q1
	veor		q14, q14, q2
	veor		q15, q15, q3

	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14
	vrev32.16	q15, q15

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q4, q8
	veor		q9, q5, q9
	vshl.u32	q4, q8, #12
	vshl.u32	q5, q9, #12
	vsri.u32	q4, q8, #20
	vsri.u32	q5, q9, #20

	veor		q8, q6, q10
	veor		q9, q7, q11
	vshl.u32	q6, q8, #12
	vshl.u32	q7, q9, #12
	vsri.u32	q6, q8, #20
	vsri.u32	q7, q9, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor		q12, q12, q0
	veor		q13, q13, q1
	veor		q14, q14, q2
	veor		q15, q15, q3

	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16
	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q4, q8
	veor		q9, q5, q9
	vshl.u32	q4, q8, #7
	vshl.u32	q5, q9, #7
	vsri.u32	q4, q8, #25
	vsri.u32	q5, q9, #25

	veor		q8, q6, q10
	veor		q9, q7, q11
	vshl.u32	q6, q8, #7
	vshl.u32	q7, q9, #7
	vsri.u32	q6, q8, #25
	vsri.u32	q7, q9, #25

	vld1.32		{q8-q9}, [sp, :256]

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor		q15, q15, q0
	veor		q12, q12, q1
	veor		q13, q13, q2
	veor		q14, q14, q3

	vrev32.16	q15, q15
	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q7, q8
	veor		q9, q4, q9
	vshl.u32	q7, q8, #12
	vshl.u32	q4, q9, #12
	vsri.u32	q7, q8, #20
	vsri.u32	q4, q9, #20

	veor		q8, q5, q10
	veor		q9, q6, q11
	vshl.u32	q5, q8, #12
	vshl.u32	q6, q9, #12
	vsri.u32	q5, q8, #20
	vsri.u32	q6, q9, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor		q15, q15, q0
	veor		q12, q12, q1
	veor		q13, q13, q2
	veor		q14, q14, q3

	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16
	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q7, q8
	veor		q9, q4, q9
	vshl.u32	q7, q8, #7
	vshl.u32	q4, q9, #7
	vsri.u32	q7, q8, #25
	vsri.u32	q4, q9, #25

	veor		q8, q5, q10
	veor		q9, q6, q11
	vshl.u32	q5, q8, #7
	vshl.u32	q6, q9, #7
	vsri.u32	q5, q8, #25
	vsri.u32	q6, q9, #25

	subs		r3, r3, #2
	bne		.Ldoubleround4

	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
	// x8..9[0-3] are on the stack.

	// Re-interleave the words in the first two rows of each block (x0..7).
	// Also add the counter values 0-3 to x12[0-3].
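	// The vzip.32/vswp sequence is a 4x4 word transpose: afterwards each
	// q register holds one full row (four consecutive words) of a single
	// block instead of one word from each of the four blocks.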
	  vld1.32	{q8}, [r5, :128]	// load counter values 0-3
	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
	  vadd.u32	q12, q8			// x12 += counter values 0-3
	vswp		d1, d4
	vswp		d3, d6
	  vld1.32	{q8-q9}, [r0]!		// load s0..7
	vswp		d9, d12
	vswp		d11, d14

	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
	// after XORing the first 32 bytes.
	vswp		q1, q4

	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)

	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
	vadd.u32	q0, q0, q8
	vadd.u32	q2, q2, q8
	vadd.u32	q4, q4, q8
	vadd.u32	q3, q3, q8

	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
	vadd.u32	q1, q1, q9
	vadd.u32	q6, q6, q9
	vadd.u32	q5, q5, q9
	vadd.u32	q7, q7, q9

	// XOR first 32 bytes using keystream from first two rows of first block
	vld1.8		{q8-q9}, [r2]!
	veor		q8, q8, q0
	veor		q9, q9, q1
	vst1.8		{q8-q9}, [r1]!

	// Re-interleave the words in the last two rows of each block (x8..15).
	vld1.32		{q8-q9}, [sp, :256]
	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
	  vld1.32	{q0-q1}, [r0]	// load s8..15
	vswp		d25, d28
	vswp		d27, d30
	vswp		d17, d20
	vswp		d19, d22

	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)

	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
	vadd.u32	q8,  q8,  q0
	vadd.u32	q10, q10, q0
	vadd.u32	q9,  q9,  q0
	vadd.u32	q11, q11, q0

	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
	vadd.u32	q12, q12, q1
	vadd.u32	q14, q14, q1
	vadd.u32	q13, q13, q1
	vadd.u32	q15, q15, q1

	// XOR the rest of the data with the keystream

	vld1.8		{q0-q1}, [r2]!
	veor		q0, q0, q8
	veor		q1, q1, q12
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	veor		q0, q0, q2
	veor		q1, q1, q6
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	veor		q0, q0, q10
	veor		q1, q1, q14
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	veor		q0, q0, q4
	veor		q1, q1, q5
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	veor		q0, q0, q9
	veor		q1, q1, q13
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	veor		q0, q0, q3
	veor		q1, q1, q7
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]
	  mov		sp, r4		// restore original stack pointer
	veor		q0, q0, q11
	veor		q1, q1, q15
	vst1.8		{q0-q1}, [r1]

	pop		{r4-r5}
	bx		lr
ENDPROC(chacha_4block_xor_neon)