/* SPDX-License-Identifier: GPL-2.0-or-later */
/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * Source path: /OK3568_Linux_fs/kernel/arch/arm/crypto/sha1-armv7-neon.S
 * (revision 4882a59341e53eb6f0b4789bf948001014eff981)
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Unified ARM/Thumb-2 assembler syntax; enable the NEON instruction set. */
.syntax unified
.fpu neon

.text


/*
 * Context structure
 *
 * Byte offsets of the five 32-bit chaining words h0..h4, measured from the
 * pointer passed in r0.  The transform loads and stores the five words as
 * one contiguous run (ldm/stm); only state_h4 is referenced by name.
 */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16


/*
 * Constants
 *
 * K1..K4 are the SHA-1 round constants.  The transform adds them to the
 * message schedule ahead of time: K1 to W[0..19], K2 to W[20..39], K3 to
 * W[40..59] and K4 to W[60..79].
 */

#define K1  0x5A827999
#define K2  0x6ED9EBA1
#define K3  0x8F1BBCDC
#define K4  0xCA62C1D6

/*
 * Each constant is replicated into four 32-bit lanes so that one vadd.u32
 * adds K to four schedule words at once.  The four 16-byte vectors are
 * contiguous: the transform loads them as two register pairs starting at
 * .LK_VEC.
 */
.align 4
.LK_VEC:
.LK1:	.long K1, K1, K1, K1
.LK2:	.long K2, K2, K2, K2
.LK3:	.long K3, K3, K3, K3
.LK4:	.long K4, K4, K4, K4


/* Register macros */

/*
 * RSTATE    - pointer to the chaining words (first argument)
 * RDATA     - input pointer, advanced 32 bytes by each block-load instruction
 * RNBLKS    - number of blocks still to process
 * ROLDSTACK - stack pointer value from before the W+K buffer was carved out
 * RWK       - store pointer into the W+K stack buffer (lr is free for this
 *             because it is pushed on entry and popped straight into pc)
 */
#define RSTATE r0
#define RDATA r1
#define RNBLKS r2
#define ROLDSTACK r3
#define RWK lr

/* The five SHA-1 working variables; their roles rotate from round to round. */
#define _a r4
#define _b r5
#define _c r6
#define _d r7
#define _e r8

/* Scalar scratch registers used by the round macros and the state update. */
#define RT0 r9
#define RT1 r10
#define RT2 r11
#define RT3 r12

/*
 * Rolling window of eight schedule vectors (four 32-bit words each).
 *
 * The mapping to q registers is deliberately not sequential: the block load
 * uses the two-register lists {W0, W7} and {W6, W5}, and such a list has to
 * name adjacent registers, hence W0/W7 = q0/q1 and W6/W5 = q5/q6.
 */
#define W0 q0
#define W1 q7
#define W2 q2
#define W3 q3
#define W4 q4
#define W5 q6
#define W6 q5
#define W7 q1

/* NEON scratch registers for the schedule computation. */
#define tmp0 q8
#define tmp1 q9
#define tmp2 q10
#define tmp3 q11

/* Round constants, one replicated vector per constant (see .LK_VEC). */
#define qK1 q12
#define qK2 q13
#define qK3 q14
#define qK4 q15

/*
 * ARM_LE(code) emits `code` only when CONFIG_CPU_BIG_ENDIAN is not defined
 * and expands to nothing otherwise.  It guards the vrev32.8 byte swaps that
 * are applied to freshly loaded message words.
 */
#ifdef CONFIG_CPU_BIG_ENDIAN
#define ARM_LE(code...)
#else
#define ARM_LE(code...)		code
#endif

/* Round function macros. */

/*
 * Byte offset from sp of the precomputed W[i] + K word for round i.  The
 * stack buffer holds sixteen 32-bit slots and is indexed modulo 16.
 */
#define WK_offs(i) (((i) & 15) * 4)

/*
 * _R_F1 - one SHA-1 round with the "choose" function, interleaved with up to
 * three message-schedule fragments:
 *
 *   e += rol(a, 5) + ((d & ~b) + (c & b)) + WK[i];   b = rol(b, 30)
 *
 * The two masked terms can never have the same bit set, so adding them gives
 * the same result as OR-ing them.  WK[i] is read from the stack buffer.
 *
 * pre1/pre2/pre3 are macro names; each is expanded with the arguments
 * (i16, W, W_m04 .. W_m28) between the scalar instructions.  Pass `dummy`
 * for an unused slot.  Scratch registers: RT0, RT1, RT3.
 *
 * The body expands onto a single line, using `;` as statement separator.
 */
#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	bic RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	and RT1, c, b; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add RT0, RT0, RT3; \
	add e, e, RT1; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0;

/*
 * _R_F2 - one SHA-1 round with the "parity" function, interleaved with up to
 * three message-schedule fragments:
 *
 *   e += rol(a, 5) + (b ^ c ^ d) + WK[i];   b = rol(b, 30)
 *
 * WK[i] is read from the stack buffer.  pre1/pre2/pre3 are macro names, each
 * expanded with (i16, W, W_m04 .. W_m28); pass `dummy` for an unused slot.
 * Scratch registers: RT0, RT3.
 *
 * The last body line carries no continuation backslash, so the definition
 * cannot absorb whatever line happens to follow it.
 */
#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	eor RT0, RT0, c; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT3; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0;

/*
 * _R_F3 - one SHA-1 round with the "majority" function, interleaved with up
 * to three message-schedule fragments:
 *
 *   e += rol(a, 5) + (((b ^ c) & d) + (b & c)) + WK[i];   b = rol(b, 30)
 *
 * (b ^ c) and (b & c) never share a set bit, so the sum of the two terms is
 * the bitwise majority of b, c and d.  WK[i] is read from the stack buffer.
 * pre1/pre2/pre3 are macro names, each expanded with (i16, W, W_m04 ..
 * W_m28); pass `dummy` for an unused slot.  Scratch registers: RT0, RT1, RT3.
 */
#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, b, c; \
	and RT1, b, c; \
	add e, e, a, ror #(32 - 5); \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	and RT0, RT0, d; \
	add RT1, RT1, RT3; \
	add e, e, RT0; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT1;

/* _R_F4 - the fourth round function is the same parity function as _R_F2. */
#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

/*
 * _R - round with interleaved precalculation.  `f` is one of F1..F4 and is
 * token-pasted onto `_R_` to select the round macro; all other arguments are
 * forwarded unchanged.
 */
#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

/*
 * R - plain round without precalculation.  All three interleave slots are
 * filled with `dummy`; the trailing i16/W... names are only placeholders
 * and never reach the output because `dummy` discards its arguments.
 */
#define R(a,b,c,d,e,f,i) \
	_R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

/* dummy - accepts any arguments and expands to nothing. */
#define dummy(...)


/* Input expansion macros. */

/********* Precalc macros for rounds 0-15 *************************************/

/*
 * W_PRECALC_00_15 - load one 64-byte block and prepare W[0..15] + curK.
 *
 * Reads 64 bytes from RDATA (post-incrementing it) into W0, W7, W6, W5 in
 * that order, byte-swaps every 32-bit word on little-endian builds, adds
 * curK to each vector and stores the four sums to the 64-byte buffer at sp.
 * curK is whatever the expansion site has #defined.
 * Clobbers RWK and tmp0..tmp3.
 */
#define W_PRECALC_00_15() \
	add       RWK, sp, #(WK_offs(0));			\
	\
	vld1.32   {W0, W7}, [RDATA]!;				\
 ARM_LE(vrev32.8  W0, W0;	)	/* big => little */	\
	vld1.32   {W6, W5}, [RDATA]!;				\
	vadd.u32  tmp0, W0, curK;				\
 ARM_LE(vrev32.8  W7, W7;	)	/* big => little */	\
 ARM_LE(vrev32.8  W6, W6;	)	/* big => little */	\
	vadd.u32  tmp1, W7, curK;				\
 ARM_LE(vrev32.8  W5, W5;	)	/* big => little */	\
	vadd.u32  tmp2, W6, curK;				\
	vst1.32   {tmp0, tmp1}, [RWK]!;				\
	vadd.u32  tmp3, W5, curK;				\
	vst1.32   {tmp2, tmp3}, [RWK];

/*
 * WPRECALC_00_15_0 .. _12 - the block load / byte swap / add-K / store
 * sequence for W[0..15], cut into thirteen single-instruction fragments so
 * that it can be slotted into the `pre` positions of the round macros.
 *
 * Fragments must be expanded in ascending order: later ones consume the
 * registers (W0/W7/W6/W5, tmp0..tmp3, RWK) produced by earlier ones.  The
 * parameter list only matches the calling convention of the round macros;
 * none of the parameters is used.  curK is taken from the expansion site.
 */
#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32   {W0, W7}, [RDATA]!;

#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(0));

#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W0, W0;	)	/* big => little */

#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32   {W6, W5}, [RDATA]!;

#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W0, curK;

#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W7, W7;	)	/* big => little */

#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W6, W6;	)	/* big => little */

#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp1, W7, curK;

#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W5, W5;	)	/* big => little */

#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp2, W6, curK;

#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0, tmp1}, [RWK]!;

#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp3, W5, curK;

#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp2, tmp3}, [RWK];


/********* Precalc macros for rounds 16-31 ************************************/

/*
 * WPRECALC_16_31_0 .. _11 - compute four schedule words W[i..i+3] with
 *
 *   W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)
 *
 * and store W[i..i+3] + curK to the stack buffer slot for round i.
 *
 * Arguments: i is the index of the first word produced; W is the destination
 * vector; W_m04, W_m08, W_m12, W_m16 are the vectors holding the words 4, 8,
 * 12 and 16 positions earlier.  W_m20..W_m28 are unused here.
 *
 * Fragments must be expanded in ascending order.  Outline:
 *   _0.._3   W    = {upper half of W_m16, lower half of W_m12}	(t-14)
 *                   ^ W_m08					(t-8)
 *                   ^ W_m16					(t-16)
 *                   ^ {lanes 1..3 of W_m04, 0}			(t-3)
 *            The top lane has no t-3 term yet: that term is W[i], which is
 *            being produced by lane 0 of this very vector.
 *   _4.._6   tmp0 = rol(W, 1); tmp1 = {0, 0, 0, lane 0 of W before rotation}
 *   _6.._9   W    = tmp0 ^ rol(tmp1, 2), which supplies the missing
 *            rol(W[i], 1) term to the top lane.
 *   _10,_11  tmp0 = W + curK, stored at RWK = sp + WK_offs(i).
 *
 * Clobbers tmp0, tmp1 and RWK.  curK is taken from the expansion site.
 */
#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0;			\
	vext.8    W, W_m16, W_m12, #8;

#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(i));	\
	vext.8    tmp0, W_m04, tmp0, #4;

#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0, W_m16;		\
	veor.32   W, W, W_m08;

#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp1, tmp1;			\
	veor      W, W, tmp0;

#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp0, W, #1;

#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8    tmp1, tmp1, W, #(16-12);	\
	vshr.u32  W, W, #31;

#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr      tmp0, tmp0, W;		\
	vshr.u32  W, tmp1, #30;

#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp1, tmp1, #2;

#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0, W;

#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, tmp0, tmp1;

#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W, curK;

#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0}, [RWK];


/********* Precalc macros for rounds 32-79 ************************************/

/*
 * WPRECALC_32_79_0 .. _9 - compute four schedule words W[i..i+3] with
 *
 *   W[t] = rol(W[t-6] ^ W[t-16] ^ W[t-28] ^ W[t-32], 2)
 *
 * and store W[i..i+3] + curK to the stack buffer.  No term of this form is
 * closer than six words back, so all four lanes are computed independently.
 *
 * Arguments: i is the index of the first word produced.  W is both an input
 * and the destination: with eight vectors in rotation it still holds the
 * words from 32 positions earlier when _0 runs.  W_m04, W_m08, W_m16 and
 * W_m28 hold the words 4, 8, 16 and 28 positions earlier; the t-6 term is
 * assembled from the upper half of W_m08 and the lower half of W_m04.
 * W_m12, W_m20 and W_m24 are unused.
 *
 * Fragments must be expanded in ascending order.  The store address is
 * sp + WK_offs(i & ~3).  Clobbers tmp0, tmp1 and RWK.  curK is taken from
 * the expansion site.
 */
#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, W_m28;

#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8 tmp0, W_m08, W_m04, #8;

#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, W_m16;

#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, tmp0;

#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add RWK, sp, #(WK_offs(i&~3));

#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32 tmp1, W, #2;

#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshr.u32 tmp0, W, #30;

#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr W, tmp0, tmp1;

#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp0, W, curK;

#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp0}, [RWK];


/*
 * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
 *
 * Prototype as originally documented:
 *
 * unsigned int
 * sha1_transform_neon (void *ctx, const unsigned char *data,
 *                      unsigned int nblks)
 *
 * Note that nothing in this routine ever writes r0, so despite the
 * `unsigned int` above no meaningful value is returned (r0 still holds ctx
 * on exit); callers should treat the routine as returning void.
 *
 * Behaviour:
 *  - nblks == 0 returns immediately without touching memory or registers.
 *  - Otherwise the five chaining words at ctx are read once, updated in
 *    memory after every block, and exactly nblks*64 bytes are read from data.
 *
 * Register and stack usage:
 *  - r4-r12 and lr are pushed on entry and restored on exit.
 *  - r1, r2, r3 and the flags are clobbered.
 *  - q0-q15 are clobbered.  q4-q7 are NOT preserved: the vpush/vpop pair is
 *    commented out.  NOTE(review): presumably callers bracket the call with
 *    code that saves the whole NEON register file - confirm.
 *  - A 64-byte, 16-byte-aligned buffer below the saved registers holds the
 *    sixteen most recent W[t] + K words; the original sp is kept in
 *    ROLDSTACK (r3) and restored before returning.
 *
 * Scheduling: the NEON message-schedule fragments for rounds t+16 are
 * interleaved with the scalar rounds t, and the load of the next block is
 * interleaved with rounds 64-79 of the current one.
 */
.align 3
ENTRY(sha1_transform_neon)
  /* input:
   *	r0: ctx, CTX
   *	r1: data (64*nblks bytes)
   *	r2: nblks
   */

  cmp RNBLKS, #0;
  beq .Ldo_nothing;

  push {r4-r12, lr};
  /*vpush {q4-q7};*/

  adr RT3, .LK_VEC;

  mov ROLDSTACK, sp;

  /* Align stack. */
  /* Reserve 64 bytes for the W+K buffer and round sp down to 16 bytes. */
  sub RT0, sp, #(16*4);
  and RT0, #(~(16-1));
  mov sp, RT0;

  vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */

  /* Get the values of the chaining variables. */
  ldm RSTATE, {_a-_e};

  vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */

#undef curK
#define curK qK1
  /* Precalc 0-15. */
  W_PRECALC_00_15();

  /*
   * One iteration per block.  On entry _a.._e hold the current chaining
   * values and the stack buffer holds W[0..15] + K1 for this block.
   * curK always names the constant for the words being precalculated,
   * which run 16 ahead of the round being executed.
   */
.Loop:
  /* Transform 0-15 + Precalc 16-31. */
  _R( _a, _b, _c, _d, _e, F1,  0,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  1,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  2,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1,  3,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
      W4, W5, W6, W7, W0, _, _, _ );

#undef curK
#define curK qK2
  _R( _b, _c, _d, _e, _a, F1,  4,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1,  5,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  6,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  7,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
      W3, W4, W5, W6, W7, _, _, _ );

  _R( _c, _d, _e, _a, _b, F1,  8,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1,  9,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 10,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1, 11,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
      W2, W3, W4, W5, W6, _, _, _ );

  _R( _d, _e, _a, _b, _c, F1, 12,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1, 13,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1, 14,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 15,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
      W1, W2, W3, W4, W5, _, _, _ );

  /* Transform 16-63 + Precalc 32-79. */
  _R( _e, _a, _b, _c, _d, F1, 16,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _d, _e, _a, _b, _c, F1, 17,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _c, _d, _e, _a, _b, F1, 18,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F1, 19,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _a, _b, _c, _d, _e, F2, 20,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _e, _a, _b, _c, _d, F2, 21,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _d, _e, _a, _b, _c, F2, 22,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F2, 23,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);

#undef curK
#define curK qK3
  _R( _b, _c, _d, _e, _a, F2, 24,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _a, _b, _c, _d, _e, F2, 25,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _e, _a, _b, _c, _d, F2, 26,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F2, 27,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);

  _R( _c, _d, _e, _a, _b, F2, 28,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _b, _c, _d, _e, _a, F2, 29,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _a, _b, _c, _d, _e, F2, 30,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F2, 31,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);

  _R( _d, _e, _a, _b, _c, F2, 32,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _c, _d, _e, _a, _b, F2, 33,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _b, _c, _d, _e, _a, F2, 34,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _a, _b, _c, _d, _e, F2, 35,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);

  _R( _e, _a, _b, _c, _d, F2, 36,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _d, _e, _a, _b, _c, F2, 37,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _c, _d, _e, _a, _b, F2, 38,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _b, _c, _d, _e, _a, F2, 39,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);

  _R( _a, _b, _c, _d, _e, F3, 40,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _e, _a, _b, _c, _d, F3, 41,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _d, _e, _a, _b, _c, F3, 42,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _c, _d, _e, _a, _b, F3, 43,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);

#undef curK
#define curK qK4
  _R( _b, _c, _d, _e, _a, F3, 44,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _a, _b, _c, _d, _e, F3, 45,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _e, _a, _b, _c, _d, F3, 46,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _d, _e, _a, _b, _c, F3, 47,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);

  _R( _c, _d, _e, _a, _b, F3, 48,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F3, 49,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _a, _b, _c, _d, _e, F3, 50,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _e, _a, _b, _c, _d, F3, 51,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _d, _e, _a, _b, _c, F3, 52,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F3, 53,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _b, _c, _d, _e, _a, F3, 54,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _a, _b, _c, _d, _e, F3, 55,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);

  _R( _e, _a, _b, _c, _d, F3, 56,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F3, 57,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _c, _d, _e, _a, _b, F3, 58,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _b, _c, _d, _e, _a, F3, 59,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);

  /*
   * Count the block down now; the Z flag is consumed by the `beq .Lend`
   * after round 63.  Rounds 60-63 must therefore not modify the flags
   * (none of the instructions they expand to carries an `s` suffix).
   */
  subs RNBLKS, #1;

  _R( _a, _b, _c, _d, _e, F4, 60,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F4, 61,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _d, _e, _a, _b, _c, F4, 62,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _c, _d, _e, _a, _b, F4, 63,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);

  /* Last block: finish without reading past the end of the input. */
  beq .Lend;

  /* Transform 64-79 + Precalc 0-15 of next block. */
#undef curK
#define curK qK1
  _R( _b, _c, _d, _e, _a, F4, 64,
      WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 65,
      WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 66,
      WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 67,
      WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _c, _d, _e, _a, _b, F4, 68,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 69,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 70,
      WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 71,
      WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _d, _e, _a, _b, _c, F4, 72,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 73,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 74,
      WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 75,
      WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _e, _a, _b, _c, _d, F4, 76,
      WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 77,
      WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  /*
   * The two stores of the next block's W+K words overwrite slots 0-7 and
   * 8-15.  The first is issued in round 78, which reads slot 14; the second
   * is issued in round 79 after that round has loaded slot 15.
   */
  _R( _c, _d, _e, _a, _b, F4, 78,
      WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 79,
      WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );

  /* Update the chaining variables. */
  /* The sums stay in _a.._e and seed the next iteration. */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  add _e, RT0;
  stm RSTATE, {_a-_e};

  b .Loop;

.Lend:
  /* Transform 64-79 */
  R( _b, _c, _d, _e, _a, F4, 64 );
  R( _a, _b, _c, _d, _e, F4, 65 );
  R( _e, _a, _b, _c, _d, F4, 66 );
  R( _d, _e, _a, _b, _c, F4, 67 );
  R( _c, _d, _e, _a, _b, F4, 68 );
  R( _b, _c, _d, _e, _a, F4, 69 );
  R( _a, _b, _c, _d, _e, F4, 70 );
  R( _e, _a, _b, _c, _d, F4, 71 );
  R( _d, _e, _a, _b, _c, F4, 72 );
  R( _c, _d, _e, _a, _b, F4, 73 );
  R( _b, _c, _d, _e, _a, F4, 74 );
  R( _a, _b, _c, _d, _e, F4, 75 );
  R( _e, _a, _b, _c, _d, F4, 76 );
  R( _d, _e, _a, _b, _c, F4, 77 );
  R( _c, _d, _e, _a, _b, F4, 78 );
  R( _b, _c, _d, _e, _a, F4, 79 );

  /* The W+K buffer is no longer read; release it. */
  mov sp, ROLDSTACK;

  /* Update the chaining variables. */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  /*vpop {q4-q7};*/
  add _e, RT0;
  stm RSTATE, {_a-_e};

  /* Restore the saved registers; the saved lr is loaded into pc. */
  pop {r4-r12, pc};

.Ldo_nothing:
  bx lr
ENDPROC(sha1_transform_neon)