xref: /OK3568_Linux_fs/kernel/arch/powerpc/crypto/sha1-spe-asm.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * Fast SHA-1 implementation for SPE instruction set (PPC)
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * This code makes use of the SPE SIMD instruction set as defined in
6*4882a593Smuzhiyun * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
7*4882a593Smuzhiyun * Implementation is based on optimization guide notes from
8*4882a593Smuzhiyun * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
9*4882a593Smuzhiyun *
10*4882a593Smuzhiyun * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
11*4882a593Smuzhiyun */
12*4882a593Smuzhiyun
13*4882a593Smuzhiyun#include <asm/ppc_asm.h>
14*4882a593Smuzhiyun#include <asm/asm-offsets.h>
15*4882a593Smuzhiyun
/*
 * Register roles used by the macros and by ppc_spe_sha1_transform.
 *
 * rHP and rWP serve as base addresses for the hash state and the input
 * data. r5 is first consumed as the loop count (mtctr r5) and only then
 * reloaded, under the name rKP, with the address of PPC_SPE_SHA1_K.
 *
 * rW0-rW7, rT0 and rK (r14-r23) are the registers that INITIALIZE saves
 * and FINALIZE restores as full 64 bit values.
 */
16*4882a593Smuzhiyun#define rHP	r3	/* pointer to hash value			*/
17*4882a593Smuzhiyun#define rWP	r4	/* pointer to input				*/
18*4882a593Smuzhiyun#define rKP	r5	/* pointer to constants				*/
19*4882a593Smuzhiyun
20*4882a593Smuzhiyun#define rW0	r14	/* 64 bit round words				*/
21*4882a593Smuzhiyun#define rW1	r15
22*4882a593Smuzhiyun#define rW2	r16
23*4882a593Smuzhiyun#define rW3	r17
24*4882a593Smuzhiyun#define rW4	r18
25*4882a593Smuzhiyun#define rW5	r19
26*4882a593Smuzhiyun#define rW6	r20
27*4882a593Smuzhiyun#define rW7	r21
28*4882a593Smuzhiyun
29*4882a593Smuzhiyun#define rH0	r6	/* 32 bit hash values 				*/
30*4882a593Smuzhiyun#define rH1	r7
31*4882a593Smuzhiyun#define rH2	r8
32*4882a593Smuzhiyun#define rH3	r9
33*4882a593Smuzhiyun#define rH4	r10
34*4882a593Smuzhiyun
35*4882a593Smuzhiyun#define rT0	r22	/* 64 bit temporary				*/
36*4882a593Smuzhiyun#define rT1	r0	/* 32 bit temporaries				*/
37*4882a593Smuzhiyun#define rT2	r11
38*4882a593Smuzhiyun#define rT3	r12
39*4882a593Smuzhiyun
40*4882a593Smuzhiyun#define rK	r23	/* 64 bit constant in non volatile register	*/
41*4882a593Smuzhiyun
/*
 * Round constant loaders. The round macros pick one of these through
 * their "k" argument (LOAD_K##k##1):
 *   k = 0     expands to nothing, rK keeps its current value
 *   k = 1..4  load the 32 bit word at offset 0/4/8/12 from rKP and
 *             replicate it into both halves of rK (evlwwsplat)
 */
42*4882a593Smuzhiyun#define LOAD_K01
43*4882a593Smuzhiyun
44*4882a593Smuzhiyun#define LOAD_K11 \
45*4882a593Smuzhiyun	evlwwsplat	rK,0(rKP);
46*4882a593Smuzhiyun
47*4882a593Smuzhiyun#define LOAD_K21 \
48*4882a593Smuzhiyun	evlwwsplat	rK,4(rKP);
49*4882a593Smuzhiyun
50*4882a593Smuzhiyun#define LOAD_K31 \
51*4882a593Smuzhiyun	evlwwsplat	rK,8(rKP);
52*4882a593Smuzhiyun
53*4882a593Smuzhiyun#define LOAD_K41 \
54*4882a593Smuzhiyun	evlwwsplat	rK,12(rKP);
55*4882a593Smuzhiyun
/*
 * INITIALIZE: function prologue.
 * Allocates a 128 byte stack frame (stwu) and stores r14-r23 as full
 * 64 bit values (evstdw) into 8 byte slots at offsets 8..80(r1).
 * These are the registers aliased as rW0-rW7, rT0 and rK.
 * Must be paired with FINALIZE, which uses the same slot layout.
 */
56*4882a593Smuzhiyun#define INITIALIZE \
57*4882a593Smuzhiyun	stwu		r1,-128(r1);	/* create stack frame		*/ \
58*4882a593Smuzhiyun	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
59*4882a593Smuzhiyun	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
60*4882a593Smuzhiyun	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
61*4882a593Smuzhiyun	evstdw		r17,32(r1);					   \
62*4882a593Smuzhiyun	evstdw		r18,40(r1);					   \
63*4882a593Smuzhiyun	evstdw		r19,48(r1);					   \
64*4882a593Smuzhiyun	evstdw		r20,56(r1);					   \
65*4882a593Smuzhiyun	evstdw		r21,64(r1);					   \
66*4882a593Smuzhiyun	evstdw		r22,72(r1);					   \
67*4882a593Smuzhiyun	evstdw		r23,80(r1);
68*4882a593Smuzhiyun
69*4882a593Smuzhiyun
/*
 * FINALIZE: function epilogue, counterpart of INITIALIZE.
 *  1. reload r14-r23 (64 bit, evldw) from offsets 8..80(r1)
 *  2. zero r0 and store it to offsets 8,16,...,80(r1), i.e. over the
 *     first word of every 8 byte save slot
 *  3. release the 128 byte frame
 * r0 is left as zero on exit.
 */
70*4882a593Smuzhiyun#define FINALIZE \
71*4882a593Smuzhiyun	evldw		r14,8(r1);	/* restore SPE registers	*/ \
72*4882a593Smuzhiyun	evldw		r15,16(r1);					   \
73*4882a593Smuzhiyun	evldw		r16,24(r1);					   \
74*4882a593Smuzhiyun	evldw		r17,32(r1);					   \
75*4882a593Smuzhiyun	evldw		r18,40(r1);					   \
76*4882a593Smuzhiyun	evldw		r19,48(r1);					   \
77*4882a593Smuzhiyun	evldw		r20,56(r1);					   \
78*4882a593Smuzhiyun	evldw		r21,64(r1);					   \
79*4882a593Smuzhiyun	evldw		r22,72(r1);					   \
80*4882a593Smuzhiyun	evldw		r23,80(r1);					   \
81*4882a593Smuzhiyun	xor		r0,r0,r0;					   \
82*4882a593Smuzhiyun	stw		r0,8(r1);	/* Delete sensitive data	*/ \
83*4882a593Smuzhiyun	stw		r0,16(r1);	/* that we might have pushed	*/ \
84*4882a593Smuzhiyun	stw		r0,24(r1);	/* from other context that runs	*/ \
85*4882a593Smuzhiyun	stw		r0,32(r1);	/* the same code. Assume that	*/ \
86*4882a593Smuzhiyun	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
87*4882a593Smuzhiyun	stw		r0,48(r1);	/* were already overwritten on	*/ \
88*4882a593Smuzhiyun	stw		r0,56(r1);	/* the way down to here		*/ \
89*4882a593Smuzhiyun	stw		r0,64(r1);					   \
90*4882a593Smuzhiyun	stw		r0,72(r1);					   \
91*4882a593Smuzhiyun	stw		r0,80(r1);					   \
92*4882a593Smuzhiyun	addi		r1,r1,128;	/* cleanup stack frame		*/
93*4882a593Smuzhiyun
/*
 * LOAD_DATA(reg, off) fetches one 32 bit input word into reg;
 * NEXT_BLOCK is issued once per block after all 16 words were read.
 *
 *  big endian:    plain lwz from off(rWP). rWP is not changed by the
 *                 load; NEXT_BLOCK advances it by 64.
 *  little endian: byte reversed load (lwbrx) from the address in rWP,
 *                 then rWP += 4. "off" is ignored and NEXT_BLOCK is
 *                 empty because rWP has already moved 16 * 4 bytes.
 */
94*4882a593Smuzhiyun#ifdef __BIG_ENDIAN__
95*4882a593Smuzhiyun#define LOAD_DATA(reg, off) \
96*4882a593Smuzhiyun	lwz		reg,off(rWP);	/* load data			*/
97*4882a593Smuzhiyun#define NEXT_BLOCK \
98*4882a593Smuzhiyun	addi		rWP,rWP,64;	/* increment per block		*/
99*4882a593Smuzhiyun#else
100*4882a593Smuzhiyun#define LOAD_DATA(reg, off) \
101*4882a593Smuzhiyun	lwbrx		reg,0,rWP;	/* load data			*/ \
102*4882a593Smuzhiyun	addi		rWP,rWP,4;	/* increment per word		*/
103*4882a593Smuzhiyun#define NEXT_BLOCK			/* nothing to do		*/
104*4882a593Smuzhiyun#endif
105*4882a593Smuzhiyun
/*
 * R_00_15: two interleaved SHA-1 rounds whose message words are loaded
 * directly from the input. Comments prefixed "1:" belong to the first
 * round, "2:" to the second.
 *
 *   a..e    working variables. Round 1 accumulates into e and rotates
 *           b; round 2 accumulates into d and rotates a.
 *   w0, w1  receive the first / second loaded word. On exit w1 holds
 *           both words (evmergelo w1,w1,w0): its own word in the upper
 *           half and w0's word in the lower half. w0 is not modified
 *           after its load.
 *   k       selects LOAD_K<k>1 (0 = keep the current rK)
 *   off     input offset of the first word, the second uses off+4
 *
 * F = (B and C) or (~B and D). Clobbers rT0, rT1 and rT2.
 */
106*4882a593Smuzhiyun#define	R_00_15(a, b, c, d, e, w0, w1, k, off) \
107*4882a593Smuzhiyun	LOAD_DATA(w0, off)		/* 1: W				*/ \
108*4882a593Smuzhiyun	and		rT2,b,c;	/* 1: F' = B and C 		*/ \
109*4882a593Smuzhiyun	LOAD_K##k##1							   \
110*4882a593Smuzhiyun	andc		rT1,d,b;	/* 1: F" = ~B and D 		*/ \
111*4882a593Smuzhiyun	rotrwi		rT0,a,27;	/* 1: A' = A rotl 5		*/ \
112*4882a593Smuzhiyun	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
113*4882a593Smuzhiyun	add		e,e,rT0;	/* 1: E = E + A'		*/ \
114*4882a593Smuzhiyun	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
115*4882a593Smuzhiyun	add		e,e,w0;		/* 1: E = E + W			*/ \
116*4882a593Smuzhiyun	LOAD_DATA(w1, off+4)		/* 2: W				*/ \
117*4882a593Smuzhiyun	add		e,e,rT2;	/* 1: E = E + F			*/ \
118*4882a593Smuzhiyun	and		rT1,a,b;	/* 2: F' = B and C 		*/ \
119*4882a593Smuzhiyun	add		e,e,rK;		/* 1: E = E + K			*/ \
120*4882a593Smuzhiyun	andc		rT2,c,a;	/* 2: F" = ~B and D 		*/ \
121*4882a593Smuzhiyun	add		d,d,rK;		/* 2: E = E + K			*/ \
122*4882a593Smuzhiyun	or		rT2,rT2,rT1;	/* 2: F = F' or F"		*/ \
123*4882a593Smuzhiyun	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
124*4882a593Smuzhiyun	add		d,d,w1;		/* 2: E = E + W			*/ \
125*4882a593Smuzhiyun	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
126*4882a593Smuzhiyun	add		d,d,rT0;	/* 2: E = E + A'		*/ \
127*4882a593Smuzhiyun	evmergelo	w1,w1,w0;	/*    mix W[0]/W[1]		*/ \
128*4882a593Smuzhiyun	add		d,d,rT2		/* 2: E = E + F			*/
129*4882a593Smuzhiyun
/*
 * R_16_19: two interleaved rounds with the same F as R_00_15, but the
 * two message words are computed with 64 bit SPE operations instead of
 * being loaded:
 *
 *   w0  = rotl1(w0 xor merge(w7,w6) xor w4 xor w1)   (both halves)
 *   rT0 = w0 + rK                                     (both halves)
 *
 * The lower half of rT0 feeds round 1 (add e,e,rT0); evmergehi copies
 * its upper half into the lower half of rT1 for round 2 (add d,d,rT1).
 *
 *   a..e         working variables, used as in R_00_15
 *   w0           in: W[-16] pair, out: the newly computed pair
 *   w1, w4       W[-14] and W[-8] pairs
 *   w6, w7       neighbouring pairs that W[-3] is merged from
 *   k            selects LOAD_K<k>1. The load is placed after WK has
 *                been formed, so a new constant only takes effect in
 *                the following macro invocation.
 *
 * Clobbers rT0, rT1 and rT2.
 */
130*4882a593Smuzhiyun#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
131*4882a593Smuzhiyun	and		rT2,b,c;	/* 1: F' = B and C 		*/ \
132*4882a593Smuzhiyun	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
133*4882a593Smuzhiyun	andc		rT1,d,b;	/* 1: F" = ~B and D 		*/ \
134*4882a593Smuzhiyun	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
135*4882a593Smuzhiyun	or		rT1,rT1,rT2;	/* 1: F = F' or F"		*/ \
136*4882a593Smuzhiyun	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
137*4882a593Smuzhiyun	add		e,e,rT1;	/* 1: E = E + F			*/ \
138*4882a593Smuzhiyun	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
139*4882a593Smuzhiyun	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
140*4882a593Smuzhiyun	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
141*4882a593Smuzhiyun	add		e,e,rT2;	/* 1: E = E + A'		*/ \
142*4882a593Smuzhiyun	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
143*4882a593Smuzhiyun	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
144*4882a593Smuzhiyun	LOAD_K##k##1							   \
145*4882a593Smuzhiyun	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
146*4882a593Smuzhiyun	add		e,e,rT0;	/* 1: E = E + WK		*/ \
147*4882a593Smuzhiyun	add		d,d,rT1;	/* 2: E = E + WK		*/ \
148*4882a593Smuzhiyun	and		rT2,a,b;	/* 2: F' = B and C 		*/ \
149*4882a593Smuzhiyun	andc		rT1,c,a;	/* 2: F" = ~B and D 		*/ \
150*4882a593Smuzhiyun	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
151*4882a593Smuzhiyun	or		rT1,rT1,rT2;	/* 2: F = F' or F"		*/ \
152*4882a593Smuzhiyun	add		d,d,rT0;	/* 2: E = E + A'		*/ \
153*4882a593Smuzhiyun	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
154*4882a593Smuzhiyun	add		d,d,rT1		/* 2: E = E + F			*/
155*4882a593Smuzhiyun
/*
 * R_20_39: two interleaved rounds with F = B xor C xor D.
 *
 * The message words are computed exactly as in R_16_19: w0 becomes
 * rotl1(w0 xor merge(w7,w6) xor w4 xor w1) on both halves, rT0 = w0 + rK,
 * the lower half of rT0 is added in round 1 and its upper half, moved
 * to rT1 by evmergehi, in round 2.
 *
 *   a..e               working variables; round 1 accumulates into e
 *                      and rotates b, round 2 accumulates into d and
 *                      rotates a
 *   w0, w1, w4, w6, w7 message word pairs, w0 is overwritten
 *   k                  selects LOAD_K<k>1, issued after WK was formed
 *
 * Clobbers rT0, rT1 and rT2. Also used unchanged for R_60_79.
 */
156*4882a593Smuzhiyun#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
157*4882a593Smuzhiyun	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
158*4882a593Smuzhiyun	xor		rT2,b,c;	/* 1: F' = B xor C		*/ \
159*4882a593Smuzhiyun	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
160*4882a593Smuzhiyun	xor		rT2,rT2,d;	/* 1: F = F' xor D		*/ \
161*4882a593Smuzhiyun	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
162*4882a593Smuzhiyun	add		e,e,rT2;	/* 1: E = E + F			*/ \
163*4882a593Smuzhiyun	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
164*4882a593Smuzhiyun	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
165*4882a593Smuzhiyun	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
166*4882a593Smuzhiyun	add		e,e,rT2;	/* 1: E = E + A'		*/ \
167*4882a593Smuzhiyun	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
168*4882a593Smuzhiyun	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
169*4882a593Smuzhiyun	LOAD_K##k##1							   \
170*4882a593Smuzhiyun	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
171*4882a593Smuzhiyun	add		e,e,rT0;	/* 1: E = E + WK		*/ \
172*4882a593Smuzhiyun	xor		rT2,a,b;	/* 2: F' = B xor C		*/ \
173*4882a593Smuzhiyun	add		d,d,rT1;	/* 2: E = E + WK		*/ \
174*4882a593Smuzhiyun	xor		rT2,rT2,c;	/* 2: F = F' xor D		*/ \
175*4882a593Smuzhiyun	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
176*4882a593Smuzhiyun	add		d,d,rT2;	/* 2: E = E + F			*/ \
177*4882a593Smuzhiyun	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
178*4882a593Smuzhiyun	add		d,d,rT0		/* 2: E = E + A'		*/
179*4882a593Smuzhiyun
/*
 * R_40_59: two interleaved rounds with
 *   F = (B and C) or ((B or C) and D)
 *
 * The message words are computed as in R_16_19 / R_20_39: w0 becomes
 * rotl1(w0 xor merge(w7,w6) xor w4 xor w1) on both halves, rT0 = w0 + rK,
 * the lower half of rT0 is added in round 1 and its upper half, moved
 * to rT1 by evmergehi, in round 2. Round 2 reuses rT0 as a scratch
 * register for F" once WK has been consumed.
 *
 *   a..e               working variables; round 1 accumulates into e
 *                      and rotates b, round 2 accumulates into d and
 *                      rotates a
 *   w0, w1, w4, w6, w7 message word pairs, w0 is overwritten
 *   k                  selects LOAD_K<k>1, issued after WK was formed
 *
 * Clobbers rT0, rT1 and rT2.
 */
180*4882a593Smuzhiyun#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
181*4882a593Smuzhiyun	and		rT2,b,c;	/* 1: F' = B and C		*/ \
182*4882a593Smuzhiyun	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
183*4882a593Smuzhiyun	or		rT1,b,c;	/* 1: F" = B or C		*/ \
184*4882a593Smuzhiyun	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
185*4882a593Smuzhiyun	and		rT1,d,rT1;	/* 1: F" = F" and D		*/ \
186*4882a593Smuzhiyun	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
187*4882a593Smuzhiyun	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
188*4882a593Smuzhiyun	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
189*4882a593Smuzhiyun	add		e,e,rT2;	/* 1: E = E + F			*/ \
190*4882a593Smuzhiyun	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
191*4882a593Smuzhiyun	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
192*4882a593Smuzhiyun	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
193*4882a593Smuzhiyun	add		e,e,rT2;	/* 1: E = E + A'		*/ \
194*4882a593Smuzhiyun	LOAD_K##k##1							   \
195*4882a593Smuzhiyun	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
196*4882a593Smuzhiyun	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
197*4882a593Smuzhiyun	add		e,e,rT0;	/* 1: E = E + WK		*/ \
198*4882a593Smuzhiyun	and		rT2,a,b;	/* 2: F' = B and C		*/ \
199*4882a593Smuzhiyun	or		rT0,a,b;	/* 2: F" = B or C		*/ \
200*4882a593Smuzhiyun	add		d,d,rT1;	/* 2: E = E + WK		*/ \
201*4882a593Smuzhiyun	and		rT0,c,rT0;	/* 2: F" = F" and D		*/ \
202*4882a593Smuzhiyun	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
203*4882a593Smuzhiyun	or		rT2,rT2,rT0;	/* 2: F = F' or F"		*/ \
204*4882a593Smuzhiyun	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
205*4882a593Smuzhiyun	add		d,d,rT2;	/* 2: E = E + F			*/ \
206*4882a593Smuzhiyun	add		d,d,rT0		/* 2: E = E + A'		*/
207*4882a593Smuzhiyun
/*
 * R_60_79: plain alias of R_20_39 with identical arguments. The
 * instruction sequence is the same; which constant is applied depends
 * only on what rK holds when the macro is invoked.
 */
208*4882a593Smuzhiyun#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
209*4882a593Smuzhiyun	R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
210*4882a593Smuzhiyun
/*
 * ppc_spe_sha1_transform: run the SHA-1 compression function over
 * consecutive 64 byte input blocks.
 *
 *   r3 (rHP)  pointer to five 32 bit hash words, updated in place
 *             after every block
 *   r4 (rWP)  pointer to the input data, advanced by 64 per block
 *   r5        number of blocks; moved to CTR, afterwards r5 is reused
 *             as rKP (address of PPC_SPE_SHA1_K)
 *
 * A block count of zero returns immediately without touching the stack
 * or the hash state. Without that check the bdnz at the bottom of the
 * loop would decrement CTR from 0 to 0xFFFFFFFF and keep iterating.
 *
 * Every round macro performs two rounds, so the a..e roles rotate by two
 * positions from one invocation to the next. The constant in rK is
 * loaded by the first R_00_15 of each block (k = 1); k = 2/3/4 on the
 * last invocation of a group preloads the constant for the next group.
 *
 * Clobbers r0, r5-r12, CTR and cr0. r14-r23 are preserved via
 * INITIALIZE / FINALIZE.
 */
211*4882a593Smuzhiyun_GLOBAL(ppc_spe_sha1_transform)
	cmpwi		r5,0		/* no blocks requested?		*/
	beqlr				/* then there is nothing to do	*/
212*4882a593Smuzhiyun	INITIALIZE
213*4882a593Smuzhiyun
214*4882a593Smuzhiyun	lwz		rH0,0(rHP)
215*4882a593Smuzhiyun	lwz		rH1,4(rHP)
216*4882a593Smuzhiyun	mtctr		r5		/* CTR = number of blocks	*/
217*4882a593Smuzhiyun	lwz		rH2,8(rHP)
218*4882a593Smuzhiyun	lis		rKP,PPC_SPE_SHA1_K@h
219*4882a593Smuzhiyun	lwz		rH3,12(rHP)
220*4882a593Smuzhiyun	ori		rKP,rKP,PPC_SPE_SHA1_K@l
221*4882a593Smuzhiyun	lwz		rH4,16(rHP)
222*4882a593Smuzhiyun
223*4882a593Smuzhiyunppc_spe_sha1_main:
224*4882a593Smuzhiyun	R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
225*4882a593Smuzhiyun	R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
226*4882a593Smuzhiyun	R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
227*4882a593Smuzhiyun	R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
228*4882a593Smuzhiyun	R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
229*4882a593Smuzhiyun	R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
230*4882a593Smuzhiyun	R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
231*4882a593Smuzhiyun	R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)
232*4882a593Smuzhiyun
233*4882a593Smuzhiyun	R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
234*4882a593Smuzhiyun	R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)
235*4882a593Smuzhiyun
236*4882a593Smuzhiyun	R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
237*4882a593Smuzhiyun	R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
238*4882a593Smuzhiyun	R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
239*4882a593Smuzhiyun	R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
240*4882a593Smuzhiyun	R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
241*4882a593Smuzhiyun	R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
242*4882a593Smuzhiyun	R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
243*4882a593Smuzhiyun	R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
244*4882a593Smuzhiyun	R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
245*4882a593Smuzhiyun	R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)
246*4882a593Smuzhiyun
247*4882a593Smuzhiyun	R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
248*4882a593Smuzhiyun	R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
249*4882a593Smuzhiyun	R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
250*4882a593Smuzhiyun	R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
251*4882a593Smuzhiyun	R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
252*4882a593Smuzhiyun	R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
253*4882a593Smuzhiyun	R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
254*4882a593Smuzhiyun	R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
255*4882a593Smuzhiyun	R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
256*4882a593Smuzhiyun	R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)
257*4882a593Smuzhiyun
258*4882a593Smuzhiyun	R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
259*4882a593Smuzhiyun	R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
260*4882a593Smuzhiyun	R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
261*4882a593Smuzhiyun	R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
262*4882a593Smuzhiyun	R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
263*4882a593Smuzhiyun	R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
264*4882a593Smuzhiyun	R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
265*4882a593Smuzhiyun	lwz		rT3,0(rHP)	/* old hash words are fetched	*/
266*4882a593Smuzhiyun	R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
267*4882a593Smuzhiyun	lwz		rW1,4(rHP)	/* into registers the remaining	*/
268*4882a593Smuzhiyun	R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
269*4882a593Smuzhiyun	lwz		rW2,8(rHP)	/* rounds no longer read	*/
270*4882a593Smuzhiyun	R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
271*4882a593Smuzhiyun	lwz		rW3,12(rHP)
272*4882a593Smuzhiyun	NEXT_BLOCK
273*4882a593Smuzhiyun	lwz		rW4,16(rHP)
274*4882a593Smuzhiyun
275*4882a593Smuzhiyun	add		rH0,rH0,rT3	/* new hash = old hash + a..e	*/
276*4882a593Smuzhiyun	stw		rH0,0(rHP)
277*4882a593Smuzhiyun	add		rH1,rH1,rW1
278*4882a593Smuzhiyun	stw		rH1,4(rHP)
279*4882a593Smuzhiyun	add		rH2,rH2,rW2
280*4882a593Smuzhiyun	stw		rH2,8(rHP)
281*4882a593Smuzhiyun	add		rH3,rH3,rW3
282*4882a593Smuzhiyun	stw		rH3,12(rHP)
283*4882a593Smuzhiyun	add		rH4,rH4,rW4
284*4882a593Smuzhiyun	stw		rH4,16(rHP)
285*4882a593Smuzhiyun
286*4882a593Smuzhiyun	bdnz		ppc_spe_sha1_main	/* next block while --CTR != 0 */
287*4882a593Smuzhiyun
288*4882a593Smuzhiyun	FINALIZE
289*4882a593Smuzhiyun	blr
290*4882a593Smuzhiyun
/*
 * SHA-1 round constants, one 32 bit word per group of 20 rounds
 * (0-19, 20-39, 40-59, 60-79), in the order the LOAD_K11..LOAD_K41
 * offsets 0/4/8/12 expect. The table is only ever read, so it is
 * placed in read-only data instead of the writable .data section.
 */
291*4882a593Smuzhiyun.section .rodata
292*4882a593Smuzhiyun.align 4
293*4882a593SmuzhiyunPPC_SPE_SHA1_K:
294*4882a593Smuzhiyun	.long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6
295