xref: /OK3568_Linux_fs/kernel/arch/alpha/include/asm/xor.h (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun /* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun  * arch/alpha/include/asm/xor.h
4*4882a593Smuzhiyun  *
5*4882a593Smuzhiyun  * Optimized RAID-5 checksumming functions for alpha EV5 and EV6
6*4882a593Smuzhiyun  */
7*4882a593Smuzhiyun 
/*
 * Non-prefetching XOR routines.  Their bodies are the hand-written Alpha
 * assembly in the file-scope asm() statement further down in this header.
 * xor_alpha_N takes one unsigned long followed by N unsigned long pointers.
 *
 * As written in that assembly, the value in $16 is shifted right by 6 to
 * form the loop count.  Each loop iteration loads eight quadwords (64 bytes)
 * from every buffer pointer, XORs the corresponding quadwords together and
 * stores the eight results back through the pointer held in $17; every
 * pointer is then advanced by 64.  The count is decremented and tested only
 * after the body has run, so the body executes at least once.
 *
 * NOTE(review): $16 is presumably the first argument (a byte count) and
 * $17 onwards the pointer arguments in declaration order, per the Alpha
 * calling convention, which makes the first pointer both a source and the
 * destination -- confirm against the callers.
 */
8*4882a593Smuzhiyun extern void xor_alpha_2(unsigned long, unsigned long *, unsigned long *);
9*4882a593Smuzhiyun extern void xor_alpha_3(unsigned long, unsigned long *, unsigned long *,
10*4882a593Smuzhiyun 		        unsigned long *);
11*4882a593Smuzhiyun extern void xor_alpha_4(unsigned long, unsigned long *, unsigned long *,
12*4882a593Smuzhiyun 		        unsigned long *, unsigned long *);
13*4882a593Smuzhiyun extern void xor_alpha_5(unsigned long, unsigned long *, unsigned long *,
14*4882a593Smuzhiyun 		        unsigned long *, unsigned long *, unsigned long *);
15*4882a593Smuzhiyun 
/*
 * Prefetching counterparts of the xor_alpha_N routines, with the same
 * argument lists: one unsigned long followed by N unsigned long pointers.
 * Their bodies are part of the file-scope asm() statement further down in
 * this header.
 *
 * Before entering the loop, each routine issues loads into $31 from offsets
 * 0, 64, 128 and 192 of every buffer pointer.  Inside the loop, the 2-, 3-
 * and 4-buffer variants additionally issue a load into $31 from offset 256
 * of every buffer pointer on each 64-byte iteration.
 *
 * NOTE(review): the loads into $31 presumably act as cache prefetches whose
 * results are discarded, so the routines touch addresses up to 256 bytes
 * ahead of the block being processed -- confirm against the Alpha
 * architecture manual.
 */
16*4882a593Smuzhiyun extern void xor_alpha_prefetch_2(unsigned long, unsigned long *,
17*4882a593Smuzhiyun 				 unsigned long *);
18*4882a593Smuzhiyun extern void xor_alpha_prefetch_3(unsigned long, unsigned long *,
19*4882a593Smuzhiyun 				 unsigned long *, unsigned long *);
20*4882a593Smuzhiyun extern void xor_alpha_prefetch_4(unsigned long, unsigned long *,
21*4882a593Smuzhiyun 				 unsigned long *, unsigned long *,
22*4882a593Smuzhiyun 				 unsigned long *);
23*4882a593Smuzhiyun extern void xor_alpha_prefetch_5(unsigned long, unsigned long *,
24*4882a593Smuzhiyun 				 unsigned long *, unsigned long *,
25*4882a593Smuzhiyun 				 unsigned long *, unsigned long *);
26*4882a593Smuzhiyun 
27*4882a593Smuzhiyun asm("								\n\
28*4882a593Smuzhiyun 	.text							\n\
29*4882a593Smuzhiyun 	.align 3						\n\
30*4882a593Smuzhiyun 	.ent xor_alpha_2					\n\
31*4882a593Smuzhiyun xor_alpha_2:							\n\
32*4882a593Smuzhiyun 	.prologue 0						\n\
33*4882a593Smuzhiyun 	srl $16, 6, $16						\n\
34*4882a593Smuzhiyun 	.align 4						\n\
35*4882a593Smuzhiyun 2:								\n\
36*4882a593Smuzhiyun 	ldq $0,0($17)						\n\
37*4882a593Smuzhiyun 	ldq $1,0($18)						\n\
38*4882a593Smuzhiyun 	ldq $2,8($17)						\n\
39*4882a593Smuzhiyun 	ldq $3,8($18)						\n\
40*4882a593Smuzhiyun 								\n\
41*4882a593Smuzhiyun 	ldq $4,16($17)						\n\
42*4882a593Smuzhiyun 	ldq $5,16($18)						\n\
43*4882a593Smuzhiyun 	ldq $6,24($17)						\n\
44*4882a593Smuzhiyun 	ldq $7,24($18)						\n\
45*4882a593Smuzhiyun 								\n\
46*4882a593Smuzhiyun 	ldq $19,32($17)						\n\
47*4882a593Smuzhiyun 	ldq $20,32($18)						\n\
48*4882a593Smuzhiyun 	ldq $21,40($17)						\n\
49*4882a593Smuzhiyun 	ldq $22,40($18)						\n\
50*4882a593Smuzhiyun 								\n\
51*4882a593Smuzhiyun 	ldq $23,48($17)						\n\
52*4882a593Smuzhiyun 	ldq $24,48($18)						\n\
53*4882a593Smuzhiyun 	ldq $25,56($17)						\n\
54*4882a593Smuzhiyun 	xor $0,$1,$0		# 7 cycles from $1 load		\n\
55*4882a593Smuzhiyun 								\n\
56*4882a593Smuzhiyun 	ldq $27,56($18)						\n\
57*4882a593Smuzhiyun 	xor $2,$3,$2						\n\
58*4882a593Smuzhiyun 	stq $0,0($17)						\n\
59*4882a593Smuzhiyun 	xor $4,$5,$4						\n\
60*4882a593Smuzhiyun 								\n\
61*4882a593Smuzhiyun 	stq $2,8($17)						\n\
62*4882a593Smuzhiyun 	xor $6,$7,$6						\n\
63*4882a593Smuzhiyun 	stq $4,16($17)						\n\
64*4882a593Smuzhiyun 	xor $19,$20,$19						\n\
65*4882a593Smuzhiyun 								\n\
66*4882a593Smuzhiyun 	stq $6,24($17)						\n\
67*4882a593Smuzhiyun 	xor $21,$22,$21						\n\
68*4882a593Smuzhiyun 	stq $19,32($17)						\n\
69*4882a593Smuzhiyun 	xor $23,$24,$23						\n\
70*4882a593Smuzhiyun 								\n\
71*4882a593Smuzhiyun 	stq $21,40($17)						\n\
72*4882a593Smuzhiyun 	xor $25,$27,$25						\n\
73*4882a593Smuzhiyun 	stq $23,48($17)						\n\
74*4882a593Smuzhiyun 	subq $16,1,$16						\n\
75*4882a593Smuzhiyun 								\n\
76*4882a593Smuzhiyun 	stq $25,56($17)						\n\
77*4882a593Smuzhiyun 	addq $17,64,$17						\n\
78*4882a593Smuzhiyun 	addq $18,64,$18						\n\
79*4882a593Smuzhiyun 	bgt $16,2b						\n\
80*4882a593Smuzhiyun 								\n\
81*4882a593Smuzhiyun 	ret							\n\
82*4882a593Smuzhiyun 	.end xor_alpha_2					\n\
83*4882a593Smuzhiyun 								\n\
84*4882a593Smuzhiyun 	.align 3						\n\
85*4882a593Smuzhiyun 	.ent xor_alpha_3					\n\
86*4882a593Smuzhiyun xor_alpha_3:							\n\
87*4882a593Smuzhiyun 	.prologue 0						\n\
88*4882a593Smuzhiyun 	srl $16, 6, $16						\n\
89*4882a593Smuzhiyun 	.align 4						\n\
90*4882a593Smuzhiyun 3:								\n\
91*4882a593Smuzhiyun 	ldq $0,0($17)						\n\
92*4882a593Smuzhiyun 	ldq $1,0($18)						\n\
93*4882a593Smuzhiyun 	ldq $2,0($19)						\n\
94*4882a593Smuzhiyun 	ldq $3,8($17)						\n\
95*4882a593Smuzhiyun 								\n\
96*4882a593Smuzhiyun 	ldq $4,8($18)						\n\
97*4882a593Smuzhiyun 	ldq $6,16($17)						\n\
98*4882a593Smuzhiyun 	ldq $7,16($18)						\n\
99*4882a593Smuzhiyun 	ldq $21,24($17)						\n\
100*4882a593Smuzhiyun 								\n\
101*4882a593Smuzhiyun 	ldq $22,24($18)						\n\
102*4882a593Smuzhiyun 	ldq $24,32($17)						\n\
103*4882a593Smuzhiyun 	ldq $25,32($18)						\n\
104*4882a593Smuzhiyun 	ldq $5,8($19)						\n\
105*4882a593Smuzhiyun 								\n\
106*4882a593Smuzhiyun 	ldq $20,16($19)						\n\
107*4882a593Smuzhiyun 	ldq $23,24($19)						\n\
108*4882a593Smuzhiyun 	ldq $27,32($19)						\n\
109*4882a593Smuzhiyun 	nop							\n\
110*4882a593Smuzhiyun 								\n\
111*4882a593Smuzhiyun 	xor $0,$1,$1		# 8 cycles from $0 load		\n\
112*4882a593Smuzhiyun 	xor $3,$4,$4		# 6 cycles from $4 load		\n\
113*4882a593Smuzhiyun 	xor $6,$7,$7		# 6 cycles from $7 load		\n\
114*4882a593Smuzhiyun 	xor $21,$22,$22		# 5 cycles from $22 load	\n\
115*4882a593Smuzhiyun 								\n\
116*4882a593Smuzhiyun 	xor $1,$2,$2		# 9 cycles from $2 load		\n\
117*4882a593Smuzhiyun 	xor $24,$25,$25		# 5 cycles from $25 load	\n\
118*4882a593Smuzhiyun 	stq $2,0($17)						\n\
119*4882a593Smuzhiyun 	xor $4,$5,$5		# 6 cycles from $5 load		\n\
120*4882a593Smuzhiyun 								\n\
121*4882a593Smuzhiyun 	stq $5,8($17)						\n\
122*4882a593Smuzhiyun 	xor $7,$20,$20		# 7 cycles from $20 load	\n\
123*4882a593Smuzhiyun 	stq $20,16($17)						\n\
124*4882a593Smuzhiyun 	xor $22,$23,$23		# 7 cycles from $23 load	\n\
125*4882a593Smuzhiyun 								\n\
126*4882a593Smuzhiyun 	stq $23,24($17)						\n\
127*4882a593Smuzhiyun 	xor $25,$27,$27		# 7 cycles from $27 load	\n\
128*4882a593Smuzhiyun 	stq $27,32($17)						\n\
129*4882a593Smuzhiyun 	nop							\n\
130*4882a593Smuzhiyun 								\n\
131*4882a593Smuzhiyun 	ldq $0,40($17)						\n\
132*4882a593Smuzhiyun 	ldq $1,40($18)						\n\
133*4882a593Smuzhiyun 	ldq $3,48($17)						\n\
134*4882a593Smuzhiyun 	ldq $4,48($18)						\n\
135*4882a593Smuzhiyun 								\n\
136*4882a593Smuzhiyun 	ldq $6,56($17)						\n\
137*4882a593Smuzhiyun 	ldq $7,56($18)						\n\
138*4882a593Smuzhiyun 	ldq $2,40($19)						\n\
139*4882a593Smuzhiyun 	ldq $5,48($19)						\n\
140*4882a593Smuzhiyun 								\n\
141*4882a593Smuzhiyun 	ldq $20,56($19)						\n\
142*4882a593Smuzhiyun 	xor $0,$1,$1		# 4 cycles from $1 load		\n\
143*4882a593Smuzhiyun 	xor $3,$4,$4		# 5 cycles from $4 load		\n\
144*4882a593Smuzhiyun 	xor $6,$7,$7		# 5 cycles from $7 load		\n\
145*4882a593Smuzhiyun 								\n\
146*4882a593Smuzhiyun 	xor $1,$2,$2		# 4 cycles from $2 load		\n\
147*4882a593Smuzhiyun 	xor $4,$5,$5		# 5 cycles from $5 load		\n\
148*4882a593Smuzhiyun 	stq $2,40($17)						\n\
149*4882a593Smuzhiyun 	xor $7,$20,$20		# 4 cycles from $20 load	\n\
150*4882a593Smuzhiyun 								\n\
151*4882a593Smuzhiyun 	stq $5,48($17)						\n\
152*4882a593Smuzhiyun 	subq $16,1,$16						\n\
153*4882a593Smuzhiyun 	stq $20,56($17)						\n\
154*4882a593Smuzhiyun 	addq $19,64,$19						\n\
155*4882a593Smuzhiyun 								\n\
156*4882a593Smuzhiyun 	addq $18,64,$18						\n\
157*4882a593Smuzhiyun 	addq $17,64,$17						\n\
158*4882a593Smuzhiyun 	bgt $16,3b						\n\
159*4882a593Smuzhiyun 	ret							\n\
160*4882a593Smuzhiyun 	.end xor_alpha_3					\n\
161*4882a593Smuzhiyun 								\n\
162*4882a593Smuzhiyun 	.align 3						\n\
163*4882a593Smuzhiyun 	.ent xor_alpha_4					\n\
164*4882a593Smuzhiyun xor_alpha_4:							\n\
165*4882a593Smuzhiyun 	.prologue 0						\n\
166*4882a593Smuzhiyun 	srl $16, 6, $16						\n\
167*4882a593Smuzhiyun 	.align 4						\n\
168*4882a593Smuzhiyun 4:								\n\
169*4882a593Smuzhiyun 	ldq $0,0($17)						\n\
170*4882a593Smuzhiyun 	ldq $1,0($18)						\n\
171*4882a593Smuzhiyun 	ldq $2,0($19)						\n\
172*4882a593Smuzhiyun 	ldq $3,0($20)						\n\
173*4882a593Smuzhiyun 								\n\
174*4882a593Smuzhiyun 	ldq $4,8($17)						\n\
175*4882a593Smuzhiyun 	ldq $5,8($18)						\n\
176*4882a593Smuzhiyun 	ldq $6,8($19)						\n\
177*4882a593Smuzhiyun 	ldq $7,8($20)						\n\
178*4882a593Smuzhiyun 								\n\
179*4882a593Smuzhiyun 	ldq $21,16($17)						\n\
180*4882a593Smuzhiyun 	ldq $22,16($18)						\n\
181*4882a593Smuzhiyun 	ldq $23,16($19)						\n\
182*4882a593Smuzhiyun 	ldq $24,16($20)						\n\
183*4882a593Smuzhiyun 								\n\
184*4882a593Smuzhiyun 	ldq $25,24($17)						\n\
185*4882a593Smuzhiyun 	xor $0,$1,$1		# 6 cycles from $1 load		\n\
186*4882a593Smuzhiyun 	ldq $27,24($18)						\n\
187*4882a593Smuzhiyun 	xor $2,$3,$3		# 6 cycles from $3 load		\n\
188*4882a593Smuzhiyun 								\n\
189*4882a593Smuzhiyun 	ldq $0,24($19)						\n\
190*4882a593Smuzhiyun 	xor $1,$3,$3						\n\
191*4882a593Smuzhiyun 	ldq $1,24($20)						\n\
192*4882a593Smuzhiyun 	xor $4,$5,$5		# 7 cycles from $5 load		\n\
193*4882a593Smuzhiyun 								\n\
194*4882a593Smuzhiyun 	stq $3,0($17)						\n\
195*4882a593Smuzhiyun 	xor $6,$7,$7						\n\
196*4882a593Smuzhiyun 	xor $21,$22,$22		# 7 cycles from $22 load	\n\
197*4882a593Smuzhiyun 	xor $5,$7,$7						\n\
198*4882a593Smuzhiyun 								\n\
199*4882a593Smuzhiyun 	stq $7,8($17)						\n\
200*4882a593Smuzhiyun 	xor $23,$24,$24		# 7 cycles from $24 load	\n\
201*4882a593Smuzhiyun 	ldq $2,32($17)						\n\
202*4882a593Smuzhiyun 	xor $22,$24,$24						\n\
203*4882a593Smuzhiyun 								\n\
204*4882a593Smuzhiyun 	ldq $3,32($18)						\n\
205*4882a593Smuzhiyun 	ldq $4,32($19)						\n\
206*4882a593Smuzhiyun 	ldq $5,32($20)						\n\
207*4882a593Smuzhiyun 	xor $25,$27,$27		# 8 cycles from $27 load	\n\
208*4882a593Smuzhiyun 								\n\
209*4882a593Smuzhiyun 	ldq $6,40($17)						\n\
210*4882a593Smuzhiyun 	ldq $7,40($18)						\n\
211*4882a593Smuzhiyun 	ldq $21,40($19)						\n\
212*4882a593Smuzhiyun 	ldq $22,40($20)						\n\
213*4882a593Smuzhiyun 								\n\
214*4882a593Smuzhiyun 	stq $24,16($17)						\n\
215*4882a593Smuzhiyun 	xor $0,$1,$1		# 9 cycles from $1 load		\n\
216*4882a593Smuzhiyun 	xor $2,$3,$3		# 5 cycles from $3 load		\n\
217*4882a593Smuzhiyun 	xor $27,$1,$1						\n\
218*4882a593Smuzhiyun 								\n\
219*4882a593Smuzhiyun 	stq $1,24($17)						\n\
220*4882a593Smuzhiyun 	xor $4,$5,$5		# 5 cycles from $5 load		\n\
221*4882a593Smuzhiyun 	ldq $23,48($17)						\n\
222*4882a593Smuzhiyun 	ldq $24,48($18)						\n\
223*4882a593Smuzhiyun 								\n\
224*4882a593Smuzhiyun 	ldq $25,48($19)						\n\
225*4882a593Smuzhiyun 	xor $3,$5,$5						\n\
226*4882a593Smuzhiyun 	ldq $27,48($20)						\n\
227*4882a593Smuzhiyun 	ldq $0,56($17)						\n\
228*4882a593Smuzhiyun 								\n\
229*4882a593Smuzhiyun 	ldq $1,56($18)						\n\
230*4882a593Smuzhiyun 	ldq $2,56($19)						\n\
231*4882a593Smuzhiyun 	xor $6,$7,$7		# 8 cycles from $6 load		\n\
232*4882a593Smuzhiyun 	ldq $3,56($20)						\n\
233*4882a593Smuzhiyun 								\n\
234*4882a593Smuzhiyun 	stq $5,32($17)						\n\
235*4882a593Smuzhiyun 	xor $21,$22,$22		# 8 cycles from $22 load	\n\
236*4882a593Smuzhiyun 	xor $7,$22,$22						\n\
237*4882a593Smuzhiyun 	xor $23,$24,$24		# 5 cycles from $24 load	\n\
238*4882a593Smuzhiyun 								\n\
239*4882a593Smuzhiyun 	stq $22,40($17)						\n\
240*4882a593Smuzhiyun 	xor $25,$27,$27		# 5 cycles from $27 load	\n\
241*4882a593Smuzhiyun 	xor $24,$27,$27						\n\
242*4882a593Smuzhiyun 	xor $0,$1,$1		# 5 cycles from $1 load		\n\
243*4882a593Smuzhiyun 								\n\
244*4882a593Smuzhiyun 	stq $27,48($17)						\n\
245*4882a593Smuzhiyun 	xor $2,$3,$3		# 4 cycles from $3 load		\n\
246*4882a593Smuzhiyun 	xor $1,$3,$3						\n\
247*4882a593Smuzhiyun 	subq $16,1,$16						\n\
248*4882a593Smuzhiyun 								\n\
249*4882a593Smuzhiyun 	stq $3,56($17)						\n\
250*4882a593Smuzhiyun 	addq $20,64,$20						\n\
251*4882a593Smuzhiyun 	addq $19,64,$19						\n\
252*4882a593Smuzhiyun 	addq $18,64,$18						\n\
253*4882a593Smuzhiyun 								\n\
254*4882a593Smuzhiyun 	addq $17,64,$17						\n\
255*4882a593Smuzhiyun 	bgt $16,4b						\n\
256*4882a593Smuzhiyun 	ret							\n\
257*4882a593Smuzhiyun 	.end xor_alpha_4					\n\
258*4882a593Smuzhiyun 								\n\
259*4882a593Smuzhiyun 	.align 3						\n\
260*4882a593Smuzhiyun 	.ent xor_alpha_5					\n\
261*4882a593Smuzhiyun xor_alpha_5:							\n\
262*4882a593Smuzhiyun 	.prologue 0						\n\
263*4882a593Smuzhiyun 	srl $16, 6, $16						\n\
264*4882a593Smuzhiyun 	.align 4						\n\
265*4882a593Smuzhiyun 5:								\n\
266*4882a593Smuzhiyun 	ldq $0,0($17)						\n\
267*4882a593Smuzhiyun 	ldq $1,0($18)						\n\
268*4882a593Smuzhiyun 	ldq $2,0($19)						\n\
269*4882a593Smuzhiyun 	ldq $3,0($20)						\n\
270*4882a593Smuzhiyun 								\n\
271*4882a593Smuzhiyun 	ldq $4,0($21)						\n\
272*4882a593Smuzhiyun 	ldq $5,8($17)						\n\
273*4882a593Smuzhiyun 	ldq $6,8($18)						\n\
274*4882a593Smuzhiyun 	ldq $7,8($19)						\n\
275*4882a593Smuzhiyun 								\n\
276*4882a593Smuzhiyun 	ldq $22,8($20)						\n\
277*4882a593Smuzhiyun 	ldq $23,8($21)						\n\
278*4882a593Smuzhiyun 	ldq $24,16($17)						\n\
279*4882a593Smuzhiyun 	ldq $25,16($18)						\n\
280*4882a593Smuzhiyun 								\n\
281*4882a593Smuzhiyun 	ldq $27,16($19)						\n\
282*4882a593Smuzhiyun 	xor $0,$1,$1		# 6 cycles from $1 load		\n\
283*4882a593Smuzhiyun 	ldq $28,16($20)						\n\
284*4882a593Smuzhiyun 	xor $2,$3,$3		# 6 cycles from $3 load		\n\
285*4882a593Smuzhiyun 								\n\
286*4882a593Smuzhiyun 	ldq $0,16($21)						\n\
287*4882a593Smuzhiyun 	xor $1,$3,$3						\n\
288*4882a593Smuzhiyun 	ldq $1,24($17)						\n\
289*4882a593Smuzhiyun 	xor $3,$4,$4		# 7 cycles from $4 load		\n\
290*4882a593Smuzhiyun 								\n\
291*4882a593Smuzhiyun 	stq $4,0($17)						\n\
292*4882a593Smuzhiyun 	xor $5,$6,$6		# 7 cycles from $6 load		\n\
293*4882a593Smuzhiyun 	xor $7,$22,$22		# 7 cycles from $22 load	\n\
294*4882a593Smuzhiyun 	xor $6,$23,$23		# 7 cycles from $23 load	\n\
295*4882a593Smuzhiyun 								\n\
296*4882a593Smuzhiyun 	ldq $2,24($18)						\n\
297*4882a593Smuzhiyun 	xor $22,$23,$23						\n\
298*4882a593Smuzhiyun 	ldq $3,24($19)						\n\
299*4882a593Smuzhiyun 	xor $24,$25,$25		# 8 cycles from $25 load	\n\
300*4882a593Smuzhiyun 								\n\
301*4882a593Smuzhiyun 	stq $23,8($17)						\n\
302*4882a593Smuzhiyun 	xor $25,$27,$27		# 8 cycles from $27 load	\n\
303*4882a593Smuzhiyun 	ldq $4,24($20)						\n\
304*4882a593Smuzhiyun 	xor $28,$0,$0		# 7 cycles from $0 load		\n\
305*4882a593Smuzhiyun 								\n\
306*4882a593Smuzhiyun 	ldq $5,24($21)						\n\
307*4882a593Smuzhiyun 	xor $27,$0,$0						\n\
308*4882a593Smuzhiyun 	ldq $6,32($17)						\n\
309*4882a593Smuzhiyun 	ldq $7,32($18)						\n\
310*4882a593Smuzhiyun 								\n\
311*4882a593Smuzhiyun 	stq $0,16($17)						\n\
312*4882a593Smuzhiyun 	xor $1,$2,$2		# 6 cycles from $2 load		\n\
313*4882a593Smuzhiyun 	ldq $22,32($19)						\n\
314*4882a593Smuzhiyun 	xor $3,$4,$4		# 4 cycles from $4 load		\n\
315*4882a593Smuzhiyun 								\n\
316*4882a593Smuzhiyun 	ldq $23,32($20)						\n\
317*4882a593Smuzhiyun 	xor $2,$4,$4						\n\
318*4882a593Smuzhiyun 	ldq $24,32($21)						\n\
319*4882a593Smuzhiyun 	ldq $25,40($17)						\n\
320*4882a593Smuzhiyun 								\n\
321*4882a593Smuzhiyun 	ldq $27,40($18)						\n\
322*4882a593Smuzhiyun 	ldq $28,40($19)						\n\
323*4882a593Smuzhiyun 	ldq $0,40($20)						\n\
324*4882a593Smuzhiyun 	xor $4,$5,$5		# 7 cycles from $5 load		\n\
325*4882a593Smuzhiyun 								\n\
326*4882a593Smuzhiyun 	stq $5,24($17)						\n\
327*4882a593Smuzhiyun 	xor $6,$7,$7		# 7 cycles from $7 load		\n\
328*4882a593Smuzhiyun 	ldq $1,40($21)						\n\
329*4882a593Smuzhiyun 	ldq $2,48($17)						\n\
330*4882a593Smuzhiyun 								\n\
331*4882a593Smuzhiyun 	ldq $3,48($18)						\n\
332*4882a593Smuzhiyun 	xor $7,$22,$22		# 7 cycles from $22 load	\n\
333*4882a593Smuzhiyun 	ldq $4,48($19)						\n\
334*4882a593Smuzhiyun 	xor $23,$24,$24		# 6 cycles from $24 load	\n\
335*4882a593Smuzhiyun 								\n\
336*4882a593Smuzhiyun 	ldq $5,48($20)						\n\
337*4882a593Smuzhiyun 	xor $22,$24,$24						\n\
338*4882a593Smuzhiyun 	ldq $6,48($21)						\n\
339*4882a593Smuzhiyun 	xor $25,$27,$27		# 7 cycles from $27 load	\n\
340*4882a593Smuzhiyun 								\n\
341*4882a593Smuzhiyun 	stq $24,32($17)						\n\
342*4882a593Smuzhiyun 	xor $27,$28,$28		# 8 cycles from $28 load	\n\
343*4882a593Smuzhiyun 	ldq $7,56($17)						\n\
344*4882a593Smuzhiyun 	xor $0,$1,$1		# 6 cycles from $1 load		\n\
345*4882a593Smuzhiyun 								\n\
346*4882a593Smuzhiyun 	ldq $22,56($18)						\n\
347*4882a593Smuzhiyun 	ldq $23,56($19)						\n\
348*4882a593Smuzhiyun 	ldq $24,56($20)						\n\
349*4882a593Smuzhiyun 	ldq $25,56($21)						\n\
350*4882a593Smuzhiyun 								\n\
351*4882a593Smuzhiyun 	xor $28,$1,$1						\n\
352*4882a593Smuzhiyun 	xor $2,$3,$3		# 9 cycles from $3 load		\n\
353*4882a593Smuzhiyun 	xor $3,$4,$4		# 9 cycles from $4 load		\n\
354*4882a593Smuzhiyun 	xor $5,$6,$6		# 8 cycles from $6 load		\n\
355*4882a593Smuzhiyun 								\n\
356*4882a593Smuzhiyun 	stq $1,40($17)						\n\
357*4882a593Smuzhiyun 	xor $4,$6,$6						\n\
358*4882a593Smuzhiyun 	xor $7,$22,$22		# 7 cycles from $22 load	\n\
359*4882a593Smuzhiyun 	xor $23,$24,$24		# 6 cycles from $24 load	\n\
360*4882a593Smuzhiyun 								\n\
361*4882a593Smuzhiyun 	stq $6,48($17)						\n\
362*4882a593Smuzhiyun 	xor $22,$24,$24						\n\
363*4882a593Smuzhiyun 	subq $16,1,$16						\n\
364*4882a593Smuzhiyun 	xor $24,$25,$25		# 8 cycles from $25 load	\n\
365*4882a593Smuzhiyun 								\n\
366*4882a593Smuzhiyun 	stq $25,56($17)						\n\
367*4882a593Smuzhiyun 	addq $21,64,$21						\n\
368*4882a593Smuzhiyun 	addq $20,64,$20						\n\
369*4882a593Smuzhiyun 	addq $19,64,$19						\n\
370*4882a593Smuzhiyun 								\n\
371*4882a593Smuzhiyun 	addq $18,64,$18						\n\
372*4882a593Smuzhiyun 	addq $17,64,$17						\n\
373*4882a593Smuzhiyun 	bgt $16,5b						\n\
374*4882a593Smuzhiyun 	ret							\n\
375*4882a593Smuzhiyun 	.end xor_alpha_5					\n\
376*4882a593Smuzhiyun 								\n\
377*4882a593Smuzhiyun 	.align 3						\n\
378*4882a593Smuzhiyun 	.ent xor_alpha_prefetch_2				\n\
379*4882a593Smuzhiyun xor_alpha_prefetch_2:						\n\
380*4882a593Smuzhiyun 	.prologue 0						\n\
381*4882a593Smuzhiyun 	srl $16, 6, $16						\n\
382*4882a593Smuzhiyun 								\n\
383*4882a593Smuzhiyun 	ldq $31, 0($17)						\n\
384*4882a593Smuzhiyun 	ldq $31, 0($18)						\n\
385*4882a593Smuzhiyun 								\n\
386*4882a593Smuzhiyun 	ldq $31, 64($17)					\n\
387*4882a593Smuzhiyun 	ldq $31, 64($18)					\n\
388*4882a593Smuzhiyun 								\n\
389*4882a593Smuzhiyun 	ldq $31, 128($17)					\n\
390*4882a593Smuzhiyun 	ldq $31, 128($18)					\n\
391*4882a593Smuzhiyun 								\n\
392*4882a593Smuzhiyun 	ldq $31, 192($17)					\n\
393*4882a593Smuzhiyun 	ldq $31, 192($18)					\n\
394*4882a593Smuzhiyun 	.align 4						\n\
395*4882a593Smuzhiyun 2:								\n\
396*4882a593Smuzhiyun 	ldq $0,0($17)						\n\
397*4882a593Smuzhiyun 	ldq $1,0($18)						\n\
398*4882a593Smuzhiyun 	ldq $2,8($17)						\n\
399*4882a593Smuzhiyun 	ldq $3,8($18)						\n\
400*4882a593Smuzhiyun 								\n\
401*4882a593Smuzhiyun 	ldq $4,16($17)						\n\
402*4882a593Smuzhiyun 	ldq $5,16($18)						\n\
403*4882a593Smuzhiyun 	ldq $6,24($17)						\n\
404*4882a593Smuzhiyun 	ldq $7,24($18)						\n\
405*4882a593Smuzhiyun 								\n\
406*4882a593Smuzhiyun 	ldq $19,32($17)						\n\
407*4882a593Smuzhiyun 	ldq $20,32($18)						\n\
408*4882a593Smuzhiyun 	ldq $21,40($17)						\n\
409*4882a593Smuzhiyun 	ldq $22,40($18)						\n\
410*4882a593Smuzhiyun 								\n\
411*4882a593Smuzhiyun 	ldq $23,48($17)						\n\
412*4882a593Smuzhiyun 	ldq $24,48($18)						\n\
413*4882a593Smuzhiyun 	ldq $25,56($17)						\n\
414*4882a593Smuzhiyun 	ldq $27,56($18)						\n\
415*4882a593Smuzhiyun 								\n\
416*4882a593Smuzhiyun 	ldq $31,256($17)					\n\
417*4882a593Smuzhiyun 	xor $0,$1,$0		# 8 cycles from $1 load		\n\
418*4882a593Smuzhiyun 	ldq $31,256($18)					\n\
419*4882a593Smuzhiyun 	xor $2,$3,$2						\n\
420*4882a593Smuzhiyun 								\n\
421*4882a593Smuzhiyun 	stq $0,0($17)						\n\
422*4882a593Smuzhiyun 	xor $4,$5,$4						\n\
423*4882a593Smuzhiyun 	stq $2,8($17)						\n\
424*4882a593Smuzhiyun 	xor $6,$7,$6						\n\
425*4882a593Smuzhiyun 								\n\
426*4882a593Smuzhiyun 	stq $4,16($17)						\n\
427*4882a593Smuzhiyun 	xor $19,$20,$19						\n\
428*4882a593Smuzhiyun 	stq $6,24($17)						\n\
429*4882a593Smuzhiyun 	xor $21,$22,$21						\n\
430*4882a593Smuzhiyun 								\n\
431*4882a593Smuzhiyun 	stq $19,32($17)						\n\
432*4882a593Smuzhiyun 	xor $23,$24,$23						\n\
433*4882a593Smuzhiyun 	stq $21,40($17)						\n\
434*4882a593Smuzhiyun 	xor $25,$27,$25						\n\
435*4882a593Smuzhiyun 								\n\
436*4882a593Smuzhiyun 	stq $23,48($17)						\n\
437*4882a593Smuzhiyun 	subq $16,1,$16						\n\
438*4882a593Smuzhiyun 	stq $25,56($17)						\n\
439*4882a593Smuzhiyun 	addq $17,64,$17						\n\
440*4882a593Smuzhiyun 								\n\
441*4882a593Smuzhiyun 	addq $18,64,$18						\n\
442*4882a593Smuzhiyun 	bgt $16,2b						\n\
443*4882a593Smuzhiyun 	ret							\n\
444*4882a593Smuzhiyun 	.end xor_alpha_prefetch_2				\n\
445*4882a593Smuzhiyun 								\n\
446*4882a593Smuzhiyun 	.align 3						\n\
447*4882a593Smuzhiyun 	.ent xor_alpha_prefetch_3				\n\
448*4882a593Smuzhiyun xor_alpha_prefetch_3:						\n\
449*4882a593Smuzhiyun 	.prologue 0						\n\
450*4882a593Smuzhiyun 	srl $16, 6, $16						\n\
451*4882a593Smuzhiyun 								\n\
452*4882a593Smuzhiyun 	ldq $31, 0($17)						\n\
453*4882a593Smuzhiyun 	ldq $31, 0($18)						\n\
454*4882a593Smuzhiyun 	ldq $31, 0($19)						\n\
455*4882a593Smuzhiyun 								\n\
456*4882a593Smuzhiyun 	ldq $31, 64($17)					\n\
457*4882a593Smuzhiyun 	ldq $31, 64($18)					\n\
458*4882a593Smuzhiyun 	ldq $31, 64($19)					\n\
459*4882a593Smuzhiyun 								\n\
460*4882a593Smuzhiyun 	ldq $31, 128($17)					\n\
461*4882a593Smuzhiyun 	ldq $31, 128($18)					\n\
462*4882a593Smuzhiyun 	ldq $31, 128($19)					\n\
463*4882a593Smuzhiyun 								\n\
464*4882a593Smuzhiyun 	ldq $31, 192($17)					\n\
465*4882a593Smuzhiyun 	ldq $31, 192($18)					\n\
466*4882a593Smuzhiyun 	ldq $31, 192($19)					\n\
467*4882a593Smuzhiyun 	.align 4						\n\
468*4882a593Smuzhiyun 3:								\n\
469*4882a593Smuzhiyun 	ldq $0,0($17)						\n\
470*4882a593Smuzhiyun 	ldq $1,0($18)						\n\
471*4882a593Smuzhiyun 	ldq $2,0($19)						\n\
472*4882a593Smuzhiyun 	ldq $3,8($17)						\n\
473*4882a593Smuzhiyun 								\n\
474*4882a593Smuzhiyun 	ldq $4,8($18)						\n\
475*4882a593Smuzhiyun 	ldq $6,16($17)						\n\
476*4882a593Smuzhiyun 	ldq $7,16($18)						\n\
477*4882a593Smuzhiyun 	ldq $21,24($17)						\n\
478*4882a593Smuzhiyun 								\n\
479*4882a593Smuzhiyun 	ldq $22,24($18)						\n\
480*4882a593Smuzhiyun 	ldq $24,32($17)						\n\
481*4882a593Smuzhiyun 	ldq $25,32($18)						\n\
482*4882a593Smuzhiyun 	ldq $5,8($19)						\n\
483*4882a593Smuzhiyun 								\n\
484*4882a593Smuzhiyun 	ldq $20,16($19)						\n\
485*4882a593Smuzhiyun 	ldq $23,24($19)						\n\
486*4882a593Smuzhiyun 	ldq $27,32($19)						\n\
487*4882a593Smuzhiyun 	nop							\n\
488*4882a593Smuzhiyun 								\n\
489*4882a593Smuzhiyun 	xor $0,$1,$1		# 8 cycles from $0 load		\n\
490*4882a593Smuzhiyun 	xor $3,$4,$4		# 7 cycles from $4 load		\n\
491*4882a593Smuzhiyun 	xor $6,$7,$7		# 6 cycles from $7 load		\n\
492*4882a593Smuzhiyun 	xor $21,$22,$22		# 5 cycles from $22 load	\n\
493*4882a593Smuzhiyun 								\n\
494*4882a593Smuzhiyun 	xor $1,$2,$2		# 9 cycles from $2 load		\n\
495*4882a593Smuzhiyun 	xor $24,$25,$25		# 5 cycles from $25 load	\n\
496*4882a593Smuzhiyun 	stq $2,0($17)						\n\
497*4882a593Smuzhiyun 	xor $4,$5,$5		# 6 cycles from $5 load		\n\
498*4882a593Smuzhiyun 								\n\
499*4882a593Smuzhiyun 	stq $5,8($17)						\n\
500*4882a593Smuzhiyun 	xor $7,$20,$20		# 7 cycles from $20 load	\n\
501*4882a593Smuzhiyun 	stq $20,16($17)						\n\
502*4882a593Smuzhiyun 	xor $22,$23,$23		# 7 cycles from $23 load	\n\
503*4882a593Smuzhiyun 								\n\
504*4882a593Smuzhiyun 	stq $23,24($17)						\n\
505*4882a593Smuzhiyun 	xor $25,$27,$27		# 7 cycles from $27 load	\n\
506*4882a593Smuzhiyun 	stq $27,32($17)						\n\
507*4882a593Smuzhiyun 	nop							\n\
508*4882a593Smuzhiyun 								\n\
509*4882a593Smuzhiyun 	ldq $0,40($17)						\n\
510*4882a593Smuzhiyun 	ldq $1,40($18)						\n\
511*4882a593Smuzhiyun 	ldq $3,48($17)						\n\
512*4882a593Smuzhiyun 	ldq $4,48($18)						\n\
513*4882a593Smuzhiyun 								\n\
514*4882a593Smuzhiyun 	ldq $6,56($17)						\n\
515*4882a593Smuzhiyun 	ldq $7,56($18)						\n\
516*4882a593Smuzhiyun 	ldq $2,40($19)						\n\
517*4882a593Smuzhiyun 	ldq $5,48($19)						\n\
518*4882a593Smuzhiyun 								\n\
519*4882a593Smuzhiyun 	ldq $20,56($19)						\n\
520*4882a593Smuzhiyun 	ldq $31,256($17)					\n\
521*4882a593Smuzhiyun 	ldq $31,256($18)					\n\
522*4882a593Smuzhiyun 	ldq $31,256($19)					\n\
523*4882a593Smuzhiyun 								\n\
524*4882a593Smuzhiyun 	xor $0,$1,$1		# 6 cycles from $1 load		\n\
525*4882a593Smuzhiyun 	xor $3,$4,$4		# 5 cycles from $4 load		\n\
526*4882a593Smuzhiyun 	xor $6,$7,$7		# 5 cycles from $7 load		\n\
527*4882a593Smuzhiyun 	xor $1,$2,$2		# 4 cycles from $2 load		\n\
528*4882a593Smuzhiyun 								\n\
529*4882a593Smuzhiyun 	xor $4,$5,$5		# 5 cycles from $5 load		\n\
530*4882a593Smuzhiyun 	xor $7,$20,$20		# 4 cycles from $20 load	\n\
531*4882a593Smuzhiyun 	stq $2,40($17)						\n\
532*4882a593Smuzhiyun 	subq $16,1,$16						\n\
533*4882a593Smuzhiyun 								\n\
534*4882a593Smuzhiyun 	stq $5,48($17)						\n\
535*4882a593Smuzhiyun 	addq $19,64,$19						\n\
536*4882a593Smuzhiyun 	stq $20,56($17)						\n\
537*4882a593Smuzhiyun 	addq $18,64,$18						\n\
538*4882a593Smuzhiyun 								\n\
539*4882a593Smuzhiyun 	addq $17,64,$17						\n\
540*4882a593Smuzhiyun 	bgt $16,3b						\n\
541*4882a593Smuzhiyun 	ret							\n\
542*4882a593Smuzhiyun 	.end xor_alpha_prefetch_3				\n\
543*4882a593Smuzhiyun 								\n\
544*4882a593Smuzhiyun 	.align 3						\n\
545*4882a593Smuzhiyun 	.ent xor_alpha_prefetch_4				\n\
546*4882a593Smuzhiyun xor_alpha_prefetch_4:						\n\
547*4882a593Smuzhiyun 	.prologue 0						\n\
548*4882a593Smuzhiyun 	srl $16, 6, $16						\n\
549*4882a593Smuzhiyun 								\n\
550*4882a593Smuzhiyun 	ldq $31, 0($17)						\n\
551*4882a593Smuzhiyun 	ldq $31, 0($18)						\n\
552*4882a593Smuzhiyun 	ldq $31, 0($19)						\n\
553*4882a593Smuzhiyun 	ldq $31, 0($20)						\n\
554*4882a593Smuzhiyun 								\n\
555*4882a593Smuzhiyun 	ldq $31, 64($17)					\n\
556*4882a593Smuzhiyun 	ldq $31, 64($18)					\n\
557*4882a593Smuzhiyun 	ldq $31, 64($19)					\n\
558*4882a593Smuzhiyun 	ldq $31, 64($20)					\n\
559*4882a593Smuzhiyun 								\n\
560*4882a593Smuzhiyun 	ldq $31, 128($17)					\n\
561*4882a593Smuzhiyun 	ldq $31, 128($18)					\n\
562*4882a593Smuzhiyun 	ldq $31, 128($19)					\n\
563*4882a593Smuzhiyun 	ldq $31, 128($20)					\n\
564*4882a593Smuzhiyun 								\n\
565*4882a593Smuzhiyun 	ldq $31, 192($17)					\n\
566*4882a593Smuzhiyun 	ldq $31, 192($18)					\n\
567*4882a593Smuzhiyun 	ldq $31, 192($19)					\n\
568*4882a593Smuzhiyun 	ldq $31, 192($20)					\n\
569*4882a593Smuzhiyun 	.align 4						\n\
570*4882a593Smuzhiyun 4:								\n\
571*4882a593Smuzhiyun 	ldq $0,0($17)						\n\
572*4882a593Smuzhiyun 	ldq $1,0($18)						\n\
573*4882a593Smuzhiyun 	ldq $2,0($19)						\n\
574*4882a593Smuzhiyun 	ldq $3,0($20)						\n\
575*4882a593Smuzhiyun 								\n\
576*4882a593Smuzhiyun 	ldq $4,8($17)						\n\
577*4882a593Smuzhiyun 	ldq $5,8($18)						\n\
578*4882a593Smuzhiyun 	ldq $6,8($19)						\n\
579*4882a593Smuzhiyun 	ldq $7,8($20)						\n\
580*4882a593Smuzhiyun 								\n\
581*4882a593Smuzhiyun 	ldq $21,16($17)						\n\
582*4882a593Smuzhiyun 	ldq $22,16($18)						\n\
583*4882a593Smuzhiyun 	ldq $23,16($19)						\n\
584*4882a593Smuzhiyun 	ldq $24,16($20)						\n\
585*4882a593Smuzhiyun 								\n\
586*4882a593Smuzhiyun 	ldq $25,24($17)						\n\
587*4882a593Smuzhiyun 	xor $0,$1,$1		# 6 cycles from $1 load		\n\
588*4882a593Smuzhiyun 	ldq $27,24($18)						\n\
589*4882a593Smuzhiyun 	xor $2,$3,$3		# 6 cycles from $3 load		\n\
590*4882a593Smuzhiyun 								\n\
591*4882a593Smuzhiyun 	ldq $0,24($19)						\n\
592*4882a593Smuzhiyun 	xor $1,$3,$3						\n\
593*4882a593Smuzhiyun 	ldq $1,24($20)						\n\
594*4882a593Smuzhiyun 	xor $4,$5,$5		# 7 cycles from $5 load		\n\
595*4882a593Smuzhiyun 								\n\
596*4882a593Smuzhiyun 	stq $3,0($17)						\n\
597*4882a593Smuzhiyun 	xor $6,$7,$7						\n\
598*4882a593Smuzhiyun 	xor $21,$22,$22		# 7 cycles from $22 load	\n\
599*4882a593Smuzhiyun 	xor $5,$7,$7						\n\
600*4882a593Smuzhiyun 								\n\
601*4882a593Smuzhiyun 	stq $7,8($17)						\n\
602*4882a593Smuzhiyun 	xor $23,$24,$24		# 7 cycles from $24 load	\n\
603*4882a593Smuzhiyun 	ldq $2,32($17)						\n\
604*4882a593Smuzhiyun 	xor $22,$24,$24						\n\
605*4882a593Smuzhiyun 								\n\
606*4882a593Smuzhiyun 	ldq $3,32($18)						\n\
607*4882a593Smuzhiyun 	ldq $4,32($19)						\n\
608*4882a593Smuzhiyun 	ldq $5,32($20)						\n\
609*4882a593Smuzhiyun 	xor $25,$27,$27		# 8 cycles from $27 load	\n\
610*4882a593Smuzhiyun 								\n\
611*4882a593Smuzhiyun 	ldq $6,40($17)						\n\
612*4882a593Smuzhiyun 	ldq $7,40($18)						\n\
613*4882a593Smuzhiyun 	ldq $21,40($19)						\n\
614*4882a593Smuzhiyun 	ldq $22,40($20)						\n\
615*4882a593Smuzhiyun 								\n\
616*4882a593Smuzhiyun 	stq $24,16($17)						\n\
617*4882a593Smuzhiyun 	xor $0,$1,$1		# 9 cycles from $1 load		\n\
618*4882a593Smuzhiyun 	xor $2,$3,$3		# 5 cycles from $3 load		\n\
619*4882a593Smuzhiyun 	xor $27,$1,$1						\n\
620*4882a593Smuzhiyun 								\n\
621*4882a593Smuzhiyun 	stq $1,24($17)						\n\
622*4882a593Smuzhiyun 	xor $4,$5,$5		# 5 cycles from $5 load		\n\
623*4882a593Smuzhiyun 	ldq $23,48($17)						\n\
624*4882a593Smuzhiyun 	xor $3,$5,$5						\n\
625*4882a593Smuzhiyun 								\n\
626*4882a593Smuzhiyun 	ldq $24,48($18)						\n\
627*4882a593Smuzhiyun 	ldq $25,48($19)						\n\
628*4882a593Smuzhiyun 	ldq $27,48($20)						\n\
629*4882a593Smuzhiyun 	ldq $0,56($17)						\n\
630*4882a593Smuzhiyun 								\n\
631*4882a593Smuzhiyun 	ldq $1,56($18)						\n\
632*4882a593Smuzhiyun 	ldq $2,56($19)						\n\
633*4882a593Smuzhiyun 	ldq $3,56($20)						\n\
634*4882a593Smuzhiyun 	xor $6,$7,$7		# 8 cycles from $6 load		\n\
635*4882a593Smuzhiyun 								\n\
636*4882a593Smuzhiyun 	ldq $31,256($17)					\n\
637*4882a593Smuzhiyun 	xor $21,$22,$22		# 8 cycles from $22 load	\n\
638*4882a593Smuzhiyun 	ldq $31,256($18)					\n\
639*4882a593Smuzhiyun 	xor $7,$22,$22						\n\
640*4882a593Smuzhiyun 								\n\
641*4882a593Smuzhiyun 	ldq $31,256($19)					\n\
642*4882a593Smuzhiyun 	xor $23,$24,$24		# 6 cycles from $24 load	\n\
643*4882a593Smuzhiyun 	ldq $31,256($20)					\n\
644*4882a593Smuzhiyun 	xor $25,$27,$27		# 6 cycles from $27 load	\n\
645*4882a593Smuzhiyun 								\n\
646*4882a593Smuzhiyun 	stq $5,32($17)						\n\
647*4882a593Smuzhiyun 	xor $24,$27,$27						\n\
648*4882a593Smuzhiyun 	xor $0,$1,$1		# 7 cycles from $1 load		\n\
649*4882a593Smuzhiyun 	xor $2,$3,$3		# 6 cycles from $3 load		\n\
650*4882a593Smuzhiyun 								\n\
651*4882a593Smuzhiyun 	stq $22,40($17)						\n\
652*4882a593Smuzhiyun 	xor $1,$3,$3						\n\
653*4882a593Smuzhiyun 	stq $27,48($17)						\n\
654*4882a593Smuzhiyun 	subq $16,1,$16						\n\
655*4882a593Smuzhiyun 								\n\
656*4882a593Smuzhiyun 	stq $3,56($17)						\n\
657*4882a593Smuzhiyun 	addq $20,64,$20						\n\
658*4882a593Smuzhiyun 	addq $19,64,$19						\n\
659*4882a593Smuzhiyun 	addq $18,64,$18						\n\
660*4882a593Smuzhiyun 								\n\
661*4882a593Smuzhiyun 	addq $17,64,$17						\n\
662*4882a593Smuzhiyun 	bgt $16,4b						\n\
663*4882a593Smuzhiyun 	ret							\n\
664*4882a593Smuzhiyun 	.end xor_alpha_prefetch_4				\n\
665*4882a593Smuzhiyun 								\n\
666*4882a593Smuzhiyun 	.align 3						\n\
667*4882a593Smuzhiyun 	.ent xor_alpha_prefetch_5				\n\
668*4882a593Smuzhiyun xor_alpha_prefetch_5:						\n\
669*4882a593Smuzhiyun 	.prologue 0						\n\
670*4882a593Smuzhiyun 	srl $16, 6, $16						\n\
671*4882a593Smuzhiyun 								\n\
672*4882a593Smuzhiyun 	ldq $31, 0($17)						\n\
673*4882a593Smuzhiyun 	ldq $31, 0($18)						\n\
674*4882a593Smuzhiyun 	ldq $31, 0($19)						\n\
675*4882a593Smuzhiyun 	ldq $31, 0($20)						\n\
676*4882a593Smuzhiyun 	ldq $31, 0($21)						\n\
677*4882a593Smuzhiyun 								\n\
678*4882a593Smuzhiyun 	ldq $31, 64($17)					\n\
679*4882a593Smuzhiyun 	ldq $31, 64($18)					\n\
680*4882a593Smuzhiyun 	ldq $31, 64($19)					\n\
681*4882a593Smuzhiyun 	ldq $31, 64($20)					\n\
682*4882a593Smuzhiyun 	ldq $31, 64($21)					\n\
683*4882a593Smuzhiyun 								\n\
684*4882a593Smuzhiyun 	ldq $31, 128($17)					\n\
685*4882a593Smuzhiyun 	ldq $31, 128($18)					\n\
686*4882a593Smuzhiyun 	ldq $31, 128($19)					\n\
687*4882a593Smuzhiyun 	ldq $31, 128($20)					\n\
688*4882a593Smuzhiyun 	ldq $31, 128($21)					\n\
689*4882a593Smuzhiyun 								\n\
690*4882a593Smuzhiyun 	ldq $31, 192($17)					\n\
691*4882a593Smuzhiyun 	ldq $31, 192($18)					\n\
692*4882a593Smuzhiyun 	ldq $31, 192($19)					\n\
693*4882a593Smuzhiyun 	ldq $31, 192($20)					\n\
694*4882a593Smuzhiyun 	ldq $31, 192($21)					\n\
695*4882a593Smuzhiyun 	.align 4						\n\
696*4882a593Smuzhiyun 5:								\n\
697*4882a593Smuzhiyun 	ldq $0,0($17)						\n\
698*4882a593Smuzhiyun 	ldq $1,0($18)						\n\
699*4882a593Smuzhiyun 	ldq $2,0($19)						\n\
700*4882a593Smuzhiyun 	ldq $3,0($20)						\n\
701*4882a593Smuzhiyun 								\n\
702*4882a593Smuzhiyun 	ldq $4,0($21)						\n\
703*4882a593Smuzhiyun 	ldq $5,8($17)						\n\
704*4882a593Smuzhiyun 	ldq $6,8($18)						\n\
705*4882a593Smuzhiyun 	ldq $7,8($19)						\n\
706*4882a593Smuzhiyun 								\n\
707*4882a593Smuzhiyun 	ldq $22,8($20)						\n\
708*4882a593Smuzhiyun 	ldq $23,8($21)						\n\
709*4882a593Smuzhiyun 	ldq $24,16($17)						\n\
710*4882a593Smuzhiyun 	ldq $25,16($18)						\n\
711*4882a593Smuzhiyun 								\n\
712*4882a593Smuzhiyun 	ldq $27,16($19)						\n\
713*4882a593Smuzhiyun 	xor $0,$1,$1		# 6 cycles from $1 load		\n\
714*4882a593Smuzhiyun 	ldq $28,16($20)						\n\
715*4882a593Smuzhiyun 	xor $2,$3,$3		# 6 cycles from $3 load		\n\
716*4882a593Smuzhiyun 								\n\
717*4882a593Smuzhiyun 	ldq $0,16($21)						\n\
718*4882a593Smuzhiyun 	xor $1,$3,$3						\n\
719*4882a593Smuzhiyun 	ldq $1,24($17)						\n\
720*4882a593Smuzhiyun 	xor $3,$4,$4		# 7 cycles from $4 load		\n\
721*4882a593Smuzhiyun 								\n\
722*4882a593Smuzhiyun 	stq $4,0($17)						\n\
723*4882a593Smuzhiyun 	xor $5,$6,$6		# 7 cycles from $6 load		\n\
724*4882a593Smuzhiyun 	xor $7,$22,$22		# 7 cycles from $22 load	\n\
725*4882a593Smuzhiyun 	xor $6,$23,$23		# 7 cycles from $23 load	\n\
726*4882a593Smuzhiyun 								\n\
727*4882a593Smuzhiyun 	ldq $2,24($18)						\n\
728*4882a593Smuzhiyun 	xor $22,$23,$23						\n\
729*4882a593Smuzhiyun 	ldq $3,24($19)						\n\
730*4882a593Smuzhiyun 	xor $24,$25,$25		# 8 cycles from $25 load	\n\
731*4882a593Smuzhiyun 								\n\
732*4882a593Smuzhiyun 	stq $23,8($17)						\n\
733*4882a593Smuzhiyun 	xor $25,$27,$27		# 8 cycles from $27 load	\n\
734*4882a593Smuzhiyun 	ldq $4,24($20)						\n\
735*4882a593Smuzhiyun 	xor $28,$0,$0		# 7 cycles from $0 load		\n\
736*4882a593Smuzhiyun 								\n\
737*4882a593Smuzhiyun 	ldq $5,24($21)						\n\
738*4882a593Smuzhiyun 	xor $27,$0,$0						\n\
739*4882a593Smuzhiyun 	ldq $6,32($17)						\n\
740*4882a593Smuzhiyun 	ldq $7,32($18)						\n\
741*4882a593Smuzhiyun 								\n\
742*4882a593Smuzhiyun 	stq $0,16($17)						\n\
743*4882a593Smuzhiyun 	xor $1,$2,$2		# 6 cycles from $2 load		\n\
744*4882a593Smuzhiyun 	ldq $22,32($19)						\n\
745*4882a593Smuzhiyun 	xor $3,$4,$4		# 4 cycles from $4 load		\n\
746*4882a593Smuzhiyun 								\n\
747*4882a593Smuzhiyun 	ldq $23,32($20)						\n\
748*4882a593Smuzhiyun 	xor $2,$4,$4						\n\
749*4882a593Smuzhiyun 	ldq $24,32($21)						\n\
750*4882a593Smuzhiyun 	ldq $25,40($17)						\n\
751*4882a593Smuzhiyun 								\n\
752*4882a593Smuzhiyun 	ldq $27,40($18)						\n\
753*4882a593Smuzhiyun 	ldq $28,40($19)						\n\
754*4882a593Smuzhiyun 	ldq $0,40($20)						\n\
755*4882a593Smuzhiyun 	xor $4,$5,$5		# 7 cycles from $5 load		\n\
756*4882a593Smuzhiyun 								\n\
757*4882a593Smuzhiyun 	stq $5,24($17)						\n\
758*4882a593Smuzhiyun 	xor $6,$7,$7		# 7 cycles from $7 load		\n\
759*4882a593Smuzhiyun 	ldq $1,40($21)						\n\
760*4882a593Smuzhiyun 	ldq $2,48($17)						\n\
761*4882a593Smuzhiyun 								\n\
762*4882a593Smuzhiyun 	ldq $3,48($18)						\n\
763*4882a593Smuzhiyun 	xor $7,$22,$22		# 7 cycles from $22 load	\n\
764*4882a593Smuzhiyun 	ldq $4,48($19)						\n\
765*4882a593Smuzhiyun 	xor $23,$24,$24		# 6 cycles from $24 load	\n\
766*4882a593Smuzhiyun 								\n\
767*4882a593Smuzhiyun 	ldq $5,48($20)						\n\
768*4882a593Smuzhiyun 	xor $22,$24,$24						\n\
769*4882a593Smuzhiyun 	ldq $6,48($21)						\n\
770*4882a593Smuzhiyun 	xor $25,$27,$27		# 7 cycles from $27 load	\n\
771*4882a593Smuzhiyun 								\n\
772*4882a593Smuzhiyun 	stq $24,32($17)						\n\
773*4882a593Smuzhiyun 	xor $27,$28,$28		# 8 cycles from $28 load	\n\
774*4882a593Smuzhiyun 	ldq $7,56($17)						\n\
775*4882a593Smuzhiyun 	xor $0,$1,$1		# 6 cycles from $1 load		\n\
776*4882a593Smuzhiyun 								\n\
777*4882a593Smuzhiyun 	ldq $22,56($18)						\n\
778*4882a593Smuzhiyun 	ldq $23,56($19)						\n\
779*4882a593Smuzhiyun 	ldq $24,56($20)						\n\
780*4882a593Smuzhiyun 	ldq $25,56($21)						\n\
781*4882a593Smuzhiyun 								\n\
782*4882a593Smuzhiyun 	ldq $31,256($17)					\n\
783*4882a593Smuzhiyun 	xor $28,$1,$1						\n\
784*4882a593Smuzhiyun 	ldq $31,256($18)					\n\
785*4882a593Smuzhiyun 	xor $2,$3,$3		# 9 cycles from $3 load		\n\
786*4882a593Smuzhiyun 								\n\
787*4882a593Smuzhiyun 	ldq $31,256($19)					\n\
788*4882a593Smuzhiyun 	xor $3,$4,$4		# 9 cycles from $4 load		\n\
789*4882a593Smuzhiyun 	ldq $31,256($20)					\n\
790*4882a593Smuzhiyun 	xor $5,$6,$6		# 8 cycles from $6 load		\n\
791*4882a593Smuzhiyun 								\n\
792*4882a593Smuzhiyun 	stq $1,40($17)						\n\
793*4882a593Smuzhiyun 	xor $4,$6,$6						\n\
794*4882a593Smuzhiyun 	xor $7,$22,$22		# 7 cycles from $22 load	\n\
795*4882a593Smuzhiyun 	xor $23,$24,$24		# 6 cycles from $24 load	\n\
796*4882a593Smuzhiyun 								\n\
797*4882a593Smuzhiyun 	stq $6,48($17)						\n\
798*4882a593Smuzhiyun 	xor $22,$24,$24						\n\
799*4882a593Smuzhiyun 	ldq $31,256($21)					\n\
800*4882a593Smuzhiyun 	xor $24,$25,$25		# 8 cycles from $25 load	\n\
801*4882a593Smuzhiyun 								\n\
802*4882a593Smuzhiyun 	stq $25,56($17)						\n\
803*4882a593Smuzhiyun 	subq $16,1,$16						\n\
804*4882a593Smuzhiyun 	addq $21,64,$21						\n\
805*4882a593Smuzhiyun 	addq $20,64,$20						\n\
806*4882a593Smuzhiyun 								\n\
807*4882a593Smuzhiyun 	addq $19,64,$19						\n\
808*4882a593Smuzhiyun 	addq $18,64,$18						\n\
809*4882a593Smuzhiyun 	addq $17,64,$17						\n\
810*4882a593Smuzhiyun 	bgt $16,5b						\n\
811*4882a593Smuzhiyun 								\n\
812*4882a593Smuzhiyun 	ret							\n\
813*4882a593Smuzhiyun 	.end xor_alpha_prefetch_5				\n\
814*4882a593Smuzhiyun ");
815*4882a593Smuzhiyun 
816*4882a593Smuzhiyun static struct xor_block_template xor_block_alpha = {
817*4882a593Smuzhiyun 	.name	= "alpha",
818*4882a593Smuzhiyun 	.do_2	= xor_alpha_2,
819*4882a593Smuzhiyun 	.do_3	= xor_alpha_3,
820*4882a593Smuzhiyun 	.do_4	= xor_alpha_4,
821*4882a593Smuzhiyun 	.do_5	= xor_alpha_5,
822*4882a593Smuzhiyun };
823*4882a593Smuzhiyun 
824*4882a593Smuzhiyun static struct xor_block_template xor_block_alpha_prefetch = {
825*4882a593Smuzhiyun 	.name	= "alpha prefetch",
826*4882a593Smuzhiyun 	.do_2	= xor_alpha_prefetch_2,
827*4882a593Smuzhiyun 	.do_3	= xor_alpha_prefetch_3,
828*4882a593Smuzhiyun 	.do_4	= xor_alpha_prefetch_4,
829*4882a593Smuzhiyun 	.do_5	= xor_alpha_prefetch_5,
830*4882a593Smuzhiyun };
831*4882a593Smuzhiyun 
832*4882a593Smuzhiyun /* For grins, also test the generic routines.  */
833*4882a593Smuzhiyun #include <asm-generic/xor.h>
834*4882a593Smuzhiyun 
/*
 * Replace any earlier XOR_TRY_TEMPLATES definition with one that hands
 * four templates to xor_speed(), in this order: the 8regs and 32regs
 * templates first, then the two Alpha templates defined in this header.
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES do {					\
	xor_speed(&xor_block_8regs);				\
	xor_speed(&xor_block_32regs);				\
	xor_speed(&xor_block_alpha);				\
	xor_speed(&xor_block_alpha_prefetch);			\
} while (0)
843*4882a593Smuzhiyun 
/*
 * Pick the template to use once benchmarking is done.
 *
 * FASTEST: the template the caller would otherwise use.
 *
 * Evaluates to &xor_block_alpha_prefetch when implver() reports
 * IMPLVER_EV6, because the prefetching routines are significantly faster
 * there in the cold cache case; otherwise evaluates to FASTEST.
 *
 * FASTEST is parenthesized so that any expression, including one with
 * lower precedence than ?: such as an assignment, expands correctly.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	(implver() == IMPLVER_EV6 ? &xor_block_alpha_prefetch : (FASTEST))
848