xref: /OK3568_Linux_fs/kernel/arch/sparc/lib/U1memcpy.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0 */
2*4882a593Smuzhiyun/* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy.
3*4882a593Smuzhiyun *
4*4882a593Smuzhiyun * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com)
5*4882a593Smuzhiyun * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz)
6*4882a593Smuzhiyun */
7*4882a593Smuzhiyun
/* Build-environment setup.
 * Kernel build: pull in the kernel headers listed below and use %g7 as
 * GLOBAL_SPARE.
 * Any other build: use %g5 as GLOBAL_SPARE and define ASI_BLK_P,
 * FPRS_FEF, VISEntry and VISExit locally.
 * NOTE(review): ENTRY/ENDPROC, VISExitHalf and EXPORT_SYMBOL are used
 * further down in this file but are not defined in the non-kernel branch
 * here; presumably a non-kernel includer supplies them -- confirm.
 */
8*4882a593Smuzhiyun#ifdef __KERNEL__
9*4882a593Smuzhiyun#include <linux/linkage.h>
10*4882a593Smuzhiyun#include <asm/visasm.h>
11*4882a593Smuzhiyun#include <asm/asi.h>
12*4882a593Smuzhiyun#include <asm/export.h>
13*4882a593Smuzhiyun#define GLOBAL_SPARE	g7
14*4882a593Smuzhiyun#else
15*4882a593Smuzhiyun#define GLOBAL_SPARE	g5
16*4882a593Smuzhiyun#define ASI_BLK_P 0xf0
17*4882a593Smuzhiyun#define FPRS_FEF  0x04
18*4882a593Smuzhiyun#ifdef MEMCPY_DEBUG
/* Debug VISEntry: save %fprs in %o5, write FPRS_FEF to %fprs, then also
 * zero %g1/%g2/%g3 and set the condition codes via "subcc %g0, %g0, %g0".
 */
19*4882a593Smuzhiyun#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
20*4882a593Smuzhiyun		 clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
/* VISExit: write the saved %fprs value, masked down to FPRS_FEF, back. */
21*4882a593Smuzhiyun#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
22*4882a593Smuzhiyun#else
/* Non-debug VISEntry/VISExit: same save/restore of %fprs through %o5,
 * without touching %g1/%g2/%g3 or the condition codes.
 */
23*4882a593Smuzhiyun#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
24*4882a593Smuzhiyun#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
25*4882a593Smuzhiyun#endif
26*4882a593Smuzhiyun#endif
27*4882a593Smuzhiyun
/* Overridable hooks.  Every macro in this group is defined only if the
 * includer has not already defined it.
 *
 * EX_LD / EX_LD_FP / EX_ST / EX_ST_FP wrap each load (LD) or store (ST)
 * in the copy code.  The first argument is the instruction; the second
 * names a U1_* fix-up routine.  The defaults below expand to the
 * instruction alone and drop the second argument.
 */
28*4882a593Smuzhiyun#ifndef EX_LD
29*4882a593Smuzhiyun#define EX_LD(x,y)	x
30*4882a593Smuzhiyun#endif
31*4882a593Smuzhiyun#ifndef EX_LD_FP
32*4882a593Smuzhiyun#define EX_LD_FP(x,y)	x
33*4882a593Smuzhiyun#endif
34*4882a593Smuzhiyun
35*4882a593Smuzhiyun#ifndef EX_ST
36*4882a593Smuzhiyun#define EX_ST(x,y)	x
37*4882a593Smuzhiyun#endif
38*4882a593Smuzhiyun#ifndef EX_ST_FP
39*4882a593Smuzhiyun#define EX_ST_FP(x,y)	x
40*4882a593Smuzhiyun#endif
41*4882a593Smuzhiyun
/* LOAD(type, addr, dest): emits "type [addr], dest". */
42*4882a593Smuzhiyun#ifndef LOAD
43*4882a593Smuzhiyun#define LOAD(type,addr,dest)	type [addr], dest
44*4882a593Smuzhiyun#endif
45*4882a593Smuzhiyun
/* LOAD_BLK(addr, dest): ldda through ASI_BLK_P into dest. */
46*4882a593Smuzhiyun#ifndef LOAD_BLK
47*4882a593Smuzhiyun#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
48*4882a593Smuzhiyun#endif
49*4882a593Smuzhiyun
/* STORE(type, src, addr): emits "type src, [addr]". */
50*4882a593Smuzhiyun#ifndef STORE
51*4882a593Smuzhiyun#define STORE(type,src,addr)	type src, [addr]
52*4882a593Smuzhiyun#endif
53*4882a593Smuzhiyun
/* STORE_BLK(src, addr): stda of src through ASI_BLK_P. */
54*4882a593Smuzhiyun#ifndef STORE_BLK
55*4882a593Smuzhiyun#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
56*4882a593Smuzhiyun#endif
57*4882a593Smuzhiyun
/* Name of the routine emitted by this file; memcpy by default. */
58*4882a593Smuzhiyun#ifndef FUNC_NAME
59*4882a593Smuzhiyun#define FUNC_NAME	memcpy
60*4882a593Smuzhiyun#endif
61*4882a593Smuzhiyun
/* Optional code placed near the top of the routine, right after the
 * length trap check; empty by default.
 */
62*4882a593Smuzhiyun#ifndef PREAMBLE
63*4882a593Smuzhiyun#define PREAMBLE
64*4882a593Smuzhiyun#endif
65*4882a593Smuzhiyun
/* Condition-code register named by the branches written with %XCC. */
66*4882a593Smuzhiyun#ifndef XCC
67*4882a593Smuzhiyun#define XCC xcc
68*4882a593Smuzhiyun#endif
69*4882a593Smuzhiyun
/* FREG_FROB(f1..f9): run faligndata over each adjacent pair of the nine
 * given double FP registers (f1:f2, f2:f3, ... f8:f9) and write the eight
 * results to %f48, %f50, ... %f62, i.e. one 64-byte output block.
 * Expands to exactly eight instructions.
 */
70*4882a593Smuzhiyun#define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9)		\
71*4882a593Smuzhiyun	faligndata		%f1, %f2, %f48;			\
72*4882a593Smuzhiyun	faligndata		%f2, %f3, %f50;			\
73*4882a593Smuzhiyun	faligndata		%f3, %f4, %f52;			\
74*4882a593Smuzhiyun	faligndata		%f4, %f5, %f54;			\
75*4882a593Smuzhiyun	faligndata		%f5, %f6, %f56;			\
76*4882a593Smuzhiyun	faligndata		%f6, %f7, %f58;			\
77*4882a593Smuzhiyun	faligndata		%f7, %f8, %f60;			\
78*4882a593Smuzhiyun	faligndata		%f8, %f9, %f62;
79*4882a593Smuzhiyun
/* MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, jmptgt): one step of the block
 * loop, six instructions:
 *   - block-load 64 bytes from [src] into the FP bank starting at fdest,
 *   - block-store the 64 bytes starting at fsrc to [dest],
 *   - src += 0x40,
 *   - GLOBAL_SPARE -= 0x40, and branch to jmptgt when it reaches zero,
 *   - dest += 0x40 in the branch delay slot (so it runs either way).
 * Both memory accesses name U1_gs_80_fp as their fix-up routine.
 * Note the branch uses the literal %xcc, not the overridable %XCC.
 */
80*4882a593Smuzhiyun#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, jmptgt)			\
81*4882a593Smuzhiyun	EX_LD_FP(LOAD_BLK(%src, %fdest), U1_gs_80_fp);			\
82*4882a593Smuzhiyun	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_80_fp);			\
83*4882a593Smuzhiyun	add			%src, 0x40, %src;			\
84*4882a593Smuzhiyun	subcc			%GLOBAL_SPARE, 0x40, %GLOBAL_SPARE;	\
85*4882a593Smuzhiyun	be,pn			%xcc, jmptgt;				\
86*4882a593Smuzhiyun	 add			%dest, 0x40, %dest;			\
87*4882a593Smuzhiyun
/* LOOP_CHUNK1/2/3: MAIN_LOOP_CHUNK that always stores from %f48 and
 * reloads the bank at %f0, %f16 or %f32 respectively.
 */
88*4882a593Smuzhiyun#define LOOP_CHUNK1(src, dest, branch_dest)		\
89*4882a593Smuzhiyun	MAIN_LOOP_CHUNK(src, dest, f0,  f48, branch_dest)
90*4882a593Smuzhiyun#define LOOP_CHUNK2(src, dest, branch_dest)		\
91*4882a593Smuzhiyun	MAIN_LOOP_CHUNK(src, dest, f16, f48, branch_dest)
92*4882a593Smuzhiyun#define LOOP_CHUNK3(src, dest, branch_dest)		\
93*4882a593Smuzhiyun	MAIN_LOOP_CHUNK(src, dest, f32, f48, branch_dest)
94*4882a593Smuzhiyun
/* DO_SYNC: a full "membar #Sync". */
95*4882a593Smuzhiyun#define DO_SYNC			membar	#Sync;
/* STORE_SYNC(dest, fsrc): block-store the bank at fsrc to [dest], advance
 * dest by 0x40, then DO_SYNC.  Three instructions; fix-up U1_gs_80_fp.
 */
96*4882a593Smuzhiyun#define STORE_SYNC(dest, fsrc)				\
97*4882a593Smuzhiyun	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_80_fp);	\
98*4882a593Smuzhiyun	add			%dest, 0x40, %dest;	\
99*4882a593Smuzhiyun	DO_SYNC
100*4882a593Smuzhiyun
/* STORE_JUMP(dest, fsrc, target): block-store the bank at fsrc to [dest],
 * advance dest by 0x40, then branch to target with a nop in the delay
 * slot.  Four instructions; fix-up U1_gs_40_fp.
 */
101*4882a593Smuzhiyun#define STORE_JUMP(dest, fsrc, target)			\
102*4882a593Smuzhiyun	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_40_fp);	\
103*4882a593Smuzhiyun	add			%dest, 0x40, %dest;	\
104*4882a593Smuzhiyun	ba,pt			%xcc, target;		\
105*4882a593Smuzhiyun	 nop;
106*4882a593Smuzhiyun
/* FINISH_VISCHUNK(dest, f0, f1): %g3 -= 8; if the result is negative,
 * branch to local label 95.  The faligndata of f0:f1 into %f48 sits in
 * the delay slot, so it executes on both paths.  Otherwise store %f48
 * (8 bytes) to [dest] and advance dest by 8.  Fix-up: U1_g3_8_fp.
 */
107*4882a593Smuzhiyun#define FINISH_VISCHUNK(dest, f0, f1)			\
108*4882a593Smuzhiyun	subcc			%g3, 8, %g3;		\
109*4882a593Smuzhiyun	bl,pn			%xcc, 95f;		\
110*4882a593Smuzhiyun	 faligndata		%f0, %f1, %f48;		\
111*4882a593Smuzhiyun	EX_ST_FP(STORE(std, %f48, %dest), U1_g3_8_fp);	\
112*4882a593Smuzhiyun	add			%dest, 8, %dest;
113*4882a593Smuzhiyun
/* UNEVEN_VISCHUNK_LAST(dest, f0, f1): %g3 -= 8; branch to 95 if negative;
 * the delay slot copies f0 into f1 with fsrc2.  Nothing is stored and
 * the dest argument is unused.  Execution falls through to whatever
 * follows the macro.
 */
114*4882a593Smuzhiyun#define UNEVEN_VISCHUNK_LAST(dest, f0, f1)	\
115*4882a593Smuzhiyun	subcc			%g3, 8, %g3;	\
116*4882a593Smuzhiyun	bl,pn			%xcc, 95f;	\
117*4882a593Smuzhiyun	 fsrc2			%f0, %f1;
118*4882a593Smuzhiyun
/* UNEVEN_VISCHUNK: as UNEVEN_VISCHUNK_LAST, followed by an annulled
 * unconditional branch to local label 93 instead of falling through.
 */
119*4882a593Smuzhiyun#define UNEVEN_VISCHUNK(dest, f0, f1)		\
120*4882a593Smuzhiyun	UNEVEN_VISCHUNK_LAST(dest, f0, f1)	\
121*4882a593Smuzhiyun	ba,a,pt			%xcc, 93f;
122*4882a593Smuzhiyun
	/* Declare %g2 and %g3 as scratch registers to the assembler. */
123*4882a593Smuzhiyun	.register	%g2,#scratch
124*4882a593Smuzhiyun	.register	%g3,#scratch
125*4882a593Smuzhiyun
126*4882a593Smuzhiyun	.text
/* Fix-up routines named as the second argument of the EX_LD / EX_ST
 * wrappers in this file.  They are emitted (together with the default
 * EX_RETVAL) only when the includer has not already defined EX_RETVAL.
 *
 * Each routine returns with retl and leaves a value in %o0 computed from
 * the copy registers that are live at the access it is attached to; the
 * formula is noted above each routine.  Presumably this is the number of
 * bytes left uncopied -- confirm against the EX_* definitions of the
 * including file.  Several routines modify %g1, %g2, %g3, GLOBAL_SPARE
 * or %o2 while computing the result.
 *
 * Routines with the _fp suffix, and also U1_gs_0 / U1_gs_8 / U1_gs_10,
 * execute VISExitHalf first.
 * NOTE(review): U1_gs_0, U1_gs_8 and U1_gs_10 are referenced only from
 * the integer loop at label 72, which is reached without executing
 * VISEntry and which uses %o5 as a data register; confirm that running
 * VISExitHalf there is intended.
 */
127*4882a593Smuzhiyun#ifndef EX_RETVAL
128*4882a593Smuzhiyun#define EX_RETVAL(x)	x
/* %o0 = %g1 + 1 + %g2 + %o2 */
129*4882a593SmuzhiyunENTRY(U1_g1_1_fp)
130*4882a593Smuzhiyun	VISExitHalf
131*4882a593Smuzhiyun	add		%g1, 1, %g1
132*4882a593Smuzhiyun	add		%g1, %g2, %g1
133*4882a593Smuzhiyun	retl
134*4882a593Smuzhiyun	 add		%g1, %o2, %o0
135*4882a593SmuzhiyunENDPROC(U1_g1_1_fp)
/* %o0 = %g2 + %o2 */
136*4882a593SmuzhiyunENTRY(U1_g2_0_fp)
137*4882a593Smuzhiyun	VISExitHalf
138*4882a593Smuzhiyun	retl
139*4882a593Smuzhiyun	 add		%g2, %o2, %o0
140*4882a593SmuzhiyunENDPROC(U1_g2_0_fp)
/* %o0 = %g2 + 8 + %o2 */
141*4882a593SmuzhiyunENTRY(U1_g2_8_fp)
142*4882a593Smuzhiyun	VISExitHalf
143*4882a593Smuzhiyun	add		%g2, 8, %g2
144*4882a593Smuzhiyun	retl
145*4882a593Smuzhiyun	 add		%g2, %o2, %o0
146*4882a593SmuzhiyunENDPROC(U1_g2_8_fp)
/* %o0 = GLOBAL_SPARE + %g3 + %o2 */
147*4882a593SmuzhiyunENTRY(U1_gs_0_fp)
148*4882a593Smuzhiyun	VISExitHalf
149*4882a593Smuzhiyun	add		%GLOBAL_SPARE, %g3, %o0
150*4882a593Smuzhiyun	retl
151*4882a593Smuzhiyun	 add		%o0, %o2, %o0
152*4882a593SmuzhiyunENDPROC(U1_gs_0_fp)
/* %o0 = GLOBAL_SPARE + 0x80 + %g3 + %o2 */
153*4882a593SmuzhiyunENTRY(U1_gs_80_fp)
154*4882a593Smuzhiyun	VISExitHalf
155*4882a593Smuzhiyun	add		%GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
156*4882a593Smuzhiyun	add		%GLOBAL_SPARE, %g3, %o0
157*4882a593Smuzhiyun	retl
158*4882a593Smuzhiyun	 add		%o0, %o2, %o0
159*4882a593SmuzhiyunENDPROC(U1_gs_80_fp)
/* %o0 = GLOBAL_SPARE + 0x40 + %g3 + %o2 */
160*4882a593SmuzhiyunENTRY(U1_gs_40_fp)
161*4882a593Smuzhiyun	VISExitHalf
162*4882a593Smuzhiyun	add		%GLOBAL_SPARE, 0x40, %GLOBAL_SPARE
163*4882a593Smuzhiyun	add		%GLOBAL_SPARE, %g3, %o0
164*4882a593Smuzhiyun	retl
165*4882a593Smuzhiyun	 add		%o0, %o2, %o0
166*4882a593SmuzhiyunENDPROC(U1_gs_40_fp)
/* %o0 = %g3 + %o2 */
167*4882a593SmuzhiyunENTRY(U1_g3_0_fp)
168*4882a593Smuzhiyun	VISExitHalf
169*4882a593Smuzhiyun	retl
170*4882a593Smuzhiyun	 add		%g3, %o2, %o0
171*4882a593SmuzhiyunENDPROC(U1_g3_0_fp)
/* %o0 = %g3 + 8 + %o2 */
172*4882a593SmuzhiyunENTRY(U1_g3_8_fp)
173*4882a593Smuzhiyun	VISExitHalf
174*4882a593Smuzhiyun	add		%g3, 8, %g3
175*4882a593Smuzhiyun	retl
176*4882a593Smuzhiyun	 add		%g3, %o2, %o0
177*4882a593SmuzhiyunENDPROC(U1_g3_8_fp)
/* %o0 = %o2 */
178*4882a593SmuzhiyunENTRY(U1_o2_0_fp)
179*4882a593Smuzhiyun	VISExitHalf
180*4882a593Smuzhiyun	retl
181*4882a593Smuzhiyun	 mov		%o2, %o0
182*4882a593SmuzhiyunENDPROC(U1_o2_0_fp)
/* %o0 = %o2 + 1 */
183*4882a593SmuzhiyunENTRY(U1_o2_1_fp)
184*4882a593Smuzhiyun	VISExitHalf
185*4882a593Smuzhiyun	retl
186*4882a593Smuzhiyun	 add		%o2, 1, %o0
187*4882a593SmuzhiyunENDPROC(U1_o2_1_fp)
/* %o0 = GLOBAL_SPARE + %o2 */
188*4882a593SmuzhiyunENTRY(U1_gs_0)
189*4882a593Smuzhiyun	VISExitHalf
190*4882a593Smuzhiyun	retl
191*4882a593Smuzhiyun	 add		%GLOBAL_SPARE, %o2, %o0
192*4882a593SmuzhiyunENDPROC(U1_gs_0)
/* %o0 = GLOBAL_SPARE + %o2 + 0x8 */
193*4882a593SmuzhiyunENTRY(U1_gs_8)
194*4882a593Smuzhiyun	VISExitHalf
195*4882a593Smuzhiyun	add		%GLOBAL_SPARE, %o2, %GLOBAL_SPARE
196*4882a593Smuzhiyun	retl
197*4882a593Smuzhiyun	 add		%GLOBAL_SPARE, 0x8, %o0
198*4882a593SmuzhiyunENDPROC(U1_gs_8)
/* %o0 = GLOBAL_SPARE + %o2 + 0x10 */
199*4882a593SmuzhiyunENTRY(U1_gs_10)
200*4882a593Smuzhiyun	VISExitHalf
201*4882a593Smuzhiyun	add		%GLOBAL_SPARE, %o2, %GLOBAL_SPARE
202*4882a593Smuzhiyun	retl
203*4882a593Smuzhiyun	 add		%GLOBAL_SPARE, 0x10, %o0
204*4882a593SmuzhiyunENDPROC(U1_gs_10)
/* %o0 = %o2 */
205*4882a593SmuzhiyunENTRY(U1_o2_0)
206*4882a593Smuzhiyun	retl
207*4882a593Smuzhiyun	 mov		%o2, %o0
208*4882a593SmuzhiyunENDPROC(U1_o2_0)
/* %o0 = %o2 + 8 */
209*4882a593SmuzhiyunENTRY(U1_o2_8)
210*4882a593Smuzhiyun	retl
211*4882a593Smuzhiyun	 add		%o2, 8, %o0
212*4882a593SmuzhiyunENDPROC(U1_o2_8)
/* %o0 = %o2 + 4 */
213*4882a593SmuzhiyunENTRY(U1_o2_4)
214*4882a593Smuzhiyun	retl
215*4882a593Smuzhiyun	 add		%o2, 4, %o0
216*4882a593SmuzhiyunENDPROC(U1_o2_4)
/* %o0 = %o2 + 1 */
217*4882a593SmuzhiyunENTRY(U1_o2_1)
218*4882a593Smuzhiyun	retl
219*4882a593Smuzhiyun	 add		%o2, 1, %o0
220*4882a593SmuzhiyunENDPROC(U1_o2_1)
/* %o0 = %g1 + %o2 */
221*4882a593SmuzhiyunENTRY(U1_g1_0)
222*4882a593Smuzhiyun	retl
223*4882a593Smuzhiyun	 add		%g1, %o2, %o0
224*4882a593SmuzhiyunENDPROC(U1_g1_0)
/* %o0 = %g1 + 1 + %o2 */
225*4882a593SmuzhiyunENTRY(U1_g1_1)
226*4882a593Smuzhiyun	add		%g1, 1, %g1
227*4882a593Smuzhiyun	retl
228*4882a593Smuzhiyun	 add		%g1, %o2, %o0
229*4882a593SmuzhiyunENDPROC(U1_g1_1)
/* %o0 = GLOBAL_SPARE + (%o2 & 7) */
230*4882a593SmuzhiyunENTRY(U1_gs_0_o2_adj)
231*4882a593Smuzhiyun	and		%o2, 7, %o2
232*4882a593Smuzhiyun	retl
233*4882a593Smuzhiyun	 add		%GLOBAL_SPARE, %o2, %o0
234*4882a593SmuzhiyunENDPROC(U1_gs_0_o2_adj)
/* %o0 = GLOBAL_SPARE + 8 + (%o2 & 7) */
235*4882a593SmuzhiyunENTRY(U1_gs_8_o2_adj)
236*4882a593Smuzhiyun	and		%o2, 7, %o2
237*4882a593Smuzhiyun	add		%GLOBAL_SPARE, 8, %GLOBAL_SPARE
238*4882a593Smuzhiyun	retl
239*4882a593Smuzhiyun	 add		%GLOBAL_SPARE, %o2, %o0
240*4882a593SmuzhiyunENDPROC(U1_gs_8_o2_adj)
241*4882a593Smuzhiyun#endif
242*4882a593Smuzhiyun
/* FUNC_NAME(dst = %o0, src = %o1, len = %o2)
 *
 * Returns EX_RETVAL(dst) in %o0; dst is kept in %o4 for that purpose.
 * Dispatch on len as coded below:
 *   any of bits 63..31 set -> software trap 5
 *   len == 0               -> label 85 (return)
 *   len <  16              -> label 80 (word / byte copy)
 *   len <  5 * 64          -> label 70 (integer 8- and 16-byte copy)
 *   otherwise              -> falls through into the VIS block-copy path
 */
243*4882a593Smuzhiyun	.align		64
244*4882a593Smuzhiyun
245*4882a593Smuzhiyun	.globl		FUNC_NAME
246*4882a593Smuzhiyun	.type		FUNC_NAME,#function
247*4882a593SmuzhiyunFUNC_NAME:		/* %o0=dst, %o1=src, %o2=len */
	/* Trap if len >> 31 is non-zero. */
248*4882a593Smuzhiyun	srlx		%o2, 31, %g2
249*4882a593Smuzhiyun	cmp		%g2, 0
250*4882a593Smuzhiyun	tne		%xcc, 5
251*4882a593Smuzhiyun	PREAMBLE
	/* Save dst for the return value. */
252*4882a593Smuzhiyun	mov		%o0, %o4
253*4882a593Smuzhiyun	cmp		%o2, 0
254*4882a593Smuzhiyun	be,pn		%XCC, 85f
	/* Delay slot: %o3 = dst | src. */
255*4882a593Smuzhiyun	 or		%o0, %o1, %o3
256*4882a593Smuzhiyun	cmp		%o2, 16
257*4882a593Smuzhiyun	blu,a,pn	%XCC, 80f
	/* Annulled delay slot, executed only when the branch is taken:
	 * %o3 = dst | src | len.
	 */
258*4882a593Smuzhiyun	 or		%o3, %o2, %o3
259*4882a593Smuzhiyun
260*4882a593Smuzhiyun	cmp		%o2, (5 * 64)
261*4882a593Smuzhiyun	blu,pt		%XCC, 70f
	/* Delay slot: set the condition codes from (dst | src) & 7; label 70
	 * branches on them immediately.
	 */
262*4882a593Smuzhiyun	 andcc		%o3, 0x7, %g0
263*4882a593Smuzhiyun
	/* Large-copy path (len >= 5 * 64): align dst to 64 bytes, split the
	 * remaining length into block / 8-byte / byte parts, preload three
	 * 64-byte blocks and jump into one of the unrolled loop instances.
	 */
264*4882a593Smuzhiyun	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  */
265*4882a593Smuzhiyun	VISEntry
266*4882a593Smuzhiyun
267*4882a593Smuzhiyun	/* Is 'dst' already aligned on a 64-byte boundary? */
268*4882a593Smuzhiyun	andcc		%o0, 0x3f, %g2
269*4882a593Smuzhiyun	be,pt		%XCC, 2f
270*4882a593Smuzhiyun
271*4882a593Smuzhiyun	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
272*4882a593Smuzhiyun	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
273*4882a593Smuzhiyun	 * subtract this from 'len'.
274*4882a593Smuzhiyun	 */
	/* The next instruction is the delay slot of the branch above and
	 * runs on both paths: GLOBAL_SPARE = dst - src.
	 */
275*4882a593Smuzhiyun	 sub		%o0, %o1, %GLOBAL_SPARE
276*4882a593Smuzhiyun	sub		%g2, 0x40, %g2
277*4882a593Smuzhiyun	sub		%g0, %g2, %g2
278*4882a593Smuzhiyun	sub		%o2, %g2, %o2
	/* %g1 = number of single bytes (low 3 bits of the count); the delay
	 * slot leaves the 8-byte-multiple part of the count in %g2.
	 */
279*4882a593Smuzhiyun	andcc		%g2, 0x7, %g1
280*4882a593Smuzhiyun	be,pt		%icc, 2f
281*4882a593Smuzhiyun	 and		%g2, 0x38, %g2
282*4882a593Smuzhiyun
	/* Copy %g1 single bytes.  The destination is addressed as
	 * src + GLOBAL_SPARE; the condition codes set by subcc stay live
	 * across the load and store until the bgu.
	 */
283*4882a593Smuzhiyun1:	subcc		%g1, 0x1, %g1
284*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U1_g1_1_fp)
285*4882a593Smuzhiyun	EX_ST_FP(STORE(stb, %o3, %o1 + %GLOBAL_SPARE), U1_g1_1_fp)
286*4882a593Smuzhiyun	bgu,pt		%XCC, 1b
287*4882a593Smuzhiyun	 add		%o1, 0x1, %o1
288*4882a593Smuzhiyun
	/* Recompute dst from the advanced src. */
289*4882a593Smuzhiyun	add		%o1, %GLOBAL_SPARE, %o0
290*4882a593Smuzhiyun
	/* %g1 = src & 7; alignaddr (delay slot, runs either way) rounds %o1
	 * down to an 8-byte boundary.  Skip to 3 when no 8-byte chunks are
	 * needed (%g2 == 0).
	 */
291*4882a593Smuzhiyun2:	cmp		%g2, 0x0
292*4882a593Smuzhiyun	and		%o1, 0x7, %g1
293*4882a593Smuzhiyun	be,pt		%icc, 3f
294*4882a593Smuzhiyun	 alignaddr	%o1, %g0, %o1
295*4882a593Smuzhiyun
	/* Copy %g2 bytes, 8 at a time, through faligndata.  The loop is
	 * unrolled twice and alternates %f4 / %f6 as the newest doubleword.
	 */
296*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1, %f4), U1_g2_0_fp)
297*4882a593Smuzhiyun1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U1_g2_0_fp)
298*4882a593Smuzhiyun	add		%o1, 0x8, %o1
299*4882a593Smuzhiyun	subcc		%g2, 0x8, %g2
300*4882a593Smuzhiyun	faligndata	%f4, %f6, %f0
301*4882a593Smuzhiyun	EX_ST_FP(STORE(std, %f0, %o0), U1_g2_8_fp)
302*4882a593Smuzhiyun	be,pn		%icc, 3f
303*4882a593Smuzhiyun	 add		%o0, 0x8, %o0
304*4882a593Smuzhiyun
305*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U1_g2_0_fp)
306*4882a593Smuzhiyun	add		%o1, 0x8, %o1
307*4882a593Smuzhiyun	subcc		%g2, 0x8, %g2
308*4882a593Smuzhiyun	faligndata	%f6, %f4, %f0
309*4882a593Smuzhiyun	EX_ST_FP(STORE(std, %f0, %o0), U1_g2_8_fp)
310*4882a593Smuzhiyun	bne,pt		%icc, 1b
311*4882a593Smuzhiyun	 add		%o0, 0x8, %o0
312*4882a593Smuzhiyun
313*4882a593Smuzhiyun	/* Destination is 64-byte aligned.  */
314*4882a593Smuzhiyun3:
315*4882a593Smuzhiyun	membar		  #LoadStore | #StoreStore | #StoreLoad
316*4882a593Smuzhiyun
	/* With L = remaining length in %o2 at this point, the instructions
	 * up to the first block load compute:
	 *   GLOBAL_SPARE = (L - 0x40) & ~0x3f     bytes moved as 64-byte blocks
	 *   %g3 = ((L - GLOBAL_SPARE) & ~7) - 8   bytes for the 8-byte finish code
	 *   %o2 = L - GLOBAL_SPARE - %g3          trailing single bytes
	 *   %g1 = unaligned src + GLOBAL_SPARE + %g3 (address of those bytes)
	 *   %g2 = (unaligned src >> 3) & 7        loop-instance index
	 *   %o1 = src rounded down to 64 bytes
	 * Three 64-byte blocks are then loaded into the banks at %f0, %f16
	 * and %f32, and GLOBAL_SPARE is reduced by 0x80.
	 */
317*4882a593Smuzhiyun	subcc		%o2, 0x40, %GLOBAL_SPARE
318*4882a593Smuzhiyun	add		%o1, %g1, %g1
319*4882a593Smuzhiyun	andncc		%GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE
320*4882a593Smuzhiyun	srl		%g1, 3, %g2
321*4882a593Smuzhiyun	sub		%o2, %GLOBAL_SPARE, %g3
322*4882a593Smuzhiyun	andn		%o1, (0x40 - 1), %o1
323*4882a593Smuzhiyun	and		%g2, 7, %g2
324*4882a593Smuzhiyun	andncc		%g3, 0x7, %g3
325*4882a593Smuzhiyun	fsrc2		%f0, %f2
326*4882a593Smuzhiyun	sub		%g3, 0x8, %g3
327*4882a593Smuzhiyun	sub		%o2, %GLOBAL_SPARE, %o2
328*4882a593Smuzhiyun
329*4882a593Smuzhiyun	add		%g1, %GLOBAL_SPARE, %g1
330*4882a593Smuzhiyun	subcc		%o2, %g3, %o2
331*4882a593Smuzhiyun
332*4882a593Smuzhiyun	EX_LD_FP(LOAD_BLK(%o1, %f0), U1_gs_0_fp)
333*4882a593Smuzhiyun	add		%o1, 0x40, %o1
334*4882a593Smuzhiyun	add		%g1, %g3, %g1
335*4882a593Smuzhiyun	EX_LD_FP(LOAD_BLK(%o1, %f16), U1_gs_0_fp)
336*4882a593Smuzhiyun	add		%o1, 0x40, %o1
337*4882a593Smuzhiyun	sub		%GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
338*4882a593Smuzhiyun	EX_LD_FP(LOAD_BLK(%o1, %f32), U1_gs_80_fp)
339*4882a593Smuzhiyun	add		%o1, 0x40, %o1
340*4882a593Smuzhiyun
341*4882a593Smuzhiyun	/* There are 8 instances of the unrolled loop,
342*4882a593Smuzhiyun	 * one for each possible alignment of the
343*4882a593Smuzhiyun	 * source buffer.  Each loop instance is 452
344*4882a593Smuzhiyun	 * bytes.
345*4882a593Smuzhiyun	 */
	/* %g2 = 452 * index, built as ((index * 8 - index) * 16 + index) * 4.
	 * The jump target is the address of the first instance (label 1
	 * after the .align) plus that offset.
	 */
346*4882a593Smuzhiyun	sll		%g2, 3, %o3
347*4882a593Smuzhiyun	sub		%o3, %g2, %o3
348*4882a593Smuzhiyun	sllx		%o3, 4, %o3
349*4882a593Smuzhiyun	add		%o3, %g2, %o3
350*4882a593Smuzhiyun	sllx		%o3, 2, %g2
351*4882a593Smuzhiyun1:	rd		%pc, %o3
352*4882a593Smuzhiyun	add		%o3, %lo(1f - 1b), %o3
353*4882a593Smuzhiyun	jmpl		%o3 + %g2, %g0
354*4882a593Smuzhiyun	 nop
355*4882a593Smuzhiyun
	/* Eight unrolled loop instances, entered by the computed jmpl above.
	 * Each instance is 113 instructions (452 bytes) and nothing may be
	 * inserted that changes that size:
	 *   loop body : 3 x (FREG_FROB + LOOP_CHUNKn) + ba + delay slot = 44
	 *   exits 1/2/3: 3 x (FREG_FROB + STORE_SYNC + FREG_FROB + STORE_JUMP) = 69
	 * Instance k starts its FREG_FROB window at %f(2k).  The loop body
	 * rotates through the three banks at %f0 / %f16 / %f32; "ba 1b+4"
	 * skips the first faligndata of the instance because the delay slot
	 * has already performed it.  When GLOBAL_SPARE reaches zero, the
	 * LOOP_CHUNK branches to exit 1, 2 or 3, which stores the two blocks
	 * still held in registers and jumps to finish label 40+k, 48+k or
	 * 56+k respectively.
	 */
356*4882a593Smuzhiyun	.align		64
	/* Instance 0 */
357*4882a593Smuzhiyun1:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
358*4882a593Smuzhiyun	LOOP_CHUNK1(o1, o0, 1f)
359*4882a593Smuzhiyun	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
360*4882a593Smuzhiyun	LOOP_CHUNK2(o1, o0, 2f)
361*4882a593Smuzhiyun	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
362*4882a593Smuzhiyun	LOOP_CHUNK3(o1, o0, 3f)
363*4882a593Smuzhiyun	ba,pt		%xcc, 1b+4
364*4882a593Smuzhiyun	 faligndata	%f0, %f2, %f48
365*4882a593Smuzhiyun1:	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
366*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
367*4882a593Smuzhiyun	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
368*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 40f)
369*4882a593Smuzhiyun2:	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
370*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
371*4882a593Smuzhiyun	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
372*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 48f)
373*4882a593Smuzhiyun3:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
374*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
375*4882a593Smuzhiyun	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
376*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 56f)
377*4882a593Smuzhiyun
	/* Instance 1 */
378*4882a593Smuzhiyun1:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
379*4882a593Smuzhiyun	LOOP_CHUNK1(o1, o0, 1f)
380*4882a593Smuzhiyun	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
381*4882a593Smuzhiyun	LOOP_CHUNK2(o1, o0, 2f)
382*4882a593Smuzhiyun	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
383*4882a593Smuzhiyun	LOOP_CHUNK3(o1, o0, 3f)
384*4882a593Smuzhiyun	ba,pt		%xcc, 1b+4
385*4882a593Smuzhiyun	 faligndata	%f2, %f4, %f48
386*4882a593Smuzhiyun1:	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
387*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
388*4882a593Smuzhiyun	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
389*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 41f)
390*4882a593Smuzhiyun2:	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
391*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
392*4882a593Smuzhiyun	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
393*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 49f)
394*4882a593Smuzhiyun3:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
395*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
396*4882a593Smuzhiyun	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
397*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 57f)
398*4882a593Smuzhiyun
	/* Instance 2 */
399*4882a593Smuzhiyun1:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
400*4882a593Smuzhiyun	LOOP_CHUNK1(o1, o0, 1f)
401*4882a593Smuzhiyun	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
402*4882a593Smuzhiyun	LOOP_CHUNK2(o1, o0, 2f)
403*4882a593Smuzhiyun	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
404*4882a593Smuzhiyun	LOOP_CHUNK3(o1, o0, 3f)
405*4882a593Smuzhiyun	ba,pt		%xcc, 1b+4
406*4882a593Smuzhiyun	 faligndata	%f4, %f6, %f48
407*4882a593Smuzhiyun1:	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
408*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
409*4882a593Smuzhiyun	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
410*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 42f)
411*4882a593Smuzhiyun2:	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
412*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
413*4882a593Smuzhiyun	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
414*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 50f)
415*4882a593Smuzhiyun3:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
416*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
417*4882a593Smuzhiyun	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
418*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 58f)
419*4882a593Smuzhiyun
	/* Instance 3 */
420*4882a593Smuzhiyun1:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
421*4882a593Smuzhiyun	LOOP_CHUNK1(o1, o0, 1f)
422*4882a593Smuzhiyun	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
423*4882a593Smuzhiyun	LOOP_CHUNK2(o1, o0, 2f)
424*4882a593Smuzhiyun	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
425*4882a593Smuzhiyun	LOOP_CHUNK3(o1, o0, 3f)
426*4882a593Smuzhiyun	ba,pt		%xcc, 1b+4
427*4882a593Smuzhiyun	 faligndata	%f6, %f8, %f48
428*4882a593Smuzhiyun1:	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
429*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
430*4882a593Smuzhiyun	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
431*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 43f)
432*4882a593Smuzhiyun2:	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
433*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
434*4882a593Smuzhiyun	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
435*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 51f)
436*4882a593Smuzhiyun3:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
437*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
438*4882a593Smuzhiyun	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
439*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 59f)
440*4882a593Smuzhiyun
	/* Instance 4 */
441*4882a593Smuzhiyun1:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
442*4882a593Smuzhiyun	LOOP_CHUNK1(o1, o0, 1f)
443*4882a593Smuzhiyun	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
444*4882a593Smuzhiyun	LOOP_CHUNK2(o1, o0, 2f)
445*4882a593Smuzhiyun	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
446*4882a593Smuzhiyun	LOOP_CHUNK3(o1, o0, 3f)
447*4882a593Smuzhiyun	ba,pt		%xcc, 1b+4
448*4882a593Smuzhiyun	 faligndata	%f8, %f10, %f48
449*4882a593Smuzhiyun1:	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
450*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
451*4882a593Smuzhiyun	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
452*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 44f)
453*4882a593Smuzhiyun2:	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
454*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
455*4882a593Smuzhiyun	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
456*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 52f)
457*4882a593Smuzhiyun3:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
458*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
459*4882a593Smuzhiyun	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
460*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 60f)
461*4882a593Smuzhiyun
	/* Instance 5 */
462*4882a593Smuzhiyun1:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
463*4882a593Smuzhiyun	LOOP_CHUNK1(o1, o0, 1f)
464*4882a593Smuzhiyun	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
465*4882a593Smuzhiyun	LOOP_CHUNK2(o1, o0, 2f)
466*4882a593Smuzhiyun	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
467*4882a593Smuzhiyun	LOOP_CHUNK3(o1, o0, 3f)
468*4882a593Smuzhiyun	ba,pt		%xcc, 1b+4
469*4882a593Smuzhiyun	 faligndata	%f10, %f12, %f48
470*4882a593Smuzhiyun1:	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
471*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
472*4882a593Smuzhiyun	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
473*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 45f)
474*4882a593Smuzhiyun2:	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
475*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
476*4882a593Smuzhiyun	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
477*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 53f)
478*4882a593Smuzhiyun3:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
479*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
480*4882a593Smuzhiyun	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
481*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 61f)
482*4882a593Smuzhiyun
	/* Instance 6 */
483*4882a593Smuzhiyun1:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
484*4882a593Smuzhiyun	LOOP_CHUNK1(o1, o0, 1f)
485*4882a593Smuzhiyun	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
486*4882a593Smuzhiyun	LOOP_CHUNK2(o1, o0, 2f)
487*4882a593Smuzhiyun	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
488*4882a593Smuzhiyun	LOOP_CHUNK3(o1, o0, 3f)
489*4882a593Smuzhiyun	ba,pt		%xcc, 1b+4
490*4882a593Smuzhiyun	 faligndata	%f12, %f14, %f48
491*4882a593Smuzhiyun1:	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
492*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
493*4882a593Smuzhiyun	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
494*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 46f)
495*4882a593Smuzhiyun2:	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
496*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
497*4882a593Smuzhiyun	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
498*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 54f)
499*4882a593Smuzhiyun3:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
500*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
501*4882a593Smuzhiyun	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
502*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 62f)
503*4882a593Smuzhiyun
	/* Instance 7 */
504*4882a593Smuzhiyun1:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
505*4882a593Smuzhiyun	LOOP_CHUNK1(o1, o0, 1f)
506*4882a593Smuzhiyun	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
507*4882a593Smuzhiyun	LOOP_CHUNK2(o1, o0, 2f)
508*4882a593Smuzhiyun	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
509*4882a593Smuzhiyun	LOOP_CHUNK3(o1, o0, 3f)
510*4882a593Smuzhiyun	ba,pt		%xcc, 1b+4
511*4882a593Smuzhiyun	 faligndata	%f14, %f16, %f48
512*4882a593Smuzhiyun1:	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
513*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
514*4882a593Smuzhiyun	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
515*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 47f)
516*4882a593Smuzhiyun2:	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
517*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
518*4882a593Smuzhiyun	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
519*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 55f)
520*4882a593Smuzhiyun3:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
521*4882a593Smuzhiyun	STORE_SYNC(o0, f48)
522*4882a593Smuzhiyun	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
523*4882a593Smuzhiyun	STORE_JUMP(o0, f48, 63f)
524*4882a593Smuzhiyun
	/* Finish labels 40..63, reached from the STORE_JUMPs of the loop
	 * instances.  Control enters at one label and falls through the
	 * following ones; each FINISH_VISCHUNK stores 8 more bytes while
	 * %g3 allows and branches to 95 once %g3 goes negative.  The rows
	 * 40..47, 48..55 and 56..63 work on the banks at %f0, %f16 and %f32.
	 * Each row ends by copying its last doubleword into %f0 and going
	 * to 93 (by branch for 47 and 55, by fall-through for 63).
	 */
525*4882a593Smuzhiyun40:	FINISH_VISCHUNK(o0, f0,  f2)
526*4882a593Smuzhiyun41:	FINISH_VISCHUNK(o0, f2,  f4)
527*4882a593Smuzhiyun42:	FINISH_VISCHUNK(o0, f4,  f6)
528*4882a593Smuzhiyun43:	FINISH_VISCHUNK(o0, f6,  f8)
529*4882a593Smuzhiyun44:	FINISH_VISCHUNK(o0, f8,  f10)
530*4882a593Smuzhiyun45:	FINISH_VISCHUNK(o0, f10, f12)
531*4882a593Smuzhiyun46:	FINISH_VISCHUNK(o0, f12, f14)
532*4882a593Smuzhiyun47:	UNEVEN_VISCHUNK(o0, f14, f0)
533*4882a593Smuzhiyun48:	FINISH_VISCHUNK(o0, f16, f18)
534*4882a593Smuzhiyun49:	FINISH_VISCHUNK(o0, f18, f20)
535*4882a593Smuzhiyun50:	FINISH_VISCHUNK(o0, f20, f22)
536*4882a593Smuzhiyun51:	FINISH_VISCHUNK(o0, f22, f24)
537*4882a593Smuzhiyun52:	FINISH_VISCHUNK(o0, f24, f26)
538*4882a593Smuzhiyun53:	FINISH_VISCHUNK(o0, f26, f28)
539*4882a593Smuzhiyun54:	FINISH_VISCHUNK(o0, f28, f30)
540*4882a593Smuzhiyun55:	UNEVEN_VISCHUNK(o0, f30, f0)
541*4882a593Smuzhiyun56:	FINISH_VISCHUNK(o0, f32, f34)
542*4882a593Smuzhiyun57:	FINISH_VISCHUNK(o0, f34, f36)
543*4882a593Smuzhiyun58:	FINISH_VISCHUNK(o0, f36, f38)
544*4882a593Smuzhiyun59:	FINISH_VISCHUNK(o0, f38, f40)
545*4882a593Smuzhiyun60:	FINISH_VISCHUNK(o0, f40, f42)
546*4882a593Smuzhiyun61:	FINISH_VISCHUNK(o0, f42, f44)
547*4882a593Smuzhiyun62:	FINISH_VISCHUNK(o0, f44, f46)
548*4882a593Smuzhiyun63:	UNEVEN_VISCHUNK_LAST(o0, f46, f0)
549*4882a593Smuzhiyun
	/* 93: keep loading fresh doublewords from [%o1] and storing aligned
	 * 8-byte results, alternating %f2 / %f0 as the newest doubleword,
	 * until %g3 goes negative.
	 */
550*4882a593Smuzhiyun93:	EX_LD_FP(LOAD(ldd, %o1, %f2), U1_g3_0_fp)
551*4882a593Smuzhiyun	add		%o1, 8, %o1
552*4882a593Smuzhiyun	subcc		%g3, 8, %g3
553*4882a593Smuzhiyun	faligndata	%f0, %f2, %f8
554*4882a593Smuzhiyun	EX_ST_FP(STORE(std, %f8, %o0), U1_g3_8_fp)
555*4882a593Smuzhiyun	bl,pn		%xcc, 95f
556*4882a593Smuzhiyun	 add		%o0, 8, %o0
557*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1, %f0), U1_g3_0_fp)
558*4882a593Smuzhiyun	add		%o1, 8, %o1
559*4882a593Smuzhiyun	subcc		%g3, 8, %g3
560*4882a593Smuzhiyun	faligndata	%f2, %f0, %f8
561*4882a593Smuzhiyun	EX_ST_FP(STORE(std, %f8, %o0), U1_g3_8_fp)
562*4882a593Smuzhiyun	bge,pt		%xcc, 93b
563*4882a593Smuzhiyun	 add		%o0, 8, %o0
564*4882a593Smuzhiyun
	/* 95: trailing bytes.  The delay slot (runs either way) reloads %o1
	 * from %g1, the byte-exact source address saved earlier; skip the
	 * byte loop when %o2 is zero.
	 */
565*4882a593Smuzhiyun95:	brz,pt		%o2, 2f
566*4882a593Smuzhiyun	 mov		%g1, %o1
567*4882a593Smuzhiyun
	/* Copy the remaining %o2 bytes one at a time. */
568*4882a593Smuzhiyun1:	EX_LD_FP(LOAD(ldub, %o1, %o3), U1_o2_0_fp)
569*4882a593Smuzhiyun	add		%o1, 1, %o1
570*4882a593Smuzhiyun	subcc		%o2, 1, %o2
571*4882a593Smuzhiyun	EX_ST_FP(STORE(stb, %o3, %o0), U1_o2_1_fp)
572*4882a593Smuzhiyun	bne,pt		%xcc, 1b
573*4882a593Smuzhiyun	 add		%o0, 1, %o0
574*4882a593Smuzhiyun
	/* Common exit of the VIS path: barrier, VISExit, and return the
	 * original dst saved in %o4.
	 */
575*4882a593Smuzhiyun2:	membar		#StoreLoad | #StoreStore
576*4882a593Smuzhiyun	VISExit
577*4882a593Smuzhiyun	retl
578*4882a593Smuzhiyun	 mov		EX_RETVAL(%o4), %o0
579*4882a593Smuzhiyun
	/* Medium-copy path, integer registers only.  On entry the condition
	 * codes hold (dst | src) & 7 from the delay slot of the dispatching
	 * branch; throughout this path the destination is addressed as
	 * src + %o3 with %o3 = dst - src unless noted.
	 */
580*4882a593Smuzhiyun	.align		64
581*4882a593Smuzhiyun70:	/* 16 <= len < (5 * 64) */
	/* Not both 8-byte aligned -> 75.  Delay slot: %o3 = dst - src. */
582*4882a593Smuzhiyun	bne,pn		%XCC, 75f
583*4882a593Smuzhiyun	 sub		%o0, %o1, %o3
584*4882a593Smuzhiyun
	/* 72: src and dst 8-byte aligned.  GLOBAL_SPARE = len & ~0xf is
	 * copied 16 bytes per iteration; %o2 keeps len & 0xf.
	 */
585*4882a593Smuzhiyun72:	andn		%o2, 0xf, %GLOBAL_SPARE
586*4882a593Smuzhiyun	and		%o2, 0xf, %o2
587*4882a593Smuzhiyun1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U1_gs_0)
588*4882a593Smuzhiyun	EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U1_gs_0)
589*4882a593Smuzhiyun	subcc		%GLOBAL_SPARE, 0x10, %GLOBAL_SPARE
590*4882a593Smuzhiyun	EX_ST(STORE(stx, %o5, %o1 + %o3), U1_gs_10)
591*4882a593Smuzhiyun	add		%o1, 0x8, %o1
592*4882a593Smuzhiyun	EX_ST(STORE(stx, %g1, %o1 + %o3), U1_gs_8)
593*4882a593Smuzhiyun	bgu,pt		%XCC, 1b
594*4882a593Smuzhiyun	 add		%o1, 0x8, %o1
	/* 73: copy one 8-byte and then one 4-byte piece if the matching bit
	 * of %o2 is set.
	 */
595*4882a593Smuzhiyun73:	andcc		%o2, 0x8, %g0
596*4882a593Smuzhiyun	be,pt		%XCC, 1f
597*4882a593Smuzhiyun	 nop
598*4882a593Smuzhiyun	EX_LD(LOAD(ldx, %o1, %o5), U1_o2_0)
599*4882a593Smuzhiyun	sub		%o2, 0x8, %o2
600*4882a593Smuzhiyun	EX_ST(STORE(stx, %o5, %o1 + %o3), U1_o2_8)
601*4882a593Smuzhiyun	add		%o1, 0x8, %o1
602*4882a593Smuzhiyun1:	andcc		%o2, 0x4, %g0
603*4882a593Smuzhiyun	be,pt		%XCC, 1f
604*4882a593Smuzhiyun	 nop
605*4882a593Smuzhiyun	EX_LD(LOAD(lduw, %o1, %o5), U1_o2_0)
606*4882a593Smuzhiyun	sub		%o2, 0x4, %o2
607*4882a593Smuzhiyun	EX_ST(STORE(stw, %o5, %o1 + %o3), U1_o2_4)
608*4882a593Smuzhiyun	add		%o1, 0x4, %o1
	/* Nothing left -> return at 85; otherwise finish in the byte loop
	 * at 90.
	 */
609*4882a593Smuzhiyun1:	cmp		%o2, 0
610*4882a593Smuzhiyun	be,pt		%XCC, 85f
611*4882a593Smuzhiyun	 nop
612*4882a593Smuzhiyun	ba,pt		%xcc, 90f
613*4882a593Smuzhiyun	 nop
614*4882a593Smuzhiyun
	/* 75: src and/or dst not 8-byte aligned.  If dst & 7 is zero go
	 * straight to 2; otherwise %g1 = 8 - (dst & 7) bytes are copied
	 * singly (and pre-subtracted from %o2) to align dst.  Only the
	 * andcc sets the condition codes tested by the be.
	 */
615*4882a593Smuzhiyun75:	andcc		%o0, 0x7, %g1
616*4882a593Smuzhiyun	sub		%g1, 0x8, %g1
617*4882a593Smuzhiyun	be,pn		%icc, 2f
618*4882a593Smuzhiyun	 sub		%g0, %g1, %g1
619*4882a593Smuzhiyun	sub		%o2, %g1, %o2
620*4882a593Smuzhiyun
621*4882a593Smuzhiyun1:	EX_LD(LOAD(ldub, %o1, %o5), U1_g1_0)
622*4882a593Smuzhiyun	subcc		%g1, 1, %g1
623*4882a593Smuzhiyun	EX_ST(STORE(stb, %o5, %o1 + %o3), U1_g1_1)
624*4882a593Smuzhiyun	bgu,pt		%icc, 1b
625*4882a593Smuzhiyun	 add		%o1, 1, %o1
626*4882a593Smuzhiyun
	/* 2: dst is 8-byte aligned now; recompute it from src.  If src is
	 * still misaligned go to 8 with %g1 = (src & 7) * 8 (a bit count,
	 * computed in the delay slot).
	 */
627*4882a593Smuzhiyun2:	add		%o1, %o3, %o0
628*4882a593Smuzhiyun	andcc		%o1, 0x7, %g1
629*4882a593Smuzhiyun	bne,pt		%icc, 8f
630*4882a593Smuzhiyun	 sll		%g1, 3, %g1
631*4882a593Smuzhiyun
	/* Both aligned: reuse the 16-byte loop when at least 16 bytes
	 * remain, otherwise only the 8/4/byte tail.
	 */
632*4882a593Smuzhiyun	cmp		%o2, 16
633*4882a593Smuzhiyun	bgeu,pt		%icc, 72b
634*4882a593Smuzhiyun	 nop
635*4882a593Smuzhiyun	ba,a,pt		%xcc, 73b
636*4882a593Smuzhiyun
	/* 8: dst aligned, src not.  Read aligned doublewords from src and
	 * merge neighbouring pairs with shifts: %g1 = left shift in bits,
	 * %o3 = 64 - %g1 = right shift, %g2 carries the already shifted
	 * previous doubleword.  GLOBAL_SPARE = %o2 & ~7 bytes are stored,
	 * 8 per iteration.  Each iteration loads [%o1 + 8], i.e. the
	 * aligned doubleword after the current one.
	 */
637*4882a593Smuzhiyun8:	mov		64, %o3
638*4882a593Smuzhiyun	andn		%o1, 0x7, %o1
639*4882a593Smuzhiyun	EX_LD(LOAD(ldx, %o1, %g2), U1_o2_0)
640*4882a593Smuzhiyun	sub		%o3, %g1, %o3
641*4882a593Smuzhiyun	andn		%o2, 0x7, %GLOBAL_SPARE
642*4882a593Smuzhiyun	sllx		%g2, %g1, %g2
643*4882a593Smuzhiyun1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U1_gs_0_o2_adj)
644*4882a593Smuzhiyun	subcc		%GLOBAL_SPARE, 0x8, %GLOBAL_SPARE
645*4882a593Smuzhiyun	add		%o1, 0x8, %o1
646*4882a593Smuzhiyun	srlx		%g3, %o3, %o5
647*4882a593Smuzhiyun	or		%o5, %g2, %o5
648*4882a593Smuzhiyun	EX_ST(STORE(stx, %o5, %o0), U1_gs_8_o2_adj)
649*4882a593Smuzhiyun	add		%o0, 0x8, %o0
650*4882a593Smuzhiyun	bgu,pt		%icc, 1b
651*4882a593Smuzhiyun	 sllx		%g3, %g1, %g2
652*4882a593Smuzhiyun
	/* Turn %g1 back into a byte offset and add it to %o1 (delay slot,
	 * runs either way) so %o1 is byte-exact again.  %o2 &= 7; return
	 * at 85 if zero, else finish at 90 with %o3 = dst - src.
	 */
653*4882a593Smuzhiyun	srl		%g1, 3, %g1
654*4882a593Smuzhiyun	andcc		%o2, 0x7, %o2
655*4882a593Smuzhiyun	be,pn		%icc, 85f
656*4882a593Smuzhiyun	 add		%o1, %g1, %o1
657*4882a593Smuzhiyun	ba,pt		%xcc, 90f
658*4882a593Smuzhiyun	 sub		%o0, %o1, %o3
659*4882a593Smuzhiyun
	/* Small-copy path.  On entry %o3 = dst | src | len, set in the
	 * annulled delay slot of the dispatching branch.
	 */
660*4882a593Smuzhiyun	.align		64
661*4882a593Smuzhiyun80:	/* 0 < len < 16 */
	/* If dst, src or len is not a multiple of 4, use the byte loop at
	 * 90.  Delay slot (runs either way): %o3 = dst - src.
	 */
662*4882a593Smuzhiyun	andcc		%o3, 0x3, %g0
663*4882a593Smuzhiyun	bne,pn		%XCC, 90f
664*4882a593Smuzhiyun	 sub		%o0, %o1, %o3
665*4882a593Smuzhiyun
	/* Copy 4 bytes per iteration; the destination is src + %o3. */
666*4882a593Smuzhiyun1:	EX_LD(LOAD(lduw, %o1, %g1), U1_o2_0)
667*4882a593Smuzhiyun	subcc		%o2, 4, %o2
668*4882a593Smuzhiyun	EX_ST(STORE(stw, %g1, %o1 + %o3), U1_o2_4)
669*4882a593Smuzhiyun	bgu,pt		%XCC, 1b
670*4882a593Smuzhiyun	 add		%o1, 4, %o1
671*4882a593Smuzhiyun
	/* 85: shared return for the integer paths and for len == 0; the
	 * result is the original dst saved in %o4.
	 */
672*4882a593Smuzhiyun85:	retl
673*4882a593Smuzhiyun	 mov		EX_RETVAL(%o4), %o0
674*4882a593Smuzhiyun
	/* 90: byte-at-a-time copy of %o2 bytes (callers arrive with
	 * %o2 != 0 and %o3 = dst - src), then return the original dst.
	 */
675*4882a593Smuzhiyun	.align		32
676*4882a593Smuzhiyun90:	EX_LD(LOAD(ldub, %o1, %g1), U1_o2_0)
677*4882a593Smuzhiyun	subcc		%o2, 1, %o2
678*4882a593Smuzhiyun	EX_ST(STORE(stb, %g1, %o1 + %o3), U1_o2_1)
679*4882a593Smuzhiyun	bgu,pt		%XCC, 90b
680*4882a593Smuzhiyun	 add		%o1, 1, %o1
681*4882a593Smuzhiyun	retl
682*4882a593Smuzhiyun	 mov		EX_RETVAL(%o4), %o0
683*4882a593Smuzhiyun
	/* Record the symbol size and export the routine. */
684*4882a593Smuzhiyun	.size		FUNC_NAME, .-FUNC_NAME
685*4882a593SmuzhiyunEXPORT_SYMBOL(FUNC_NAME)
686