xref: /OK3568_Linux_fs/kernel/arch/sparc/lib/U3memcpy.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0 */
2*4882a593Smuzhiyun/* U3memcpy.S: UltraSparc-III optimized memcpy.
3*4882a593Smuzhiyun *
4*4882a593Smuzhiyun * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
5*4882a593Smuzhiyun */
6*4882a593Smuzhiyun
7*4882a593Smuzhiyun#ifdef __KERNEL__
8*4882a593Smuzhiyun#include <linux/linkage.h>
9*4882a593Smuzhiyun#include <asm/visasm.h>
10*4882a593Smuzhiyun#include <asm/asi.h>
11*4882a593Smuzhiyun#define GLOBAL_SPARE	%g7
12*4882a593Smuzhiyun#else
13*4882a593Smuzhiyun#define ASI_BLK_P 0xf0
14*4882a593Smuzhiyun#define FPRS_FEF  0x04
15*4882a593Smuzhiyun#ifdef MEMCPY_DEBUG
16*4882a593Smuzhiyun#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
17*4882a593Smuzhiyun		     clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
18*4882a593Smuzhiyun#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
19*4882a593Smuzhiyun#else
20*4882a593Smuzhiyun#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
21*4882a593Smuzhiyun#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
22*4882a593Smuzhiyun#endif
23*4882a593Smuzhiyun#define GLOBAL_SPARE	%g5
24*4882a593Smuzhiyun#endif
25*4882a593Smuzhiyun
26*4882a593Smuzhiyun#ifndef EX_LD
27*4882a593Smuzhiyun#define EX_LD(x,y)	x
28*4882a593Smuzhiyun#endif
29*4882a593Smuzhiyun#ifndef EX_LD_FP
30*4882a593Smuzhiyun#define EX_LD_FP(x,y)	x
31*4882a593Smuzhiyun#endif
32*4882a593Smuzhiyun
33*4882a593Smuzhiyun#ifndef EX_ST
34*4882a593Smuzhiyun#define EX_ST(x,y)	x
35*4882a593Smuzhiyun#endif
36*4882a593Smuzhiyun#ifndef EX_ST_FP
37*4882a593Smuzhiyun#define EX_ST_FP(x,y)	x
38*4882a593Smuzhiyun#endif
39*4882a593Smuzhiyun
40*4882a593Smuzhiyun#ifndef LOAD
41*4882a593Smuzhiyun#define LOAD(type,addr,dest)	type [addr], dest
42*4882a593Smuzhiyun#endif
43*4882a593Smuzhiyun
44*4882a593Smuzhiyun#ifndef STORE
45*4882a593Smuzhiyun#define STORE(type,src,addr)	type src, [addr]
46*4882a593Smuzhiyun#endif
47*4882a593Smuzhiyun
48*4882a593Smuzhiyun#ifndef STORE_BLK
49*4882a593Smuzhiyun#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
50*4882a593Smuzhiyun#endif
51*4882a593Smuzhiyun
52*4882a593Smuzhiyun#ifndef FUNC_NAME
53*4882a593Smuzhiyun#define FUNC_NAME	U3memcpy
54*4882a593Smuzhiyun#endif
55*4882a593Smuzhiyun
56*4882a593Smuzhiyun#ifndef PREAMBLE
57*4882a593Smuzhiyun#define PREAMBLE
58*4882a593Smuzhiyun#endif
59*4882a593Smuzhiyun
60*4882a593Smuzhiyun#ifndef XCC
61*4882a593Smuzhiyun#define XCC xcc
62*4882a593Smuzhiyun#endif
63*4882a593Smuzhiyun
64*4882a593Smuzhiyun	.register	%g2,#scratch
65*4882a593Smuzhiyun	.register	%g3,#scratch
66*4882a593Smuzhiyun
67*4882a593Smuzhiyun	/* Special/non-trivial issues of this code:
68*4882a593Smuzhiyun	 *
69*4882a593Smuzhiyun	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
70*4882a593Smuzhiyun	 * 2) Only low 32 FPU registers are used so that only the
71*4882a593Smuzhiyun	 *    lower half of the FPU register set is dirtied by this
72*4882a593Smuzhiyun	 *    code.  This is especially important in the kernel.
73*4882a593Smuzhiyun	 * 3) This code never prefetches cachelines past the end
74*4882a593Smuzhiyun	 *    of the source buffer.
75*4882a593Smuzhiyun	 */
76*4882a593Smuzhiyun
77*4882a593Smuzhiyun	.text
78*4882a593Smuzhiyun#ifndef EX_RETVAL
79*4882a593Smuzhiyun#define EX_RETVAL(x)	x
/* Shared tail for the *_fp stubs below: runs VISExitHalf and then does
 * a leaf return.  Each *_fp stub branches here with %o0 computed in
 * the branch delay slot.  The non-kernel VISExitHalf defined above
 * rewrites %fprs from %o5; the kernel definition comes from
 * <asm/visasm.h> and is not visible in this file.
 */
80*4882a593Smuzhiyun__restore_fp:
81*4882a593Smuzhiyun	VISExitHalf
82*4882a593Smuzhiyun	retl
83*4882a593Smuzhiyun	 nop
/* Returns %o0 = %o2 + %g2 + %g1 + 1 via __restore_fp.
 * Modifies %g1 and %g2.  Presumably the fixup target for the EX_*_FP
 * sites that name it without the _fp suffix -- confirm against the
 * including file's macro definitions.
 */
84*4882a593SmuzhiyunENTRY(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
85*4882a593Smuzhiyun	add	%g1, 1, %g1
86*4882a593Smuzhiyun	add	%g2, %g1, %g2
87*4882a593Smuzhiyun	ba,pt	%xcc, __restore_fp
88*4882a593Smuzhiyun	 add	%o2, %g2, %o0
89*4882a593SmuzhiyunENDPROC(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
/* Returns %o0 = %o2 + %g2 via __restore_fp (sum formed in the
 * branch delay slot).
 */
90*4882a593SmuzhiyunENTRY(U3_retl_o2_plus_g2_fp)
91*4882a593Smuzhiyun	ba,pt	%xcc, __restore_fp
92*4882a593Smuzhiyun	 add	%o2, %g2, %o0
93*4882a593SmuzhiyunENDPROC(U3_retl_o2_plus_g2_fp)
/* Returns %o0 = %o2 + %g2 + 8 via __restore_fp.  Modifies %g2. */
94*4882a593SmuzhiyunENTRY(U3_retl_o2_plus_g2_plus_8_fp)
95*4882a593Smuzhiyun	add	%g2, 8, %g2
96*4882a593Smuzhiyun	ba,pt	%xcc, __restore_fp
97*4882a593Smuzhiyun	 add	%o2, %g2, %o0
98*4882a593SmuzhiyunENDPROC(U3_retl_o2_plus_g2_plus_8_fp)
/* Leaf return with %o0 = %o2 (copied in the retl delay slot).
 * Does not touch FP state.
 */
99*4882a593SmuzhiyunENTRY(U3_retl_o2)
100*4882a593Smuzhiyun	retl
101*4882a593Smuzhiyun	 mov	%o2, %o0
102*4882a593SmuzhiyunENDPROC(U3_retl_o2)
/* Leaf return with %o0 = %o2 + 1. */
103*4882a593SmuzhiyunENTRY(U3_retl_o2_plus_1)
104*4882a593Smuzhiyun	retl
105*4882a593Smuzhiyun	 add	%o2, 1, %o0
106*4882a593SmuzhiyunENDPROC(U3_retl_o2_plus_1)
/* Leaf return with %o0 = %o2 + 4. */
107*4882a593SmuzhiyunENTRY(U3_retl_o2_plus_4)
108*4882a593Smuzhiyun	retl
109*4882a593Smuzhiyun	 add	%o2, 4, %o0
110*4882a593SmuzhiyunENDPROC(U3_retl_o2_plus_4)
/* Leaf return with %o0 = %o2 + 8. */
111*4882a593SmuzhiyunENTRY(U3_retl_o2_plus_8)
112*4882a593Smuzhiyun	retl
113*4882a593Smuzhiyun	 add	%o2, 8, %o0
114*4882a593SmuzhiyunENDPROC(U3_retl_o2_plus_8)
/* Leaf return with %o0 = %o2 + %g1 + 1.  Modifies %g1. */
115*4882a593SmuzhiyunENTRY(U3_retl_o2_plus_g1_plus_1)
116*4882a593Smuzhiyun	add	%g1, 1, %g1
117*4882a593Smuzhiyun	retl
118*4882a593Smuzhiyun	 add	%o2, %g1, %o0
119*4882a593SmuzhiyunENDPROC(U3_retl_o2_plus_g1_plus_1)
/* Returns %o0 = %o2 via __restore_fp (copy done in the delay slot). */
120*4882a593SmuzhiyunENTRY(U3_retl_o2_fp)
121*4882a593Smuzhiyun	ba,pt	%xcc, __restore_fp
122*4882a593Smuzhiyun	 mov	%o2, %o0
123*4882a593SmuzhiyunENDPROC(U3_retl_o2_fp)
/* Returns %o0 = %o2 + (%o3 << 6) + 0x80 via __restore_fp.
 * Modifies %o3.  The sum only equals the bytes still to be copied if
 * %o2 holds just the part of the length not already counted by %o3.
 */
124*4882a593SmuzhiyunENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
125*4882a593Smuzhiyun	sll	%o3, 6, %o3
126*4882a593Smuzhiyun	add	%o3, 0x80, %o3
127*4882a593Smuzhiyun	ba,pt	%xcc, __restore_fp
128*4882a593Smuzhiyun	 add	%o2, %o3, %o0
129*4882a593SmuzhiyunENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
/* Returns %o0 = %o2 + (%o3 << 6) + 0x40 via __restore_fp.
 * Modifies %o3.
 */
130*4882a593SmuzhiyunENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
131*4882a593Smuzhiyun	sll	%o3, 6, %o3
132*4882a593Smuzhiyun	add	%o3, 0x40, %o3
133*4882a593Smuzhiyun	ba,pt	%xcc, __restore_fp
134*4882a593Smuzhiyun	 add	%o2, %o3, %o0
135*4882a593SmuzhiyunENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
/* Leaf return with %o0 = %o2 + GLOBAL_SPARE + 0x10.
 * Modifies GLOBAL_SPARE (%g7 in the kernel build, %g5 otherwise).
 */
136*4882a593SmuzhiyunENTRY(U3_retl_o2_plus_GS_plus_0x10)
137*4882a593Smuzhiyun	add	GLOBAL_SPARE, 0x10, GLOBAL_SPARE
138*4882a593Smuzhiyun	retl
139*4882a593Smuzhiyun	 add	%o2, GLOBAL_SPARE, %o0
140*4882a593SmuzhiyunENDPROC(U3_retl_o2_plus_GS_plus_0x10)
/* Leaf return with %o0 = %o2 + GLOBAL_SPARE + 0x08.
 * Modifies GLOBAL_SPARE.
 */
141*4882a593SmuzhiyunENTRY(U3_retl_o2_plus_GS_plus_0x08)
142*4882a593Smuzhiyun	add	GLOBAL_SPARE, 0x08, GLOBAL_SPARE
143*4882a593Smuzhiyun	retl
144*4882a593Smuzhiyun	 add	%o2, GLOBAL_SPARE, %o0
145*4882a593SmuzhiyunENDPROC(U3_retl_o2_plus_GS_plus_0x08)
/* Leaf return with %o0 = (%o2 & 7) + GLOBAL_SPARE.  Modifies %o2. */
146*4882a593SmuzhiyunENTRY(U3_retl_o2_and_7_plus_GS)
147*4882a593Smuzhiyun	and	%o2, 7, %o2
148*4882a593Smuzhiyun	retl
149*4882a593Smuzhiyun	 add	%o2, GLOBAL_SPARE, %o0
150*4882a593SmuzhiyunENDPROC(U3_retl_o2_and_7_plus_GS)
/* Leaf return with %o0 = (%o2 & 7) + GLOBAL_SPARE + 8.
 * Modifies %o2 and GLOBAL_SPARE.
 */
151*4882a593SmuzhiyunENTRY(U3_retl_o2_and_7_plus_GS_plus_8)
152*4882a593Smuzhiyun	add	GLOBAL_SPARE, 8, GLOBAL_SPARE
153*4882a593Smuzhiyun	and	%o2, 7, %o2
154*4882a593Smuzhiyun	retl
155*4882a593Smuzhiyun	 add	%o2, GLOBAL_SPARE, %o0
156*4882a593SmuzhiyunENDPROC(U3_retl_o2_and_7_plus_GS_plus_8)
157*4882a593Smuzhiyun#endif
158*4882a593Smuzhiyun
159*4882a593Smuzhiyun	.align		64
160*4882a593Smuzhiyun
161*4882a593Smuzhiyun	/* The cheetah's flexible spine, oversized liver, enlarged heart,
162*4882a593Smuzhiyun	 * slender muscular body, and claws make it the swiftest hunter
163*4882a593Smuzhiyun	 * in Africa and the fastest animal on land.  Can reach speeds
164*4882a593Smuzhiyun	 * of up to 2.4GB per second.
165*4882a593Smuzhiyun	 */
166*4882a593Smuzhiyun
/* Entry and dispatch for FUNC_NAME(dst=%o0, src=%o1, len=%o2).
 * Traps on len >= 2^31, keeps the original dst in %o4 (moved back to
 * %o0 at end_return), then selects a path by length:
 *   len == 0  -> end_return
 *   len < 16  -> less_than_16
 *   len < 192 -> less_than_192
 *   otherwise -> falls through to the VIS block-copy path.
 */
167*4882a593Smuzhiyun	.globl	FUNC_NAME
168*4882a593Smuzhiyun	.type	FUNC_NAME,#function
169*4882a593SmuzhiyunFUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
170*4882a593Smuzhiyun	srlx		%o2, 31, %g2
171*4882a593Smuzhiyun	cmp		%g2, 0
172*4882a593Smuzhiyun
173*4882a593Smuzhiyun	/* software trap 5 "Range Check" if len >= 0x80000000 */
174*4882a593Smuzhiyun	tne		%xcc, 5
175*4882a593Smuzhiyun	PREAMBLE
	/* Keep the caller's dst; every return path hands %o4 back. */
176*4882a593Smuzhiyun	mov		%o0, %o4
177*4882a593Smuzhiyun
178*4882a593Smuzhiyun	/* if len == 0 */
179*4882a593Smuzhiyun	cmp		%o2, 0
180*4882a593Smuzhiyun	be,pn		%XCC, end_return
	/* Delay slot (always executed): %o3 = dst | src for the
	 * alignment tests made further down.
	 */
181*4882a593Smuzhiyun	 or		%o0, %o1, %o3
182*4882a593Smuzhiyun
183*4882a593Smuzhiyun	/* if len < 16 */
184*4882a593Smuzhiyun	cmp		%o2, 16
185*4882a593Smuzhiyun	blu,a,pn	%XCC, less_than_16
	/* Annulled delay slot: only when the branch is taken is len
	 * folded into %o3, giving dst | src | len.
	 */
186*4882a593Smuzhiyun	 or		%o3, %o2, %o3
187*4882a593Smuzhiyun
188*4882a593Smuzhiyun	/* if len < 192 */
189*4882a593Smuzhiyun	cmp		%o2, (3 * 64)
190*4882a593Smuzhiyun	blu,pt		%XCC, less_than_192
	/* Delay slot: sets the condition codes from (dst | src) & 7;
	 * the first branch at less_than_192 consumes them.
	 */
191*4882a593Smuzhiyun	 andcc		%o3, 0x7, %g0
192*4882a593Smuzhiyun
193*4882a593Smuzhiyun	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
194*4882a593Smuzhiyun	 * o5 from here until we hit VISExitHalf.
195*4882a593Smuzhiyun	 */
196*4882a593Smuzhiyun	VISEntryHalf
197*4882a593Smuzhiyun
198*4882a593Smuzhiyun	/* Is 'dst' already aligned on a 64-byte boundary? */
199*4882a593Smuzhiyun	andcc		%o0, 0x3f, %g2
200*4882a593Smuzhiyun	be,pt		%XCC, 2f
201*4882a593Smuzhiyun
202*4882a593Smuzhiyun	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
203*4882a593Smuzhiyun	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
204*4882a593Smuzhiyun	 * subtract this from 'len'.
	 *
	 * The first instruction below is the delay slot of the branch
	 * above, so GLOBAL_SPARE = dst - src is set on both paths.
205*4882a593Smuzhiyun	 */
206*4882a593Smuzhiyun	 sub		%o0, %o1, GLOBAL_SPARE
207*4882a593Smuzhiyun	sub		%g2, 0x40, %g2
208*4882a593Smuzhiyun	sub		%g0, %g2, %g2
209*4882a593Smuzhiyun	sub		%o2, %g2, %o2
	/* Split the alignment count: %g1 = bytes to copy singly (0-7),
	 * %g2 = bytes to copy as 8-byte words (multiple of 8).
	 */
210*4882a593Smuzhiyun	andcc		%g2, 0x7, %g1
211*4882a593Smuzhiyun	be,pt		%icc, 2f
212*4882a593Smuzhiyun	 and		%g2, 0x38, %g2
213*4882a593Smuzhiyun
	/* Byte loop: src advances in %o1, the store address is
	 * %o1 + GLOBAL_SPARE, i.e. the matching dst byte.
	 */
214*4882a593Smuzhiyun1:	subcc		%g1, 0x1, %g1
215*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U3_retl_o2_plus_g2_plus_g1_plus_1)
216*4882a593Smuzhiyun	EX_ST_FP(STORE(stb, %o3, %o1 + GLOBAL_SPARE), U3_retl_o2_plus_g2_plus_g1_plus_1)
217*4882a593Smuzhiyun	bgu,pt		%XCC, 1b
218*4882a593Smuzhiyun	 add		%o1, 0x1, %o1
219*4882a593Smuzhiyun
	/* Recompute dst from the advanced src pointer. */
220*4882a593Smuzhiyun	add		%o1, GLOBAL_SPARE, %o0
221*4882a593Smuzhiyun
	/* %g1 = src & 7 is saved before alignaddr rounds %o1 down to an
	 * 8-byte boundary; the tail code adds it back to %o1 later.
	 * Skip to 3: when no 8-byte alignment words remain (%g2 == 0).
	 */
222*4882a593Smuzhiyun2:	cmp		%g2, 0x0
223*4882a593Smuzhiyun	and		%o1, 0x7, %g1
224*4882a593Smuzhiyun	be,pt		%icc, 3f
225*4882a593Smuzhiyun	 alignaddr	%o1, %g0, %o1
226*4882a593Smuzhiyun
	/* 8 bytes per step, unrolled twice so %f4/%f6 alternate as the
	 * older/newer source word fed to faligndata.
	 */
227*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1, %f4), U3_retl_o2_plus_g2)
228*4882a593Smuzhiyun1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U3_retl_o2_plus_g2)
229*4882a593Smuzhiyun	add		%o1, 0x8, %o1
230*4882a593Smuzhiyun	subcc		%g2, 0x8, %g2
231*4882a593Smuzhiyun	faligndata	%f4, %f6, %f0
232*4882a593Smuzhiyun	EX_ST_FP(STORE(std, %f0, %o0), U3_retl_o2_plus_g2_plus_8)
233*4882a593Smuzhiyun	be,pn		%icc, 3f
234*4882a593Smuzhiyun	 add		%o0, 0x8, %o0
235*4882a593Smuzhiyun
236*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U3_retl_o2_plus_g2)
237*4882a593Smuzhiyun	add		%o1, 0x8, %o1
238*4882a593Smuzhiyun	subcc		%g2, 0x8, %g2
239*4882a593Smuzhiyun	faligndata	%f6, %f4, %f2
240*4882a593Smuzhiyun	EX_ST_FP(STORE(std, %f2, %o0), U3_retl_o2_plus_g2_plus_8)
241*4882a593Smuzhiyun	bne,pt		%icc, 1b
242*4882a593Smuzhiyun	 add		%o0, 0x8, %o0
243*4882a593Smuzhiyun
244*4882a593Smuzhiyun3:	LOAD(prefetch, %o1 + 0x000, #one_read)
245*4882a593Smuzhiyun	LOAD(prefetch, %o1 + 0x040, #one_read)
246*4882a593Smuzhiyun	andn		%o2, (0x40 - 1), GLOBAL_SPARE
247*4882a593Smuzhiyun	LOAD(prefetch, %o1 + 0x080, #one_read)
248*4882a593Smuzhiyun	LOAD(prefetch, %o1 + 0x0c0, #one_read)
249*4882a593Smuzhiyun	LOAD(prefetch, %o1 + 0x100, #one_read)
250*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x000, %f0), U3_retl_o2)
251*4882a593Smuzhiyun	LOAD(prefetch, %o1 + 0x140, #one_read)
252*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2)
253*4882a593Smuzhiyun	LOAD(prefetch, %o1 + 0x180, #one_read)
254*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2)
255*4882a593Smuzhiyun	LOAD(prefetch, %o1 + 0x1c0, #one_read)
256*4882a593Smuzhiyun	faligndata	%f0, %f2, %f16
257*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2)
258*4882a593Smuzhiyun	faligndata	%f2, %f4, %f18
259*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2)
260*4882a593Smuzhiyun	faligndata	%f4, %f6, %f20
261*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2)
262*4882a593Smuzhiyun	faligndata	%f6, %f8, %f22
263*4882a593Smuzhiyun
264*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2)
265*4882a593Smuzhiyun	faligndata	%f8, %f10, %f24
266*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2)
267*4882a593Smuzhiyun	faligndata	%f10, %f12, %f26
268*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2)
269*4882a593Smuzhiyun
270*4882a593Smuzhiyun	subcc		GLOBAL_SPARE, 0x80, GLOBAL_SPARE
271*4882a593Smuzhiyun	add		%o1, 0x40, %o1
272*4882a593Smuzhiyun	bgu,pt		%XCC, 1f
273*4882a593Smuzhiyun	 srl		GLOBAL_SPARE, 6, %o3
274*4882a593Smuzhiyun	ba,pt		%xcc, 2f
275*4882a593Smuzhiyun	 nop
276*4882a593Smuzhiyun
277*4882a593Smuzhiyun	.align		64
278*4882a593Smuzhiyun1:
279*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80)
280*4882a593Smuzhiyun	faligndata	%f12, %f14, %f28
281*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80)
282*4882a593Smuzhiyun	faligndata	%f14, %f0, %f30
283*4882a593Smuzhiyun	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
284*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40)
285*4882a593Smuzhiyun	faligndata	%f0, %f2, %f16
286*4882a593Smuzhiyun	add		%o0, 0x40, %o0
287*4882a593Smuzhiyun
288*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40)
289*4882a593Smuzhiyun	faligndata	%f2, %f4, %f18
290*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40)
291*4882a593Smuzhiyun	faligndata	%f4, %f6, %f20
292*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40)
293*4882a593Smuzhiyun	subcc		%o3, 0x01, %o3
294*4882a593Smuzhiyun	faligndata	%f6, %f8, %f22
295*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x80)
296*4882a593Smuzhiyun
297*4882a593Smuzhiyun	faligndata	%f8, %f10, %f24
298*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
299*4882a593Smuzhiyun	LOAD(prefetch, %o1 + 0x1c0, #one_read)
300*4882a593Smuzhiyun	faligndata	%f10, %f12, %f26
301*4882a593Smuzhiyun	bg,pt		%XCC, 1b
302*4882a593Smuzhiyun	 add		%o1, 0x40, %o1
303*4882a593Smuzhiyun
304*4882a593Smuzhiyun	/* Finally we copy the last full 64-byte block. */
305*4882a593Smuzhiyun2:
306*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80)
307*4882a593Smuzhiyun	faligndata	%f12, %f14, %f28
308*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80)
309*4882a593Smuzhiyun	faligndata	%f14, %f0, %f30
310*4882a593Smuzhiyun	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
311*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40)
312*4882a593Smuzhiyun	faligndata	%f0, %f2, %f16
313*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40)
314*4882a593Smuzhiyun	faligndata	%f2, %f4, %f18
315*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40)
316*4882a593Smuzhiyun	faligndata	%f4, %f6, %f20
317*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40)
318*4882a593Smuzhiyun	faligndata	%f6, %f8, %f22
319*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x40)
320*4882a593Smuzhiyun	faligndata	%f8, %f10, %f24
321*4882a593Smuzhiyun	cmp		%g1, 0
322*4882a593Smuzhiyun	be,pt		%XCC, 1f
323*4882a593Smuzhiyun	 add		%o0, 0x40, %o0
324*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x40)
325*4882a593Smuzhiyun1:	faligndata	%f10, %f12, %f26
326*4882a593Smuzhiyun	faligndata	%f12, %f14, %f28
327*4882a593Smuzhiyun	faligndata	%f14, %f0, %f30
328*4882a593Smuzhiyun	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x40)
329*4882a593Smuzhiyun	add		%o0, 0x40, %o0
330*4882a593Smuzhiyun	add		%o1, 0x40, %o1
331*4882a593Smuzhiyun	membar		#Sync
332*4882a593Smuzhiyun
333*4882a593Smuzhiyun	/* Now we copy the (len modulo 64) bytes at the end.
334*4882a593Smuzhiyun	 * Note how we borrow the %f0 loaded above.
335*4882a593Smuzhiyun	 *
336*4882a593Smuzhiyun	 * Also notice how this code is careful not to perform a
337*4882a593Smuzhiyun	 * load past the end of the src buffer.
338*4882a593Smuzhiyun	 */
339*4882a593Smuzhiyun	and		%o2, 0x3f, %o2
340*4882a593Smuzhiyun	andcc		%o2, 0x38, %g2
341*4882a593Smuzhiyun	be,pn		%XCC, 2f
342*4882a593Smuzhiyun	 subcc		%g2, 0x8, %g2
343*4882a593Smuzhiyun	be,pn		%XCC, 2f
344*4882a593Smuzhiyun	 cmp		%g1, 0
345*4882a593Smuzhiyun
346*4882a593Smuzhiyun	sub		%o2, %g2, %o2
347*4882a593Smuzhiyun	be,a,pt		%XCC, 1f
348*4882a593Smuzhiyun	 EX_LD_FP(LOAD(ldd, %o1 + 0x00, %f0), U3_retl_o2_plus_g2)
349*4882a593Smuzhiyun
350*4882a593Smuzhiyun1:	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f2), U3_retl_o2_plus_g2)
351*4882a593Smuzhiyun	add		%o1, 0x8, %o1
352*4882a593Smuzhiyun	subcc		%g2, 0x8, %g2
353*4882a593Smuzhiyun	faligndata	%f0, %f2, %f8
354*4882a593Smuzhiyun	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8)
355*4882a593Smuzhiyun	be,pn		%XCC, 2f
356*4882a593Smuzhiyun	 add		%o0, 0x8, %o0
357*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f0), U3_retl_o2_plus_g2)
358*4882a593Smuzhiyun	add		%o1, 0x8, %o1
359*4882a593Smuzhiyun	subcc		%g2, 0x8, %g2
360*4882a593Smuzhiyun	faligndata	%f2, %f0, %f8
361*4882a593Smuzhiyun	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8)
362*4882a593Smuzhiyun	bne,pn		%XCC, 1b
363*4882a593Smuzhiyun	 add		%o0, 0x8, %o0
364*4882a593Smuzhiyun
365*4882a593Smuzhiyun	/* If anything is left, we copy it: in 8/4/2/1-byte steps when
366*4882a593Smuzhiyun	 * src was 8-byte aligned, otherwise one byte at a time at 90:.
	 * Note that %g1 is (src & 0x7) saved above before the
367*4882a593Smuzhiyun	 * alignaddr was performed.
368*4882a593Smuzhiyun	 */
369*4882a593Smuzhiyun2:
370*4882a593Smuzhiyun	cmp		%o2, 0
	/* Undo the alignaddr rounding so %o1 is the true src pointer. */
371*4882a593Smuzhiyun	add		%o1, %g1, %o1
372*4882a593Smuzhiyun	VISExitHalf
373*4882a593Smuzhiyun	be,pn		%XCC, end_return
	/* Delay slot: %o3 = dst - src, so [%o1 + %o3] addresses dst. */
374*4882a593Smuzhiyun	 sub		%o0, %o1, %o3
375*4882a593Smuzhiyun
376*4882a593Smuzhiyun	andcc		%g1, 0x7, %g0
377*4882a593Smuzhiyun	bne,pn		%icc, 90f
378*4882a593Smuzhiyun	 andcc		%o2, 0x8, %g0
379*4882a593Smuzhiyun	be,pt		%icc, 1f
380*4882a593Smuzhiyun	 nop
	/* %o5 is free as integer scratch now that VISExitHalf has run. */
381*4882a593Smuzhiyun	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2)
382*4882a593Smuzhiyun	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2)
383*4882a593Smuzhiyun	add		%o1, 0x8, %o1
384*4882a593Smuzhiyun	sub		%o2, 8, %o2
385*4882a593Smuzhiyun
386*4882a593Smuzhiyun1:	andcc		%o2, 0x4, %g0
387*4882a593Smuzhiyun	be,pt		%icc, 1f
388*4882a593Smuzhiyun	 nop
389*4882a593Smuzhiyun	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2)
390*4882a593Smuzhiyun	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2)
391*4882a593Smuzhiyun	add		%o1, 0x4, %o1
392*4882a593Smuzhiyun	sub		%o2, 4, %o2
393*4882a593Smuzhiyun
394*4882a593Smuzhiyun1:	andcc		%o2, 0x2, %g0
395*4882a593Smuzhiyun	be,pt		%icc, 1f
396*4882a593Smuzhiyun	 nop
397*4882a593Smuzhiyun	EX_LD(LOAD(lduh, %o1, %o5), U3_retl_o2)
398*4882a593Smuzhiyun	EX_ST(STORE(sth, %o5, %o1 + %o3), U3_retl_o2)
399*4882a593Smuzhiyun	add		%o1, 0x2, %o1
400*4882a593Smuzhiyun	sub		%o2, 2, %o2
401*4882a593Smuzhiyun
402*4882a593Smuzhiyun1:	andcc		%o2, 0x1, %g0
403*4882a593Smuzhiyun	be,pt		%icc, end_return
404*4882a593Smuzhiyun	 nop
405*4882a593Smuzhiyun	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2)
	/* The final byte store sits in the delay slot of the branch. */
406*4882a593Smuzhiyun	ba,pt		%xcc, end_return
407*4882a593Smuzhiyun	 EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2)
408*4882a593Smuzhiyun
409*4882a593Smuzhiyun	.align		64
410*4882a593Smuzhiyun	/* 16 <= len < 192 */
	/* Integer-register copy.  On entry the condition codes hold
	 * (dst | src) & 7 from the dispatch code: if both pointers are
	 * 8-byte aligned the 16-byte loop at 72: runs directly,
	 * otherwise 75: first aligns dst.  %o3 = dst - src throughout,
	 * so [%o1 + %o3] is the dst byte matching src pointer %o1.
	 */
411*4882a593Smuzhiyunless_than_192:
412*4882a593Smuzhiyun	bne,pn		%XCC, 75f
413*4882a593Smuzhiyun	 sub		%o0, %o1, %o3
414*4882a593Smuzhiyun
	/* 72: copy GLOBAL_SPARE = len & ~0xf bytes, 16 per iteration;
	 * %o2 keeps the remaining len & 0xf.
	 */
415*4882a593Smuzhiyun72:
416*4882a593Smuzhiyun	andn		%o2, 0xf, GLOBAL_SPARE
417*4882a593Smuzhiyun	and		%o2, 0xf, %o2
418*4882a593Smuzhiyun1:	subcc		GLOBAL_SPARE, 0x10, GLOBAL_SPARE
419*4882a593Smuzhiyun	EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U3_retl_o2_plus_GS_plus_0x10)
420*4882a593Smuzhiyun	EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U3_retl_o2_plus_GS_plus_0x10)
421*4882a593Smuzhiyun	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x10)
422*4882a593Smuzhiyun	add		%o1, 0x8, %o1
423*4882a593Smuzhiyun	EX_ST(STORE(stx, %g1, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x08)
424*4882a593Smuzhiyun	bgu,pt		%XCC, 1b
425*4882a593Smuzhiyun	 add		%o1, 0x8, %o1
	/* 73: fewer than 16 bytes left: one 8-byte and one 4-byte move
	 * as needed, then the byte loop at 90: for whatever remains.
	 * %o2 is decremented before each access, which is why the
	 * handlers named here add the size back.
	 */
426*4882a593Smuzhiyun73:	andcc		%o2, 0x8, %g0
427*4882a593Smuzhiyun	be,pt		%XCC, 1f
428*4882a593Smuzhiyun	 nop
429*4882a593Smuzhiyun	sub		%o2, 0x8, %o2
430*4882a593Smuzhiyun	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2_plus_8)
431*4882a593Smuzhiyun	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_8)
432*4882a593Smuzhiyun	add		%o1, 0x8, %o1
433*4882a593Smuzhiyun1:	andcc		%o2, 0x4, %g0
434*4882a593Smuzhiyun	be,pt		%XCC, 1f
435*4882a593Smuzhiyun	 nop
436*4882a593Smuzhiyun	sub		%o2, 0x4, %o2
437*4882a593Smuzhiyun	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2_plus_4)
438*4882a593Smuzhiyun	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2_plus_4)
439*4882a593Smuzhiyun	add		%o1, 0x4, %o1
440*4882a593Smuzhiyun1:	cmp		%o2, 0
441*4882a593Smuzhiyun	be,pt		%XCC, end_return
442*4882a593Smuzhiyun	 nop
443*4882a593Smuzhiyun	ba,pt		%xcc, 90f
444*4882a593Smuzhiyun	 nop
445*4882a593Smuzhiyun
	/* 75: dst and/or src is not 8-byte aligned.  If dst is
	 * misaligned, copy %g1 = 8 - (dst & 7) single bytes first.  The
	 * branch tests the andcc result; the two sub instructions do
	 * not change the condition codes.
	 */
446*4882a593Smuzhiyun75:
447*4882a593Smuzhiyun	andcc		%o0, 0x7, %g1
448*4882a593Smuzhiyun	sub		%g1, 0x8, %g1
449*4882a593Smuzhiyun	be,pn		%icc, 2f
450*4882a593Smuzhiyun	 sub		%g0, %g1, %g1
451*4882a593Smuzhiyun	sub		%o2, %g1, %o2
452*4882a593Smuzhiyun
453*4882a593Smuzhiyun1:	subcc		%g1, 1, %g1
454*4882a593Smuzhiyun	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2_plus_g1_plus_1)
455*4882a593Smuzhiyun	EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2_plus_g1_plus_1)
456*4882a593Smuzhiyun	bgu,pt		%icc, 1b
457*4882a593Smuzhiyun	 add		%o1, 1, %o1
458*4882a593Smuzhiyun
	/* dst is now 8-byte aligned; rebuild it in %o0.  If src is
	 * aligned as well, rejoin the aligned code above (72: when at
	 * least 16 bytes remain, else 73:).  Otherwise go to 8: with
	 * %g1 = (src & 7) * 8, the misalignment in bits.
	 */
459*4882a593Smuzhiyun2:	add		%o1, %o3, %o0
460*4882a593Smuzhiyun	andcc		%o1, 0x7, %g1
461*4882a593Smuzhiyun	bne,pt		%icc, 8f
462*4882a593Smuzhiyun	 sll		%g1, 3, %g1
463*4882a593Smuzhiyun
464*4882a593Smuzhiyun	cmp		%o2, 16
465*4882a593Smuzhiyun	bgeu,pt		%icc, 72b
466*4882a593Smuzhiyun	 nop
467*4882a593Smuzhiyun	ba,a,pt		%xcc, 73b
468*4882a593Smuzhiyun
	/* 8: shift-and-merge copy for a misaligned src.  %o1 is rounded
	 * down to 8 bytes, %o3 = 64 - %g1, and %g2 carries the previous
	 * source word shifted left by %g1.  Each step loads the next
	 * aligned word into %g3 and stores (%g3 >> %o3) | %g2.
	 * GLOBAL_SPARE = len & ~7 counts the bytes stored by this loop.
	 */
469*4882a593Smuzhiyun8:	mov		64, %o3
470*4882a593Smuzhiyun	andn		%o1, 0x7, %o1
471*4882a593Smuzhiyun	EX_LD(LOAD(ldx, %o1, %g2), U3_retl_o2)
472*4882a593Smuzhiyun	sub		%o3, %g1, %o3
473*4882a593Smuzhiyun	andn		%o2, 0x7, GLOBAL_SPARE
474*4882a593Smuzhiyun	sllx		%g2, %g1, %g2
475*4882a593Smuzhiyun1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U3_retl_o2_and_7_plus_GS)
476*4882a593Smuzhiyun	subcc		GLOBAL_SPARE, 0x8, GLOBAL_SPARE
477*4882a593Smuzhiyun	add		%o1, 0x8, %o1
478*4882a593Smuzhiyun	srlx		%g3, %o3, %o5
479*4882a593Smuzhiyun	or		%o5, %g2, %o5
480*4882a593Smuzhiyun	EX_ST(STORE(stx, %o5, %o0), U3_retl_o2_and_7_plus_GS_plus_8)
481*4882a593Smuzhiyun	add		%o0, 0x8, %o0
482*4882a593Smuzhiyun	bgu,pt		%icc, 1b
483*4882a593Smuzhiyun	 sllx		%g3, %g1, %g2
484*4882a593Smuzhiyun
	/* Convert %g1 back to a byte offset and add it to %o1 to
	 * restore the unaligned src pointer; the last len & 7 bytes go
	 * through the byte loop at 90: with %o3 = dst - src again.
	 */
485*4882a593Smuzhiyun	srl		%g1, 3, %g1
486*4882a593Smuzhiyun	andcc		%o2, 0x7, %o2
487*4882a593Smuzhiyun	be,pn		%icc, end_return
488*4882a593Smuzhiyun	 add		%o1, %g1, %o1
489*4882a593Smuzhiyun	ba,pt		%xcc, 90f
490*4882a593Smuzhiyun	 sub		%o0, %o1, %o3
491*4882a593Smuzhiyun
492*4882a593Smuzhiyun	.align		64
493*4882a593Smuzhiyun	/* 0 < len < 16 */
	/* On entry %o3 = dst | src | len (set in the annulled delay
	 * slot of the dispatching branch).  If any of the three has
	 * either of its low two bits set, copy byte-wise at 90:;
	 * otherwise copy 4 bytes per iteration.  %o3 is then reused as
	 * dst - src so that [%o1 + %o3] addresses the destination.
	 */
494*4882a593Smuzhiyunless_than_16:
495*4882a593Smuzhiyun	andcc		%o3, 0x3, %g0
496*4882a593Smuzhiyun	bne,pn		%XCC, 90f
497*4882a593Smuzhiyun	 sub		%o0, %o1, %o3
498*4882a593Smuzhiyun
499*4882a593Smuzhiyun1:
500*4882a593Smuzhiyun	subcc		%o2, 4, %o2
501*4882a593Smuzhiyun	EX_LD(LOAD(lduw, %o1, %g1), U3_retl_o2_plus_4)
502*4882a593Smuzhiyun	EX_ST(STORE(stw, %g1, %o1 + %o3), U3_retl_o2_plus_4)
503*4882a593Smuzhiyun	bgu,pt		%XCC, 1b
504*4882a593Smuzhiyun	 add		%o1, 4, %o1
505*4882a593Smuzhiyun
	/* Common successful exit: return EX_RETVAL(%o4), where %o4 is
	 * the dst pointer saved at function entry.  The default
	 * EX_RETVAL in this file passes its argument through unchanged.
	 */
506*4882a593Smuzhiyunend_return:
507*4882a593Smuzhiyun	retl
508*4882a593Smuzhiyun	 mov		EX_RETVAL(%o4), %o0
509*4882a593Smuzhiyun
510*4882a593Smuzhiyun	.align		32
	/* 90: byte-at-a-time copy of %o2 (> 0) bytes from [%o1] to
	 * [%o1 + %o3], then the same return as end_return.
	 */
511*4882a593Smuzhiyun90:
512*4882a593Smuzhiyun	subcc		%o2, 1, %o2
513*4882a593Smuzhiyun	EX_LD(LOAD(ldub, %o1, %g1), U3_retl_o2_plus_1)
514*4882a593Smuzhiyun	EX_ST(STORE(stb, %g1, %o1 + %o3), U3_retl_o2_plus_1)
515*4882a593Smuzhiyun	bgu,pt		%XCC, 90b
516*4882a593Smuzhiyun	 add		%o1, 1, %o1
517*4882a593Smuzhiyun	retl
518*4882a593Smuzhiyun	 mov		EX_RETVAL(%o4), %o0
519*4882a593Smuzhiyun
520*4882a593Smuzhiyun	.size		FUNC_NAME, .-FUNC_NAME
521