xref: /OK3568_Linux_fs/kernel/arch/sparc/lib/NG2memcpy.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0 */
2*4882a593Smuzhiyun/* NG2memcpy.S: Niagara-2 optimized memcpy.
3*4882a593Smuzhiyun *
4*4882a593Smuzhiyun * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
5*4882a593Smuzhiyun */
6*4882a593Smuzhiyun
/*
 * Build-environment selection.
 *
 * With __KERNEL__ defined, the ASI numbers and the VISEntryHalf /
 * VISExitHalf helpers are expected to come from the headers included
 * below (their contents are not visible in this file), and the spare
 * scratch global is %g7.  Otherwise this file is self-contained: it
 * defines the ASI numbers and FPRS_FEF itself, supplies its own
 * VISEntryHalf / VISExitHalf, and uses %g5 as the spare scratch global.
 */
7*4882a593Smuzhiyun#ifdef __KERNEL__
8*4882a593Smuzhiyun#include <linux/linkage.h>
9*4882a593Smuzhiyun#include <asm/visasm.h>
10*4882a593Smuzhiyun#include <asm/asi.h>
11*4882a593Smuzhiyun#define GLOBAL_SPARE	%g7
12*4882a593Smuzhiyun#else
13*4882a593Smuzhiyun#define ASI_PNF 0x82
14*4882a593Smuzhiyun#define ASI_BLK_P 0xf0
15*4882a593Smuzhiyun#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
16*4882a593Smuzhiyun#define FPRS_FEF  0x04
/*
 * Stand-alone VISEntryHalf saves %fprs in %o5 and then writes FPRS_FEF
 * to %fprs.  The MEMCPY_DEBUG flavour additionally zeroes %g1, %g2, %g3
 * and %g5 and executes subcc %g0, %g0, %g0, which rewrites the
 * condition codes.  VISExitHalf masks the saved value in %o5 with
 * FPRS_FEF and writes the result to %fprs, so %o5 has to stay intact
 * between the two macros.
 */
17*4882a593Smuzhiyun#ifdef MEMCPY_DEBUG
18*4882a593Smuzhiyun#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
19*4882a593Smuzhiyun		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
20*4882a593Smuzhiyun#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
21*4882a593Smuzhiyun#else
22*4882a593Smuzhiyun#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
23*4882a593Smuzhiyun#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
24*4882a593Smuzhiyun#endif
25*4882a593Smuzhiyun#define GLOBAL_SPARE	%g5
26*4882a593Smuzhiyun#endif
27*4882a593Smuzhiyun
/*
 * Overridable building blocks.  Each macro below is defined only when
 * it has not been defined already, so another definition made before
 * this point takes precedence (presumably by a wrapper that includes
 * this file to build a variant -- TODO confirm against the callers).
 */

/*
 * STORE_ASI: address space used by STORE_INIT.  Defaults to
 * ASI_BLK_INIT_QUAD_LDD_P; when SIMULATE_NIAGARA_ON_NON_NIAGARA is
 * defined the ordinary primary space 0x80 is used instead.
 */
28*4882a593Smuzhiyun#ifndef STORE_ASI
29*4882a593Smuzhiyun#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
30*4882a593Smuzhiyun#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
31*4882a593Smuzhiyun#else
32*4882a593Smuzhiyun#define STORE_ASI	0x80		/* ASI_P */
33*4882a593Smuzhiyun#endif
34*4882a593Smuzhiyun#endif
35*4882a593Smuzhiyun
/*
 * EX_LD / EX_LD_FP / EX_ST / EX_ST_FP(x, y): wrap one memory access.
 * x is the instruction to emit; y is the name of a fixup label.  The
 * defaults emit x unchanged and drop y.  (Presumably overriding
 * definitions use y to register a fault handler for x -- not visible
 * in this file.)
 */
36*4882a593Smuzhiyun#ifndef EX_LD
37*4882a593Smuzhiyun#define EX_LD(x,y)	x
38*4882a593Smuzhiyun#endif
39*4882a593Smuzhiyun#ifndef EX_LD_FP
40*4882a593Smuzhiyun#define EX_LD_FP(x,y)	x
41*4882a593Smuzhiyun#endif
42*4882a593Smuzhiyun
43*4882a593Smuzhiyun#ifndef EX_ST
44*4882a593Smuzhiyun#define EX_ST(x,y)	x
45*4882a593Smuzhiyun#endif
46*4882a593Smuzhiyun#ifndef EX_ST_FP
47*4882a593Smuzhiyun#define EX_ST_FP(x,y)	x
48*4882a593Smuzhiyun#endif
49*4882a593Smuzhiyun
/* LOAD(type, addr, dest): emits "type [addr], dest". */
50*4882a593Smuzhiyun#ifndef LOAD
51*4882a593Smuzhiyun#define LOAD(type,addr,dest)	type [addr], dest
52*4882a593Smuzhiyun#endif
53*4882a593Smuzhiyun
/* LOAD_BLK(addr, dest): ldda from addr through ASI_BLK_P into dest. */
54*4882a593Smuzhiyun#ifndef LOAD_BLK
55*4882a593Smuzhiyun#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
56*4882a593Smuzhiyun#endif
57*4882a593Smuzhiyun
/*
 * STORE(type, src, addr): emits "type src, [addr]".  Under MEMCPY_DEBUG
 * the mnemonic gets an "a" suffix pasted on and the store is issued
 * with an explicit ASI of 0x80.
 */
58*4882a593Smuzhiyun#ifndef STORE
59*4882a593Smuzhiyun#ifndef MEMCPY_DEBUG
60*4882a593Smuzhiyun#define STORE(type,src,addr)	type src, [addr]
61*4882a593Smuzhiyun#else
62*4882a593Smuzhiyun#define STORE(type,src,addr)	type##a src, [addr] 0x80
63*4882a593Smuzhiyun#endif
64*4882a593Smuzhiyun#endif
65*4882a593Smuzhiyun
/* STORE_BLK(src, addr): stda of src to addr through ASI_BLK_P. */
66*4882a593Smuzhiyun#ifndef STORE_BLK
67*4882a593Smuzhiyun#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
68*4882a593Smuzhiyun#endif
69*4882a593Smuzhiyun
/* STORE_INIT(src, addr): stxa of src to addr through STORE_ASI. */
70*4882a593Smuzhiyun#ifndef STORE_INIT
71*4882a593Smuzhiyun#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
72*4882a593Smuzhiyun#endif
73*4882a593Smuzhiyun
/* FUNC_NAME: symbol under which the copy routine is emitted. */
74*4882a593Smuzhiyun#ifndef FUNC_NAME
75*4882a593Smuzhiyun#define FUNC_NAME	NG2memcpy
76*4882a593Smuzhiyun#endif
77*4882a593Smuzhiyun
/*
 * PREAMBLE: expanded in the routine right after the length trap check;
 * empty by default.
 */
78*4882a593Smuzhiyun#ifndef PREAMBLE
79*4882a593Smuzhiyun#define PREAMBLE
80*4882a593Smuzhiyun#endif
81*4882a593Smuzhiyun
/*
 * XCC: condition-code set named by the branches written with %XCC.
 * Branches spelled with lower-case %xcc or %icc are not affected.
 */
82*4882a593Smuzhiyun#ifndef XCC
83*4882a593Smuzhiyun#define XCC xcc
84*4882a593Smuzhiyun#endif
85*4882a593Smuzhiyun
/*
 * FREG_FROB(x0 .. x8): realign one 64-byte block.
 *
 * Takes nine double-precision FP register names (written without the
 * leading '%') and emits eight faligndata instructions, one for each
 * neighbouring pair (x0,x1), (x1,x2), ... (x7,x8).  The results land in
 * %f0, %f2, ... %f14.  The byte offset applied by faligndata is the one
 * latched by the alignaddr issued before the block-copy loops.
 *
 * The inputs may overlap the outputs (callers pass f0, f2, ...): result
 * i is written only after input i has been read for the last time, so
 * the instruction order must not be changed.
 */
86*4882a593Smuzhiyun#define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8) \
87*4882a593Smuzhiyun	faligndata	%x0, %x1, %f0; \
88*4882a593Smuzhiyun	faligndata	%x1, %x2, %f2; \
89*4882a593Smuzhiyun	faligndata	%x2, %x3, %f4; \
90*4882a593Smuzhiyun	faligndata	%x3, %x4, %f6; \
91*4882a593Smuzhiyun	faligndata	%x4, %x5, %f8; \
92*4882a593Smuzhiyun	faligndata	%x5, %x6, %f10; \
93*4882a593Smuzhiyun	faligndata	%x6, %x7, %f12; \
94*4882a593Smuzhiyun	faligndata	%x7, %x8, %f14;
95*4882a593Smuzhiyun
/*
 * FREG_MOVE_<n>(x0 .. x<n-1>): copy n double-precision FP registers
 * into %f0, %f2, ... in that order, one fsrc2 per register.  Register
 * names are passed without the leading '%'.
 *
 * The block-copy loops use these after storing a block to slide the
 * not-yet-consumed source doublewords (held in the upper registers,
 * ending at %f30) down to %f0 for the next FREG_FROB.
 */
96*4882a593Smuzhiyun#define FREG_MOVE_1(x0) \
97*4882a593Smuzhiyun	fsrc2		%x0, %f0;
98*4882a593Smuzhiyun#define FREG_MOVE_2(x0, x1) \
99*4882a593Smuzhiyun	fsrc2		%x0, %f0; \
100*4882a593Smuzhiyun	fsrc2		%x1, %f2;
101*4882a593Smuzhiyun#define FREG_MOVE_3(x0, x1, x2) \
102*4882a593Smuzhiyun	fsrc2		%x0, %f0; \
103*4882a593Smuzhiyun	fsrc2		%x1, %f2; \
104*4882a593Smuzhiyun	fsrc2		%x2, %f4;
105*4882a593Smuzhiyun#define FREG_MOVE_4(x0, x1, x2, x3) \
106*4882a593Smuzhiyun	fsrc2		%x0, %f0; \
107*4882a593Smuzhiyun	fsrc2		%x1, %f2; \
108*4882a593Smuzhiyun	fsrc2		%x2, %f4; \
109*4882a593Smuzhiyun	fsrc2		%x3, %f6;
110*4882a593Smuzhiyun#define FREG_MOVE_5(x0, x1, x2, x3, x4) \
111*4882a593Smuzhiyun	fsrc2		%x0, %f0; \
112*4882a593Smuzhiyun	fsrc2		%x1, %f2; \
113*4882a593Smuzhiyun	fsrc2		%x2, %f4; \
114*4882a593Smuzhiyun	fsrc2		%x3, %f6; \
115*4882a593Smuzhiyun	fsrc2		%x4, %f8;
116*4882a593Smuzhiyun#define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
117*4882a593Smuzhiyun	fsrc2		%x0, %f0; \
118*4882a593Smuzhiyun	fsrc2		%x1, %f2; \
119*4882a593Smuzhiyun	fsrc2		%x2, %f4; \
120*4882a593Smuzhiyun	fsrc2		%x3, %f6; \
121*4882a593Smuzhiyun	fsrc2		%x4, %f8; \
122*4882a593Smuzhiyun	fsrc2		%x5, %f10;
123*4882a593Smuzhiyun#define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
124*4882a593Smuzhiyun	fsrc2		%x0, %f0; \
125*4882a593Smuzhiyun	fsrc2		%x1, %f2; \
126*4882a593Smuzhiyun	fsrc2		%x2, %f4; \
127*4882a593Smuzhiyun	fsrc2		%x3, %f6; \
128*4882a593Smuzhiyun	fsrc2		%x4, %f8; \
129*4882a593Smuzhiyun	fsrc2		%x5, %f10; \
130*4882a593Smuzhiyun	fsrc2		%x6, %f12;
131*4882a593Smuzhiyun#define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
132*4882a593Smuzhiyun	fsrc2		%x0, %f0; \
133*4882a593Smuzhiyun	fsrc2		%x1, %f2; \
134*4882a593Smuzhiyun	fsrc2		%x2, %f4; \
135*4882a593Smuzhiyun	fsrc2		%x3, %f6; \
136*4882a593Smuzhiyun	fsrc2		%x4, %f8; \
137*4882a593Smuzhiyun	fsrc2		%x5, %f10; \
138*4882a593Smuzhiyun	fsrc2		%x6, %f12; \
139*4882a593Smuzhiyun	fsrc2		%x7, %f14;
/*
 * FREG_LOAD_<n>(base, x0 .. x<n-1>): load n consecutive doublewords
 * with ldd from base + 0x00, base + 0x08, ... into the named FP
 * registers (names passed without the leading '%').
 *
 * Every load goes through EX_LD_FP with the fixup label name
 * NG2_retl_o2_plus_g1.  The block-copy prologues use these macros to
 * pre-load the source doublewords that precede the first 64-byte
 * aligned source block.
 */
140*4882a593Smuzhiyun#define FREG_LOAD_1(base, x0) \
141*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1)
142*4882a593Smuzhiyun#define FREG_LOAD_2(base, x0, x1) \
143*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
144*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1);
145*4882a593Smuzhiyun#define FREG_LOAD_3(base, x0, x1, x2) \
146*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
147*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
148*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1);
149*4882a593Smuzhiyun#define FREG_LOAD_4(base, x0, x1, x2, x3) \
150*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
151*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
152*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
153*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1);
154*4882a593Smuzhiyun#define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
155*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
156*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
157*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
158*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
159*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1);
160*4882a593Smuzhiyun#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
161*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
162*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
163*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
164*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
165*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1); \
166*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x28, %x5), NG2_retl_o2_plus_g1);
167*4882a593Smuzhiyun#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \
168*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
169*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
170*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
171*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
172*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1); \
173*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x28, %x5), NG2_retl_o2_plus_g1); \
174*4882a593Smuzhiyun	EX_LD_FP(LOAD(ldd, base + 0x30, %x6), NG2_retl_o2_plus_g1);
175*4882a593Smuzhiyun
/* Tell the assembler that %g2 and %g3 are used as scratch registers. */
176*4882a593Smuzhiyun	.register	%g2,#scratch
177*4882a593Smuzhiyun	.register	%g3,#scratch
178*4882a593Smuzhiyun
179*4882a593Smuzhiyun	.text
/*
 * Fixup stubs, emitted only when EX_RETVAL has not been defined before
 * this point (in which case EX_RETVAL also defaults to the identity).
 *
 * Each stub is named after the expression it leaves in %o0, built from
 * the registers the copy routine had live at the matching access:
 *
 *   NG2_retl_o2                          %o0 = %o2
 *   NG2_retl_o2_plus_{1,4,8}             %o0 = %o2 + constant
 *   NG2_retl_o2_plus_o4_plus_{1,8,16}    %o0 = %o2 + %o4 + constant
 *   NG2_retl_o2_plus_g1_fp               %o0 = %o2 + %g1
 *   NG2_retl_o2_plus_g1_plus_64_fp       %o0 = %o2 + %g1 + 64
 *   NG2_retl_o2_plus_g1_plus_1           %o0 = %o2 + %g1 + 1
 *   NG2_retl_o2_and_7_plus_o4            %o0 = (%o2 & 7) + %o4
 *   NG2_retl_o2_and_7_plus_o4_plus_8     %o0 = (%o2 & 7) + %o4 + 8
 *
 * The final add/mov of every stub sits in the delay slot of its branch.
 * The *_fp stubs leave through __restore_fp, which runs VISExitHalf and
 * falls into __restore_asi; all others branch to __restore_asi
 * directly.  __restore_asi returns with retl and writes ASI_AIUS to
 * %asi in the delay slot.
 *
 * Presumably %o0 is the number of bytes left uncopied when a wrapped
 * access faults -- confirm against the EX_LD / EX_ST overrides.
 * NOTE(review): the FP access sites name NG2_retl_o2_plus_g1 and
 * NG2_retl_o2_plus_g1_plus_64, while only the _fp-suffixed symbols are
 * defined here; presumably the overriding EX_LD_FP / EX_ST_FP macros
 * append the suffix -- confirm.
 */
180*4882a593Smuzhiyun#ifndef EX_RETVAL
181*4882a593Smuzhiyun#define EX_RETVAL(x)	x
182*4882a593Smuzhiyun__restore_fp:
183*4882a593Smuzhiyun	VISExitHalf
184*4882a593Smuzhiyun__restore_asi:
185*4882a593Smuzhiyun	retl
186*4882a593Smuzhiyun	 wr	%g0, ASI_AIUS, %asi
187*4882a593SmuzhiyunENTRY(NG2_retl_o2)
188*4882a593Smuzhiyun	ba,pt	%xcc, __restore_asi
189*4882a593Smuzhiyun	 mov	%o2, %o0
190*4882a593SmuzhiyunENDPROC(NG2_retl_o2)
191*4882a593SmuzhiyunENTRY(NG2_retl_o2_plus_1)
192*4882a593Smuzhiyun	ba,pt	%xcc, __restore_asi
193*4882a593Smuzhiyun	 add	%o2, 1, %o0
194*4882a593SmuzhiyunENDPROC(NG2_retl_o2_plus_1)
195*4882a593SmuzhiyunENTRY(NG2_retl_o2_plus_4)
196*4882a593Smuzhiyun	ba,pt	%xcc, __restore_asi
197*4882a593Smuzhiyun	 add	%o2, 4, %o0
198*4882a593SmuzhiyunENDPROC(NG2_retl_o2_plus_4)
199*4882a593SmuzhiyunENTRY(NG2_retl_o2_plus_8)
200*4882a593Smuzhiyun	ba,pt	%xcc, __restore_asi
201*4882a593Smuzhiyun	 add	%o2, 8, %o0
202*4882a593SmuzhiyunENDPROC(NG2_retl_o2_plus_8)
203*4882a593SmuzhiyunENTRY(NG2_retl_o2_plus_o4_plus_1)
204*4882a593Smuzhiyun	add	%o4, 1, %o4
205*4882a593Smuzhiyun	ba,pt	%xcc, __restore_asi
206*4882a593Smuzhiyun	 add	%o2, %o4, %o0
207*4882a593SmuzhiyunENDPROC(NG2_retl_o2_plus_o4_plus_1)
208*4882a593SmuzhiyunENTRY(NG2_retl_o2_plus_o4_plus_8)
209*4882a593Smuzhiyun	add	%o4, 8, %o4
210*4882a593Smuzhiyun	ba,pt	%xcc, __restore_asi
211*4882a593Smuzhiyun	 add	%o2, %o4, %o0
212*4882a593SmuzhiyunENDPROC(NG2_retl_o2_plus_o4_plus_8)
213*4882a593SmuzhiyunENTRY(NG2_retl_o2_plus_o4_plus_16)
214*4882a593Smuzhiyun	add	%o4, 16, %o4
215*4882a593Smuzhiyun	ba,pt	%xcc, __restore_asi
216*4882a593Smuzhiyun	 add	%o2, %o4, %o0
217*4882a593SmuzhiyunENDPROC(NG2_retl_o2_plus_o4_plus_16)
218*4882a593SmuzhiyunENTRY(NG2_retl_o2_plus_g1_fp)
219*4882a593Smuzhiyun	ba,pt	%xcc, __restore_fp
220*4882a593Smuzhiyun	 add	%o2, %g1, %o0
221*4882a593SmuzhiyunENDPROC(NG2_retl_o2_plus_g1_fp)
222*4882a593SmuzhiyunENTRY(NG2_retl_o2_plus_g1_plus_64_fp)
223*4882a593Smuzhiyun	add	%g1, 64, %g1
224*4882a593Smuzhiyun	ba,pt	%xcc, __restore_fp
225*4882a593Smuzhiyun	 add	%o2, %g1, %o0
226*4882a593SmuzhiyunENDPROC(NG2_retl_o2_plus_g1_plus_64_fp)
227*4882a593SmuzhiyunENTRY(NG2_retl_o2_plus_g1_plus_1)
228*4882a593Smuzhiyun	add	%g1, 1, %g1
229*4882a593Smuzhiyun	ba,pt	%xcc, __restore_asi
230*4882a593Smuzhiyun	 add	%o2, %g1, %o0
231*4882a593SmuzhiyunENDPROC(NG2_retl_o2_plus_g1_plus_1)
232*4882a593SmuzhiyunENTRY(NG2_retl_o2_and_7_plus_o4)
233*4882a593Smuzhiyun	and	%o2, 7, %o2
234*4882a593Smuzhiyun	ba,pt	%xcc, __restore_asi
235*4882a593Smuzhiyun	 add	%o2, %o4, %o0
236*4882a593SmuzhiyunENDPROC(NG2_retl_o2_and_7_plus_o4)
237*4882a593SmuzhiyunENTRY(NG2_retl_o2_and_7_plus_o4_plus_8)
238*4882a593Smuzhiyun	and	%o2, 7, %o2
239*4882a593Smuzhiyun	add	%o4, 8, %o4
240*4882a593Smuzhiyun	ba,pt	%xcc, __restore_asi
241*4882a593Smuzhiyun	 add	%o2, %o4, %o0
242*4882a593SmuzhiyunENDPROC(NG2_retl_o2_and_7_plus_o4_plus_8)
243*4882a593Smuzhiyun#endif
244*4882a593Smuzhiyun
/*
 * FUNC_NAME -- entry point, size dispatch and destination alignment.
 *
 * In:   %o0 = dst, %o1 = src, %o2 = len
 * Uses: %o3 keeps the original dst; the normal return paths hand it
 *       back in %o0 through EX_RETVAL(%o3).
 *
 * Size dispatch performed here:
 *   any of len bits 31..63 set -> trap 5 (tne)
 *   len == 0                   -> 85f (plain return)
 *   len <  16                  -> 80f, GLOBAL_SPARE = dst | src | len
 *   len <  4 * 64              -> 75f, condition codes = (dst | src) & 7
 *   otherwise                  -> fall through to the block-copy set-up
 */
245*4882a593Smuzhiyun	.align		64
246*4882a593Smuzhiyun
247*4882a593Smuzhiyun	.globl	FUNC_NAME
248*4882a593Smuzhiyun	.type	FUNC_NAME,#function
249*4882a593SmuzhiyunFUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
250*4882a593Smuzhiyun	srlx		%o2, 31, %g2
251*4882a593Smuzhiyun	cmp		%g2, 0
252*4882a593Smuzhiyun	tne		%xcc, 5
253*4882a593Smuzhiyun	PREAMBLE
254*4882a593Smuzhiyun	mov		%o0, %o3
255*4882a593Smuzhiyun	cmp		%o2, 0
256*4882a593Smuzhiyun	be,pn		%XCC, 85f
257*4882a593Smuzhiyun	 or		%o0, %o1, GLOBAL_SPARE
258*4882a593Smuzhiyun	cmp		%o2, 16
	/* Annulled branch: len is OR-ed into GLOBAL_SPARE only when the
	 * branch to 80f is taken.
	 */
259*4882a593Smuzhiyun	blu,a,pn	%XCC, 80f
260*4882a593Smuzhiyun	 or		GLOBAL_SPARE, %o2, GLOBAL_SPARE
261*4882a593Smuzhiyun
262*4882a593Smuzhiyun	/* 2 blocks (128 bytes) is the minimum we can do the block
263*4882a593Smuzhiyun	 * copy with.  We need to ensure that we'll iterate at least
264*4882a593Smuzhiyun	 * once in the block copy loop.  At worst we'll need to align
265*4882a593Smuzhiyun	 * the destination to a 64-byte boundary which can chew up
266*4882a593Smuzhiyun	 * to (64 - 1) bytes from the length before we perform the
267*4882a593Smuzhiyun	 * block copy loop.
268*4882a593Smuzhiyun	 *
269*4882a593Smuzhiyun	 * However, the cut-off point, performance wise, is around
270*4882a593Smuzhiyun	 * 4 64-byte blocks.
271*4882a593Smuzhiyun	 */
272*4882a593Smuzhiyun	cmp		%o2, (4 * 64)
	/* The andcc in the delay slot always executes; the code at 75
	 * branches on the condition codes it sets.
	 */
273*4882a593Smuzhiyun	blu,pt		%XCC, 75f
274*4882a593Smuzhiyun	 andcc		GLOBAL_SPARE, 0x7, %g0
275*4882a593Smuzhiyun
276*4882a593Smuzhiyun	/* %o0:	dst
277*4882a593Smuzhiyun	 * %o1:	src
278*4882a593Smuzhiyun	 * %o2:	len  (known to be >= 4 * 64 here)
279*4882a593Smuzhiyun	 *
280*4882a593Smuzhiyun	 * The block copy loops can use %o4, %g2, %g3 as
281*4882a593Smuzhiyun	 * temporaries while copying the data.  %o5 must
282*4882a593Smuzhiyun	 * be preserved between VISEntryHalf and VISExitHalf
283*4882a593Smuzhiyun	 */
284*4882a593Smuzhiyun
285*4882a593Smuzhiyun	LOAD(prefetch, %o1 + 0x000, #one_read)
286*4882a593Smuzhiyun	LOAD(prefetch, %o1 + 0x040, #one_read)
287*4882a593Smuzhiyun	LOAD(prefetch, %o1 + 0x080, #one_read)
288*4882a593Smuzhiyun
289*4882a593Smuzhiyun	/* Align destination on 64-byte boundary.  */
290*4882a593Smuzhiyun	andcc		%o0, (64 - 1), %o4
291*4882a593Smuzhiyun	be,pt		%XCC, 2f
292*4882a593Smuzhiyun	 sub		%o4, 64, %o4
	/* Reached only when dst is misaligned: the sub in the delay slot
	 * above left %o4 = (dst & 63) - 64, so negating it yields the
	 * byte count up to the next 64-byte boundary.  That count is
	 * taken off len and copied one byte at a time.
	 */
293*4882a593Smuzhiyun	sub		%g0, %o4, %o4	! bytes to align dst
294*4882a593Smuzhiyun	sub		%o2, %o4, %o2
295*4882a593Smuzhiyun1:	subcc		%o4, 1, %o4
296*4882a593Smuzhiyun	EX_LD(LOAD(ldub, %o1, %g1), NG2_retl_o2_plus_o4_plus_1)
297*4882a593Smuzhiyun	EX_ST(STORE(stb, %g1, %o0), NG2_retl_o2_plus_o4_plus_1)
298*4882a593Smuzhiyun	add		%o1, 1, %o1
299*4882a593Smuzhiyun	bne,pt		%XCC, 1b
	/* Not indented like the other delay slots, but this add is the
	 * delay-slot instruction of the bne above.
	 */
300*4882a593Smuzhiyun	add		%o0, 1, %o0
301*4882a593Smuzhiyun
/*
 * Block-copy set-up (dst is 64-byte aligned from here on).
 *
 * After VISEntryHalf and alignaddr the registers are arranged as:
 *   %o4 = src rounded up to a 64-byte boundary (block-load cursor)
 *   %g1 = len & ~63   bytes to be moved by the block loop
 *   %o2 = len &  63   tail bytes left for the byte loop
 *   %g2 = src &  63   offset of src inside its 64-byte block
 *   %o1 = src + %g1   src position once the block loop is done
 *   %g3 = dst - %o4   so that %o4 + %g3 is the current dst
 *
 * alignaddr discards its sum (%g0) and is issued for its side effect
 * of latching the low address bits of src as the faligndata offset.
 *
 * %g2 then picks one of nine loops: 190 for an aligned source, else
 * 110/120/.../180 for offsets in [1,8), [8,16), ... [56,64).  The
 * compares sitting in delay slots supply the condition codes for the
 * branch at the target label.
 */
302*4882a593Smuzhiyun2:
303*4882a593Smuzhiyun	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
304*4882a593Smuzhiyun	 * o5 from here until we hit VISExitHalf.
305*4882a593Smuzhiyun	 */
306*4882a593Smuzhiyun	VISEntryHalf
307*4882a593Smuzhiyun
308*4882a593Smuzhiyun	membar		#Sync
309*4882a593Smuzhiyun	alignaddr	%o1, %g0, %g0
310*4882a593Smuzhiyun
311*4882a593Smuzhiyun	add		%o1, (64 - 1), %o4
312*4882a593Smuzhiyun	andn		%o4, (64 - 1), %o4
313*4882a593Smuzhiyun	andn		%o2, (64 - 1), %g1
314*4882a593Smuzhiyun	sub		%o2, %g1, %o2
315*4882a593Smuzhiyun
316*4882a593Smuzhiyun	and		%o1, (64 - 1), %g2
317*4882a593Smuzhiyun	add		%o1, %g1, %o1
318*4882a593Smuzhiyun	sub		%o0, %o4, %g3
319*4882a593Smuzhiyun	brz,pt		%g2, 190f
320*4882a593Smuzhiyun	 cmp		%g2, 32
321*4882a593Smuzhiyun	blu,a		5f
322*4882a593Smuzhiyun	 cmp		%g2, 16
323*4882a593Smuzhiyun	cmp		%g2, 48
324*4882a593Smuzhiyun	blu,a		4f
325*4882a593Smuzhiyun	 cmp		%g2, 40
326*4882a593Smuzhiyun	cmp		%g2, 56
327*4882a593Smuzhiyun	blu		170f
328*4882a593Smuzhiyun	 nop
	/* ba,a annuls its delay slot, so the nop after each ba,a in this
	 * dispatch tree is never executed.
	 */
329*4882a593Smuzhiyun	ba,a,pt		%xcc, 180f
330*4882a593Smuzhiyun	 nop
331*4882a593Smuzhiyun
332*4882a593Smuzhiyun4:	/* 32 <= low bits < 48 */
333*4882a593Smuzhiyun	blu		150f
334*4882a593Smuzhiyun	 nop
335*4882a593Smuzhiyun	ba,a,pt		%xcc, 160f
336*4882a593Smuzhiyun	 nop
337*4882a593Smuzhiyun5:	/* 0 < low bits < 32 */
338*4882a593Smuzhiyun	blu,a		6f
339*4882a593Smuzhiyun	 cmp		%g2, 8
340*4882a593Smuzhiyun	cmp		%g2, 24
341*4882a593Smuzhiyun	blu		130f
342*4882a593Smuzhiyun	 nop
343*4882a593Smuzhiyun	ba,a,pt		%xcc, 140f
344*4882a593Smuzhiyun	 nop
345*4882a593Smuzhiyun6:	/* 0 < low bits < 16 */
346*4882a593Smuzhiyun	bgeu		120f
347*4882a593Smuzhiyun	 nop
348*4882a593Smuzhiyun	/* fall through for 0 < low bits < 8 */
348*4882a593Smuzhiyun	/* fall through for 0 < low bits < 8 */
/*
 * Realigning block-copy loops, one per 8-byte range of (src & 63).
 *
 * Prologue of each variant: pre-load into %f0.. the source doublewords
 * that lie between the doubleword holding src and the 64-byte boundary
 * in %o4 (a whole block for 110, then 7, 6, ... 1 doublewords).
 *
 * Loop body, identical in shape for every variant:
 *   STORE_INIT of %g0 to the dst block at %o4 + %g3
 *   LOAD_BLK   of the next 64 source bytes into %f16..%f30
 *   FREG_FROB  merges carried-over and new doublewords into %f0..%f14
 *   STORE_BLK  writes %f0..%f14 to the dst block
 *   FREG_MOVE  slides the still unused doublewords down to %f0..
 *   %g1 -= 64, %o4 += 64, repeat while %g1 != 0; the delay slot
 *   prefetches one block ahead.
 * All variants leave through 195f.
 */

/* 110: 0 < (src & 63) < 8 -- carry a full block. */
349*4882a593Smuzhiyun110:	sub		%o4, 64, %g2
350*4882a593Smuzhiyun	EX_LD_FP(LOAD_BLK(%g2, %f0), NG2_retl_o2_plus_g1)
351*4882a593Smuzhiyun1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
352*4882a593Smuzhiyun	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
353*4882a593Smuzhiyun	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
354*4882a593Smuzhiyun	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
355*4882a593Smuzhiyun	FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
356*4882a593Smuzhiyun	subcc		%g1, 64, %g1
357*4882a593Smuzhiyun	add		%o4, 64, %o4
358*4882a593Smuzhiyun	bne,pt		%xcc, 1b
359*4882a593Smuzhiyun	 LOAD(prefetch, %o4 + 64, #one_read)
360*4882a593Smuzhiyun	ba,pt		%xcc, 195f
361*4882a593Smuzhiyun	 nop
362*4882a593Smuzhiyun
/* 120: 8 <= (src & 63) < 16 -- carry 7 doublewords. */
363*4882a593Smuzhiyun120:	sub		%o4, 56, %g2
364*4882a593Smuzhiyun	FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
365*4882a593Smuzhiyun1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
366*4882a593Smuzhiyun	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
367*4882a593Smuzhiyun	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
368*4882a593Smuzhiyun	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
369*4882a593Smuzhiyun	FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
370*4882a593Smuzhiyun	subcc		%g1, 64, %g1
371*4882a593Smuzhiyun	add		%o4, 64, %o4
372*4882a593Smuzhiyun	bne,pt		%xcc, 1b
373*4882a593Smuzhiyun	 LOAD(prefetch, %o4 + 64, #one_read)
374*4882a593Smuzhiyun	ba,pt		%xcc, 195f
375*4882a593Smuzhiyun	 nop
376*4882a593Smuzhiyun
/* 130: 16 <= (src & 63) < 24 -- carry 6 doublewords. */
377*4882a593Smuzhiyun130:	sub		%o4, 48, %g2
378*4882a593Smuzhiyun	FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
379*4882a593Smuzhiyun1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
380*4882a593Smuzhiyun	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
381*4882a593Smuzhiyun	FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
382*4882a593Smuzhiyun	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
383*4882a593Smuzhiyun	FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
384*4882a593Smuzhiyun	subcc		%g1, 64, %g1
385*4882a593Smuzhiyun	add		%o4, 64, %o4
386*4882a593Smuzhiyun	bne,pt		%xcc, 1b
387*4882a593Smuzhiyun	 LOAD(prefetch, %o4 + 64, #one_read)
388*4882a593Smuzhiyun	ba,pt		%xcc, 195f
389*4882a593Smuzhiyun	 nop
390*4882a593Smuzhiyun
/* 140: 24 <= (src & 63) < 32 -- carry 5 doublewords. */
391*4882a593Smuzhiyun140:	sub		%o4, 40, %g2
392*4882a593Smuzhiyun	FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
393*4882a593Smuzhiyun1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
394*4882a593Smuzhiyun	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
395*4882a593Smuzhiyun	FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
396*4882a593Smuzhiyun	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
397*4882a593Smuzhiyun	FREG_MOVE_5(f22, f24, f26, f28, f30)
398*4882a593Smuzhiyun	subcc		%g1, 64, %g1
399*4882a593Smuzhiyun	add		%o4, 64, %o4
400*4882a593Smuzhiyun	bne,pt		%xcc, 1b
401*4882a593Smuzhiyun	 LOAD(prefetch, %o4 + 64, #one_read)
402*4882a593Smuzhiyun	ba,pt		%xcc, 195f
403*4882a593Smuzhiyun	 nop
404*4882a593Smuzhiyun
/* 150: 32 <= (src & 63) < 40 -- carry 4 doublewords. */
405*4882a593Smuzhiyun150:	sub		%o4, 32, %g2
406*4882a593Smuzhiyun	FREG_LOAD_4(%g2, f0, f2, f4, f6)
407*4882a593Smuzhiyun1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
408*4882a593Smuzhiyun	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
409*4882a593Smuzhiyun	FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
410*4882a593Smuzhiyun	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
411*4882a593Smuzhiyun	FREG_MOVE_4(f24, f26, f28, f30)
412*4882a593Smuzhiyun	subcc		%g1, 64, %g1
413*4882a593Smuzhiyun	add		%o4, 64, %o4
414*4882a593Smuzhiyun	bne,pt		%xcc, 1b
415*4882a593Smuzhiyun	 LOAD(prefetch, %o4 + 64, #one_read)
416*4882a593Smuzhiyun	ba,pt		%xcc, 195f
417*4882a593Smuzhiyun	 nop
418*4882a593Smuzhiyun
/* 160: 40 <= (src & 63) < 48 -- carry 3 doublewords. */
419*4882a593Smuzhiyun160:	sub		%o4, 24, %g2
420*4882a593Smuzhiyun	FREG_LOAD_3(%g2, f0, f2, f4)
421*4882a593Smuzhiyun1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
422*4882a593Smuzhiyun	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
423*4882a593Smuzhiyun	FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
424*4882a593Smuzhiyun	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
425*4882a593Smuzhiyun	FREG_MOVE_3(f26, f28, f30)
426*4882a593Smuzhiyun	subcc		%g1, 64, %g1
427*4882a593Smuzhiyun	add		%o4, 64, %o4
428*4882a593Smuzhiyun	bne,pt		%xcc, 1b
429*4882a593Smuzhiyun	 LOAD(prefetch, %o4 + 64, #one_read)
430*4882a593Smuzhiyun	ba,pt		%xcc, 195f
431*4882a593Smuzhiyun	 nop
432*4882a593Smuzhiyun
/* 170: 48 <= (src & 63) < 56 -- carry 2 doublewords. */
433*4882a593Smuzhiyun170:	sub		%o4, 16, %g2
434*4882a593Smuzhiyun	FREG_LOAD_2(%g2, f0, f2)
435*4882a593Smuzhiyun1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
436*4882a593Smuzhiyun	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
437*4882a593Smuzhiyun	FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
438*4882a593Smuzhiyun	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
439*4882a593Smuzhiyun	FREG_MOVE_2(f28, f30)
440*4882a593Smuzhiyun	subcc		%g1, 64, %g1
441*4882a593Smuzhiyun	add		%o4, 64, %o4
442*4882a593Smuzhiyun	bne,pt		%xcc, 1b
443*4882a593Smuzhiyun	 LOAD(prefetch, %o4 + 64, #one_read)
444*4882a593Smuzhiyun	ba,pt		%xcc, 195f
445*4882a593Smuzhiyun	 nop
446*4882a593Smuzhiyun
/* 180: 56 <= (src & 63) < 64 -- carry 1 doubleword. */
447*4882a593Smuzhiyun180:	sub		%o4, 8, %g2
448*4882a593Smuzhiyun	FREG_LOAD_1(%g2, f0)
449*4882a593Smuzhiyun1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
450*4882a593Smuzhiyun	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
451*4882a593Smuzhiyun	FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
452*4882a593Smuzhiyun	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
453*4882a593Smuzhiyun	FREG_MOVE_1(f30)
454*4882a593Smuzhiyun	subcc		%g1, 64, %g1
455*4882a593Smuzhiyun	add		%o4, 64, %o4
456*4882a593Smuzhiyun	bne,pt		%xcc, 1b
457*4882a593Smuzhiyun	 LOAD(prefetch, %o4 + 64, #one_read)
458*4882a593Smuzhiyun	ba,pt		%xcc, 195f
459*4882a593Smuzhiyun	 nop
460*4882a593Smuzhiyun
/*
 * 190: block loop for a 64-byte aligned source -- each block is loaded
 * into %f0.. and stored straight back out, with no realignment step.
 * %g1 is decremented between the STORE_INIT and the block load/store,
 * which is why the later two accesses name the ..._plus_64 fixup.
 */
461*4882a593Smuzhiyun190:
462*4882a593Smuzhiyun1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
463*4882a593Smuzhiyun	subcc		%g1, 64, %g1
464*4882a593Smuzhiyun	EX_LD_FP(LOAD_BLK(%o4, %f0), NG2_retl_o2_plus_g1_plus_64)
465*4882a593Smuzhiyun	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1_plus_64)
466*4882a593Smuzhiyun	add		%o4, 64, %o4
467*4882a593Smuzhiyun	bne,pt		%xcc, 1b
468*4882a593Smuzhiyun	 LOAD(prefetch, %o4 + 64, #one_read)
469*4882a593Smuzhiyun
/*
 * 195: common exit of all block loops.  Rebuilds dst in %o0 from the
 * load cursor and the dst - cursor offset, issues membar #Sync and
 * runs VISExitHalf.
 */
470*4882a593Smuzhiyun195:
471*4882a593Smuzhiyun	add		%o4, %g3, %o0
472*4882a593Smuzhiyun	membar		#Sync
473*4882a593Smuzhiyun
474*4882a593Smuzhiyun	VISExitHalf
475*4882a593Smuzhiyun
476*4882a593Smuzhiyun	/* %o2 contains any final bytes still needed to be copied
477*4882a593Smuzhiyun	 * over. If anything is left, we copy it one byte at a time.
478*4882a593Smuzhiyun	 */
	/* The delay slot of brz always executes, so GLOBAL_SPARE is set
	 * to dst - src, which the byte loop at 90 uses to form its store
	 * address.  The ba,a annuls the nop that follows it.
	 */
479*4882a593Smuzhiyun	brz,pt		%o2, 85f
480*4882a593Smuzhiyun	 sub		%o0, %o1, GLOBAL_SPARE
481*4882a593Smuzhiyun	ba,a,pt		%XCC, 90f
482*4882a593Smuzhiyun	 nop
483*4882a593Smuzhiyun
/*
 * Medium-size path.  The entry code arrives here with the condition
 * codes set from (dst | src) & 7.  If either pointer is not 8-byte
 * aligned, control moves on to the second label 75 further down;
 * otherwise the copy proceeds with 64-bit loads and stores.
 *
 * Throughout this path GLOBAL_SPARE = dst - src, so the store address
 * is always formed as %o1 + GLOBAL_SPARE and only %o1 is advanced.
 *
 *   72: %o4 = len & ~15 is copied 16 bytes per iteration,
 *       %o2 = len &  15 is what remains afterwards.
 *   73: copies a further 8 and/or 4 bytes according to bits 3 and 2 of
 *       %o2; anything still left goes to the byte loop at 90.
 */
484*4882a593Smuzhiyun	.align		64
485*4882a593Smuzhiyun75: /* 16 <= len < 4 * 64 */
486*4882a593Smuzhiyun	bne,pn		%XCC, 75f
487*4882a593Smuzhiyun	 sub		%o0, %o1, GLOBAL_SPARE
488*4882a593Smuzhiyun
489*4882a593Smuzhiyun72:
490*4882a593Smuzhiyun	andn		%o2, 0xf, %o4
491*4882a593Smuzhiyun	and		%o2, 0xf, %o2
	/* %o1 is stepped to src + 8 for the second load, back to src for
	 * the first store and forward again for the second store; the
	 * final + 8 sits in the loop branch delay slot.
	 */
492*4882a593Smuzhiyun1:	subcc		%o4, 0x10, %o4
493*4882a593Smuzhiyun	EX_LD(LOAD(ldx, %o1, %o5), NG2_retl_o2_plus_o4_plus_16)
494*4882a593Smuzhiyun	add		%o1, 0x08, %o1
495*4882a593Smuzhiyun	EX_LD(LOAD(ldx, %o1, %g1), NG2_retl_o2_plus_o4_plus_16)
496*4882a593Smuzhiyun	sub		%o1, 0x08, %o1
497*4882a593Smuzhiyun	EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_o4_plus_16)
498*4882a593Smuzhiyun	add		%o1, 0x8, %o1
499*4882a593Smuzhiyun	EX_ST(STORE(stx, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_o4_plus_8)
500*4882a593Smuzhiyun	bgu,pt		%XCC, 1b
501*4882a593Smuzhiyun	 add		%o1, 0x8, %o1
502*4882a593Smuzhiyun73:	andcc		%o2, 0x8, %g0
503*4882a593Smuzhiyun	be,pt		%XCC, 1f
504*4882a593Smuzhiyun	 nop
505*4882a593Smuzhiyun	sub		%o2, 0x8, %o2
506*4882a593Smuzhiyun	EX_LD(LOAD(ldx, %o1, %o5), NG2_retl_o2_plus_8)
507*4882a593Smuzhiyun	EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_8)
508*4882a593Smuzhiyun	add		%o1, 0x8, %o1
509*4882a593Smuzhiyun1:	andcc		%o2, 0x4, %g0
510*4882a593Smuzhiyun	be,pt		%XCC, 1f
511*4882a593Smuzhiyun	 nop
512*4882a593Smuzhiyun	sub		%o2, 0x4, %o2
513*4882a593Smuzhiyun	EX_LD(LOAD(lduw, %o1, %o5), NG2_retl_o2_plus_4)
514*4882a593Smuzhiyun	EX_ST(STORE(stw, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_4)
515*4882a593Smuzhiyun	add		%o1, 0x4, %o1
516*4882a593Smuzhiyun1:	cmp		%o2, 0
517*4882a593Smuzhiyun	be,pt		%XCC, 85f
518*4882a593Smuzhiyun	 nop
519*4882a593Smuzhiyun	ba,pt		%xcc, 90f
520*4882a593Smuzhiyun	 nop
521*4882a593Smuzhiyun
/*
 * Medium-size path when dst and src are not both 8-byte aligned.
 * On entry GLOBAL_SPARE = dst - src.
 *
 * Step 1 (75 / 1): if dst is not 8-byte aligned, copy 8 - (dst & 7)
 *         bytes one at a time so that it becomes aligned.  The andcc
 *         sets the flags tested by be; the two sub instructions around
 *         it do not touch the condition codes.
 * Step 2 (2): rebuild dst in %o0.  If src is now 8-byte aligned as
 *         well, rejoin the aligned code at 72b (len >= 16) or 73b.
 * Step 3 (8): src is misaligned by %g1 / 8 bytes.  Read aligned 64-bit
 *         words from src rounded down and combine each neighbouring
 *         pair with shifts:
 *             out = (prev << %g1) | (next >> (64 - %g1))
 *         %g1 holds the shift in bits, GLOBAL_SPARE holds 64 - %g1,
 *         %g2 carries (prev << %g1) between iterations and
 *         %o4 = len & ~7 counts the bytes produced by this loop.
 *         Afterwards %g1 is turned back into a byte count and added to
 *         %o1 to restore the true src position; the len & 7 leftover
 *         bytes go to the byte loop at 90 with GLOBAL_SPARE reset to
 *         dst - src.
 */
522*4882a593Smuzhiyun75:
523*4882a593Smuzhiyun	andcc		%o0, 0x7, %g1
524*4882a593Smuzhiyun	sub		%g1, 0x8, %g1
525*4882a593Smuzhiyun	be,pn		%icc, 2f
526*4882a593Smuzhiyun	 sub		%g0, %g1, %g1
527*4882a593Smuzhiyun	sub		%o2, %g1, %o2
528*4882a593Smuzhiyun
529*4882a593Smuzhiyun1:	subcc		%g1, 1, %g1
530*4882a593Smuzhiyun	EX_LD(LOAD(ldub, %o1, %o5), NG2_retl_o2_plus_g1_plus_1)
531*4882a593Smuzhiyun	EX_ST(STORE(stb, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_g1_plus_1)
532*4882a593Smuzhiyun	bgu,pt		%icc, 1b
533*4882a593Smuzhiyun	 add		%o1, 1, %o1
534*4882a593Smuzhiyun
535*4882a593Smuzhiyun2:	add		%o1, GLOBAL_SPARE, %o0
536*4882a593Smuzhiyun	andcc		%o1, 0x7, %g1
	/* The sll in the delay slot always executes and converts the byte
	 * misalignment into a bit count; it matters only on the taken
	 * path, since 72b and 73b do not read %g1 before rewriting it.
	 */
537*4882a593Smuzhiyun	bne,pt		%icc, 8f
538*4882a593Smuzhiyun	 sll		%g1, 3, %g1
539*4882a593Smuzhiyun
540*4882a593Smuzhiyun	cmp		%o2, 16
541*4882a593Smuzhiyun	bgeu,pt		%icc, 72b
542*4882a593Smuzhiyun	 nop
543*4882a593Smuzhiyun	ba,a,pt		%xcc, 73b
544*4882a593Smuzhiyun
545*4882a593Smuzhiyun8:	mov		64, GLOBAL_SPARE
546*4882a593Smuzhiyun	andn		%o1, 0x7, %o1
547*4882a593Smuzhiyun	EX_LD(LOAD(ldx, %o1, %g2), NG2_retl_o2)
548*4882a593Smuzhiyun	sub		GLOBAL_SPARE, %g1, GLOBAL_SPARE
549*4882a593Smuzhiyun	andn		%o2, 0x7, %o4
550*4882a593Smuzhiyun	sllx		%g2, %g1, %g2
551*4882a593Smuzhiyun1:	add		%o1, 0x8, %o1
552*4882a593Smuzhiyun	EX_LD(LOAD(ldx, %o1, %g3), NG2_retl_o2_and_7_plus_o4)
553*4882a593Smuzhiyun	subcc		%o4, 0x8, %o4
554*4882a593Smuzhiyun	srlx		%g3, GLOBAL_SPARE, %o5
555*4882a593Smuzhiyun	or		%o5, %g2, %o5
556*4882a593Smuzhiyun	EX_ST(STORE(stx, %o5, %o0), NG2_retl_o2_and_7_plus_o4_plus_8)
557*4882a593Smuzhiyun	add		%o0, 0x8, %o0
558*4882a593Smuzhiyun	bgu,pt		%icc, 1b
559*4882a593Smuzhiyun	 sllx		%g3, %g1, %g2
560*4882a593Smuzhiyun
561*4882a593Smuzhiyun	srl		%g1, 3, %g1
562*4882a593Smuzhiyun	andcc		%o2, 0x7, %o2
563*4882a593Smuzhiyun	be,pn		%icc, 85f
564*4882a593Smuzhiyun	 add		%o1, %g1, %o1
565*4882a593Smuzhiyun	ba,pt		%xcc, 90f
566*4882a593Smuzhiyun	 sub		%o0, %o1, GLOBAL_SPARE
567*4882a593Smuzhiyun
/*
 * Small copies and common tails.
 *
 *   80: entered with GLOBAL_SPARE = dst | src | len.  If any of the low
 *       two bits is set the copy is done by the byte loop at 90;
 *       otherwise it is done 4 bytes per iteration.  In both cases the
 *       delay slot first turns GLOBAL_SPARE into dst - src, the offset
 *       used to form the store address from %o1.
 *   85: normal return; %o0 = EX_RETVAL(%o3), set in the retl delay slot.
 *   90: byte-at-a-time loop over the remaining %o2 bytes, then the same
 *       return sequence as 85.
 */
568*4882a593Smuzhiyun	.align		64
569*4882a593Smuzhiyun80: /* 0 < len < 16 */
570*4882a593Smuzhiyun	andcc		GLOBAL_SPARE, 0x3, %g0
571*4882a593Smuzhiyun	bne,pn		%XCC, 90f
572*4882a593Smuzhiyun	 sub		%o0, %o1, GLOBAL_SPARE
573*4882a593Smuzhiyun
574*4882a593Smuzhiyun1:
575*4882a593Smuzhiyun	subcc		%o2, 4, %o2
576*4882a593Smuzhiyun	EX_LD(LOAD(lduw, %o1, %g1), NG2_retl_o2_plus_4)
577*4882a593Smuzhiyun	EX_ST(STORE(stw, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_4)
578*4882a593Smuzhiyun	bgu,pt		%XCC, 1b
579*4882a593Smuzhiyun	 add		%o1, 4, %o1
580*4882a593Smuzhiyun
581*4882a593Smuzhiyun85:	retl
582*4882a593Smuzhiyun	 mov		EX_RETVAL(%o3), %o0
583*4882a593Smuzhiyun
584*4882a593Smuzhiyun	.align		32
585*4882a593Smuzhiyun90:
586*4882a593Smuzhiyun	subcc		%o2, 1, %o2
587*4882a593Smuzhiyun	EX_LD(LOAD(ldub, %o1, %g1), NG2_retl_o2_plus_1)
588*4882a593Smuzhiyun	EX_ST(STORE(stb, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_1)
589*4882a593Smuzhiyun	bgu,pt		%XCC, 90b
590*4882a593Smuzhiyun	 add		%o1, 1, %o1
591*4882a593Smuzhiyun	retl
592*4882a593Smuzhiyun	 mov		EX_RETVAL(%o3), %o0
593*4882a593Smuzhiyun
594*4882a593Smuzhiyun	.size		FUNC_NAME, .-FUNC_NAME
595