/*
 * M7memcpy: Optimized SPARC M7 memcpy
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"M7memcpy.S"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *		    char *s1 = s;
 *		    const char *s2 = s0;
 *		    do {
 *			*s1++ = *s2++;
 *		    } while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 *
 * SPARC T7/M7 Flow :
 *
 * if (count < SMALL_MAX) {
 *   if count < SHORTCOPY              (SHORTCOPY=3)
 *	copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *     copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *     copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= SHORTCHECK   (SHORTCHECK=14)
 *     copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *     copy words; branch to finish_up
 *   if dst now on half word boundary
 *     load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *     load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *     load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *     copy bytes; exit with dst addr
 * } else {                                         More than SMALL_MAX bytes
 *   move bytes until dst is on long word boundary
 *   if( src is on long word boundary ) {
 *     if (count < MED_MAX) {
 * finish_long:					   src/dst aligned on 8 bytes
 *       copy with ldx/stx in 8-way unrolled loop;
 *       copy final 0-63 bytes; exit with dst addr
 *     } else {				     src/dst aligned; count > MED_MAX
 *       align dst on 64 byte boundary; for main data movement:
 *       prefetch src data to L2 cache; let HW prefetch move data to L1 cache
 *       Use BIS (block initializing store) to avoid fetching the stored-to
 *       cache lines from memory. But pre-store the first element of each
 *       cache line ST_CHUNK lines in advance of the rest of that cache
 *       line. That gives replacement cache lines time to be written back
 *       without excess STQ and Miss Buffer filling. Repeat until near the
 *       end, then finish up storing before going to finish_long.
 *     }
 *   } else {                                   src/dst not aligned on 8 bytes
 *     if src is word aligned and count < MED_WMAX
 *       move words in 8-way unrolled loop
 *       move final 0-31 bytes; exit with dst addr
 *     if count < MED_UMAX
 *       use alignaddr/faligndata combined with ldd/std in 8-way
 *       unrolled loop to move data.
 *       go to unalign_done
 *     else
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; prefetch src data to L1 cache
 *       loadx8, falign, block-store, prefetch loop
 *	 (only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *       move remaining bytes for unaligned cases. exit with dst addr.
 * }
 *
 */
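
/*
 * For orientation, the size dispatch described above corresponds roughly
 * to the C sketch below. This is illustrative only: SHORTCOPY, SMALL_MAX
 * and MED_MAX are the assembler constants defined later in this file, and
 * the helper names are hypothetical stand-ins for the labelled code paths.
 *
 *	void *m7_memcpy_sketch(void *dst, const void *src, size_t n)
 *	{
 *		if (n <= SHORTCOPY)
 *			return copy_bytes(dst, src, n);		    tiny case
 *		if (n < SMALL_MAX)
 *			return copy_small(dst, src, n);	      short int copies
 *		if (aligned_on_8(dst, src)) {
 *			if (n <= MED_MAX)
 *				return copy_medium_aligned(dst, src, n);
 *			return copy_large_bis(dst, src, n);	     BIS loop
 *		}
 *		return copy_unaligned(dst, src, n);	   faligndata/VIS path
 *	}
 */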

#include <asm/visasm.h>
#include <asm/asi.h>

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)    x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif
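
/*
 * With the defaults above (no EX_LD/EX_ST defined), the accessor macros
 * expand to the plain instruction, e.g. LOAD(ldx, %o1, %o4) becomes
 * "ldx [%o1], %o4" and STORE(stx, %o4, %o0) becomes "stx %o4, [%o0]",
 * and the EX_* wrappers pass the access through unchanged.  Copy-to/from
 * user wrappers are expected to redefine them so that a faulting access
 * branches to the named memcpy_retl_* fixup label, whose name encodes how
 * many bytes were still left to copy at that point.
 */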

/*
 * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
 * line as "least recently used" which means if many threads are
 * active, it has a high probability of being pushed out of the cache
 * between the first initializing store and the final stores.
 * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which
 * marks the cache line as "most recently used" for all
 * but the last cache line
 */
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_MRU_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_MRU_ASI	ASI_ST_BLKINIT_MRU_P
#else
#define STORE_MRU_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef STORE_INIT_MRU
#define STORE_INIT_MRU(src,addr)	stxa src, [addr] STORE_MRU_ASI
#endif
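
/*
 * Expansion sketch: with the defaults above, STORE_INIT_MRU(%o4, %o0)
 * becomes "stxa %o4, [%o0] ASI_ST_BLKINIT_MRU_P", a block-initializing
 * store that leaves the line most recently used, while STORE_INIT(%o4, %o0)
 * uses ASI_BLK_INIT_QUAD_LDD_P so the final store to each cache line lets
 * the line age out normally, as described in the comment above.
 */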

#ifndef FUNC_NAME
#define FUNC_NAME	M7memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#define	BLOCK_SIZE	64
#define	SHORTCOPY	3
#define	SHORTCHECK	14
#define	SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 64 */
#define	SMALL_MAX	128
#define	MED_UMAX	1024	/* max copy for medium un-aligned case */
#define	MED_WMAX	1024	/* max copy for medium word-aligned case */
#define	MED_MAX		1024	/* max copy for medium longword-aligned case */
#define ST_CHUNK	24	/* ST_CHUNK - block of values for BIS Store */
#define ALIGN_PRE	24	/* distance for aligned prefetch loop */

	.register	%g2,#scratch

	.section	".text"
	.global		FUNC_NAME
	.type		FUNC_NAME, #function
	.align		16
FUNC_NAME:
	srlx            %o2, 31, %g2
	cmp             %g2, 0
	tne             %xcc, 5
	PREAMBLE
	mov		%o0, %g1	! save %o0
	brz,pn          %o2, .Lsmallx
	 cmp            %o2, 3
	ble,pn          %icc, .Ltiny_cp
	 cmp            %o2, 19
	ble,pn          %icc, .Lsmall_cp
	 or             %o0, %o1, %g2
	cmp             %o2, SMALL_MAX
	bl,pn           %icc, .Lmedium_cp
	 nop

.Lmedium:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	brz,pt	%o5, .Ldst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	 sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		! %o1 gets the difference
7:					! dst aligning loop
	add	%o1, %o0, %o4
	EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5)	! load one byte
	subcc	%o5, 1, %o5
	EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
	bgu,pt	%xcc, 7b
	 add	%o0, 1, %o0		! advance dst
	add	%o1, %o0, %o1		! restore %o1
.Ldst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
	 nop

.Lsrc_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	set MED_MAX, %o3
	cmp %o2, %o3 			! limit to store buffer size
	bgu,pn	%xcc, .Llarge_align8_copy
	 nop

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.Lmedlong:
	subcc	%o2, 63, %o2		! adjust length to allow cc test
	ble,pn	%xcc, .Lmedl63		! skip big loop if less than 64 bytes
	 nop
.Lmedl64:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63)	! load
	subcc	%o2, 64, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56)	! a block of 64
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
	EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
	EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)! load and store
	EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
	EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)! a block of 64
	add	%o1, 64, %o1		! increase src ptr by 64
	EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
	add	%o0, 64, %o0		! increase dst ptr by 64
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
	bgu,pt	%xcc, .Lmedl64		! repeat if at least 64 bytes left
	 EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
.Lmedl63:
	addcc	%o2, 32, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl31		! to skip if 31 or fewer bytes left
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31)	! load
	sub	%o2, 32, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24)	! a block of 32
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl15		! skip if 15 or fewer bytes left
	 nop				!
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
	sub	%o2, 16, %o2		! decrease count by 16
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
.Lmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%xcc, .Lsmallx	! exit if finished
	 cmp	%o2, 8
	blt,pt	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	 tst	%o2
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)	! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	add	%o0, 8, %o0		! increase dst ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz,pn	%xcc, .Lmedw7
	 EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)	! and store 8
	retl
	 mov	EX_RETVAL(%g1), %o0	! restore %o0

	.align 16
.Lsrc_dst_unaligned_on_8:
	! DST is 8-byte aligned, src is not
2:
	andcc	%o1, 0x3, %o5		! test word alignment
	bnz,pt	%xcc, .Lunalignsetup	! branch to skip if not word aligned
	 nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * source and destination are in the cache, for .Lmedium
 * to short data moves.
 */
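
/*
 * The loops below merge each pair of 32-bit loads into one 64-bit store,
 * roughly "dst64 = ((u64)first_word << 32) | second_word" in C: sllx the
 * first word up by 32, or in the second, then a single stx.  (Sketch only;
 * since SPARC is big-endian the word from the lower source address forms
 * the more significant half of the stored doubleword.)
 */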
	set MED_WMAX, %o3
	cmp %o2, %o3 			! limit to store buffer size
	bge,pt	%xcc, .Lunalignrejoin	! otherwise rejoin main loop
	 nop

	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%xcc, .Lmedw31		! skip big loop if less than 32 bytes
.Lmedw32:
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)! move a block of 32
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
	subcc	%o2, 32, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
	or	%o4, %o5, %o5
	bgu,pt	%xcc, .Lmedw32		! repeat if at least 32 bytes left
	 EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedw31:
	addcc	%o2, 31, %o2		! restore count

	bz,pt	%xcc, .Lsmallx	! exit if finished
	 nop
	cmp	%o2, 16
	blt,pt	%xcc, .Lmedw15
	 nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)! move a block of 16 bytes
	sllx	%o4, 32, %o5
	subcc	%o2, 16, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
.Lmedw15:
	bz,pt	%xcc, .Lsmallx	! exit if finished
	 cmp	%o2, 8
	blt,pn	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	 tst	%o2
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4)	! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
	bz,pt	%xcc, .Lsmallx	! exit if finished
.Lmedw7:				! count is ge 1, less than 8
	cmp	%o2, 4			! check for 4 bytes left
	blt,pn	%xcc, .Lsmallleft3	! skip if 3 or fewer bytes left
	 nop				!
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	add	%o1, 4, %o1		! increase src ptr by 4
	add	%o0, 4, %o0		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.Lsmallleft3
	 EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
	retl
	 mov	EX_RETVAL(%g1), %o0

	.align 16
.Llarge_align8_copy:			! Src and dst share 8 byte alignment
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .Laligned_to_64
	 andcc	%o0, 8, %o3		! odd long words to move?
	brz,pt	%o3, .Laligned_to_16
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		! increment src ptr
	add	%o0, 8, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_16:
	andcc	%o0, 16, %o3		! pair of long words to move?
	brz,pt	%o3, .Laligned_to_32
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 16, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_32:
	andcc	%o0, 32, %o3		! four long words to move?
	brz,pt	%o3, .Laligned_to_64
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 32, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
	add	%o1, 32, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 32, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_64:
!
!	Using block init store (BIS) instructions to avoid fetching cache
!	lines from memory. Use ST_CHUNK stores to first element of each cache
!	line (similar to prefetching) to avoid overfilling STQ or miss buffers.
!	Gives existing cache lines time to be moved out of L1/L2/L3 cache.
!	Initial stores using MRU version of BIS to keep cache line in
!	cache until we are ready to store final element of cache line.
!	Then store last element using the LRU version of BIS.
!
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
!
!	We use STORE_MRU_ASI for the first seven stores to each cache line
!	followed by STORE_ASI (mark as LRU) for the last store. That
!	mixed approach reduces the probability that the cache line is removed
!	before we finish setting it, while minimizing the effects on
!	other cached values during a large memcpy.
!
!	ST_CHUNK batches up initial BIS operations for several cache lines
!	so that multiple requests are not blocked by overflowing the
!	store miss buffer. Then the matching stores for all those
!	BIS operations are executed.
!

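!
!	Rough shape of the two loops below, as a C-like sketch (illustrative
!	only; "line" here just means a 64-byte block, not a register or
!	variable in the code):
!
!	    while (remaining >= ST_CHUNK*64) {
!		for (i = 0; i < ST_CHUNK; i++)		.Lalign_loop_start
!		    store dword 0 of line i with BIS, marked MRU;
!		for (i = 0; i < ST_CHUNK; i++)		.Lalign_loop_rest
!		    store dwords 1-6 of line i with BIS, marked MRU,
!		    and dword 7 with BIS, marked LRU;
!		remaining -= ST_CHUNK*64;
!	    }
!	    copy leftover whole lines with plain ldx/stx (.Lalign_loop_fin)
!
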
	sub	%o0, 8, %o0		! adjust %o0 for ASI alignment
.Lalign_loop:
	cmp	%o5, ST_CHUNK*64
	blu,pt	%xcc, .Lalign_loop_fin
	 mov	ST_CHUNK,%o3
.Lalign_loop_start:
	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	bgu	%xcc,.Lalign_loop_start
	 add	%o0, 56, %o0

	mov	ST_CHUNK,%o3
	sllx	%o3, 6, %o4		! ST_CHUNK*64
	sub	%o1, %o4, %o1		! reset %o1
	sub	%o0, %o4, %o0		! reset %o0

.Lalign_loop_rest:
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 16, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
	sub	%o5, 64, %o5
	bgu	%xcc,.Lalign_loop_rest
	! mark cache line as LRU
	 EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)

	cmp	%o5, ST_CHUNK*64
	bgu,pt	%xcc, .Lalign_loop_start
	 mov	ST_CHUNK,%o3

	cmp	%o5, 0
	beq	.Lalign_done
	 nop
.Lalign_loop_fin:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
	add	%o1, 64, %o1
	EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
	add	%o0, 64, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
	bgu	%xcc,.Lalign_loop_fin
	 EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)

.Lalign_done:
	add	%o0, 8, %o0		! restore %o0 from ASI alignment
	membar	#StoreStore
	sub	%o2, 63, %o2		! adjust length to allow cc test
	ba	.Lmedl63		! in .Lmedl63
	 nop

	.align 16
	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.Lunalignsetup:
.Lunalignrejoin:
	mov	%g1, %o3	! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
	VISEntryHalf
#endif
	mov	%o3, %g1	! restore %g1

	set MED_UMAX, %o3
	cmp %o2, %o3 		! check for .Lmedium unaligned limit
	bge,pt	%xcc,.Lunalign_large
	 prefetch [%o1 + (4 * BLOCK_SIZE)], 20
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	cmp	%o2, 8			! Insure we do not load beyond
	bgt	.Lunalign_adjust	! end of source buffer
	 andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o2, 64, %o2		! adjust to leave loop
	sub	%o5, 64, %o5		! early if necessary
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
.Lunalign_loop:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	faligndata %f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
	subcc	%o5, BLOCK_SIZE, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
	faligndata %f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
	EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
	EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
	faligndata %f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
	add	%o4, BLOCK_SIZE, %o4
	EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
	faligndata %f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
	EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
	faligndata %f14, %f0, %f30
	EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, BLOCK_SIZE, %o0
	bgu,pt	%xcc, .Lunalign_loop
	 prefetch [%o4 + (5 * BLOCK_SIZE)], 20
	ba	.Lunalign_done
	 nop

.Lunalign_large:
	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%xcc, .Lunalignsrc
	 sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%o1, 0x1, %o5
	bnz	%xcc, .Lunalignbyte	! check for byte alignment
	 nop
	andcc	%o1, 2, %o5		! check for half word alignment
	bnz	%xcc, .Lunalignhalf
	 nop
	! Src is word aligned
.Lunalignword:
	EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3)	! and store 4
	subcc	%o3, 8, %o3		! decrease count by 8
	EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)! load 4
	add	%o0, 8, %o0		! increase dst ptr by 8
	bnz	%xcc, .Lunalignword
	 EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
	ba	.Lunalignsrc
	 nop

	! Src is half-word aligned
.Lunalignhalf:
	EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 2 bytes
	sllx	%o4, 32, %o5		! shift left
	EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	sllx	%o5, 16, %o5
	EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	add	%o1, 8, %o1
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignhalf
	 add	%o0, 8, %o0
	ba	.Lunalignsrc
	 nop

	! Src is Byte aligned
.Lunalignbyte:
	sub	%o0, %o1, %o0		! share pointer advance
.Lunalignbyte_loop:
	EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 56, %o5
	EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 40, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 24, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4,  8, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	add	%o0, %o1, %o0
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	sub	%o0, %o1, %o0
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignbyte_loop
	 add	%o1, 8, %o1
	add	%o0,%o1, %o0 		! restore pointer

	! Destination is now block (64 byte) aligned
.Lunalignsrc:
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	add	%o2, 64, %o2		! Insure we do not load beyond
	sub	%o5, 64, %o5		! end of source buffer

	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks

	EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
.Lunalign_sloop:
	EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
	faligndata %f14, %f16, %f0
	EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
	faligndata %f16, %f18, %f2
	EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
	faligndata %f18, %f20, %f4
	EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f20, %f22, %f6
	EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f22, %f24, %f8
	EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f24, %f26, %f10
	EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f26, %f28, %f12
	EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40)
	add	%o4, 64, %o4
	EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f28, %f30, %f14
	EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40)
	add	%o0, 64, %o0
	EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40)
	fsrc2	%f30, %f14
	bgu,pt	%xcc, .Lunalign_sloop
	 prefetch [%o4 + (8 * BLOCK_SIZE)], 20

.Lunalign_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp	%o2, 15
	bleu	%xcc, .Lunalign_short

	 andn	%o2, 0x7, %o5		! %o5 is multiple of 8
	and	%o2, 0x7, %o2		! residue bytes in %o2
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		! insure we do not load past end of src
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o1, %o5, %o1		! advance %o1 to after multiple of 8
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)! fetch partial word
.Lunalign_by8:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc	%o5, 8, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5)
	fsrc2	%f2, %f0
	bgu,pt	%xcc, .Lunalign_by8
	 add	%o0, 8, %o0

.Lunalign_short:
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	ba	.Lsmallrest
	 nop

/*
 * This is a special case of nested memcpy. This can happen when the kernel
 * calls unaligned memcpy back to back without saving FP registers. We need
 * traps (context switch) to save/restore FP registers. If the kernel calls
 * memcpy without this trap sequence we will hit FP corruption. Let's use
 * the normal integer load/store method in this case.
 */

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
	or	%o0, %o1, %g2
#endif
.Lmedium_cp:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc	%g2, 0x7, %g0
	bne,pn	%xcc, .Lmedium_unaligned_cp
	 nop

.Lmedium_noprefetch_cp:
	andncc	%o2, 0x20 - 1, %o5
	be,pn	%xcc, 2f
	 sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 0x20, %o1
	subcc	%o5, 0x20, %o5
	EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
	bne,pt	%xcc, 1b
	 add	%o0, 0x20, %o0
2:	andcc	%o2, 0x18, %o5
	be,pt	%xcc, 3f
	 sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	add	%o0, 0x08, %o0
	subcc	%o5, 0x08, %o5
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3:	brz,pt	%o2, .Lexit_cp
	 cmp	%o2, 0x04
	bl,pn	%xcc, .Ltiny_cp
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 0x04, %o1
	add	%o0, 0x04, %o0
	subcc	%o2, 0x04, %o2
	bne,pn	%xcc, .Ltiny_cp
	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
	ba,a,pt	%xcc, .Lexit_cp

.Lmedium_unaligned_cp:
	/* First get dest 8 byte aligned.  */
	sub	%g0, %o0, %o3
	and	%o3, 0x7, %o3
	brz,pt	%o3, 2f
	 sub	%o2, %o3, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
	add	%o1, 1, %o1
	subcc	%o3, 1, %o3
	add	%o0, 1, %o0
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
2:
	and	%o1, 0x7, %o3
	brz,pn	%o3, .Lmedium_noprefetch_cp
	 sll	%o3, 3, %o3
	mov	64, %g2
	sub	%g2, %o3, %g2
	andn	%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
	sllx	%o4, %o3, %o4
	andn	%o2, 0x08 - 1, %o5
	sub	%o2, %o5, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	subcc	%o5, 0x08, %o5
	srlx	%g3, %g2, %g7
	or	%g7, %o4, %g7
	EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, 0x08, %o0
	bne,pt	%xcc, 1b
	 sllx	%g3, %o3, %o4
	srl	%o3, 3, %o3
	add	%o1, %o3, %o1
	brz,pn	%o2, .Lexit_cp
	 nop
	ba,pt	%xcc, .Lsmall_unaligned_cp

.Ltiny_cp:
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
	ba,pt	%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)

.Lsmall_cp:
	andcc	%g2, 0x3, %g0
	bne,pn	%xcc, .Lsmall_unaligned_cp
	 andn	%o2, 0x4 - 1, %o5
	sub	%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x04, %o1
	subcc	%o5, 0x04, %o5
	add	%o0, 0x04, %o0
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
	brz,pt	%o2, .Lexit_cp
	 nop
	ba,a,pt	%xcc, .Ltiny_cp

.Lsmall_unaligned_cp:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 1, %o1
	add	%o0, 1, %o0
	subcc	%o2, 1, %o2
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
	ba,a,pt	%xcc, .Lexit_cp

.Lsmallrest:
	tst	%o2
	bz,pt	%xcc, .Lsmallx
	 cmp	%o2, 4
	blt,pn	%xcc, .Lsmallleft3
	 nop
	sub	%o2, 3, %o2
.Lsmallnotalign4:
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)! write byte & repeat
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)! for total of 4
	add	%o1, 4, %o1		! advance SRC by 4
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
	EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
	add	%o0, 4, %o0		! advance DST by 4
	EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
	EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
	bgu,pt	%xcc, .Lsmallnotalign4	! loop until 3 or fewer bytes remain
	EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%xcc, .Lsmallx
.Lsmallleft3:				! 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)	! load one byte
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1)	! store one byte
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)	! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)! store second byte
	EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2)	! load third byte
	EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2)	! store third byte
.Lsmallx:
	retl
	 mov	EX_RETVAL(%g1), %o0
.Lsmallfin:
	tst	%o2
	bnz,pn	%xcc, .Lsmallleft3
	 nop
	retl
	 mov	EX_RETVAL(%g1), %o0	! restore %o0
.Lexit_cp:
	retl
	 mov	EX_RETVAL(%g1), %o0
	.size  FUNC_NAME, .-FUNC_NAME