xref: /OK3568_Linux_fs/kernel/arch/microblaze/lib/fastcopy.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/*
2*4882a593Smuzhiyun * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
3*4882a593Smuzhiyun * Copyright (C) 2008-2009 PetaLogix
4*4882a593Smuzhiyun * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
5*4882a593Smuzhiyun *
6*4882a593Smuzhiyun * This file is subject to the terms and conditions of the GNU General
7*4882a593Smuzhiyun * Public License.  See the file COPYING in the main directory of this
8*4882a593Smuzhiyun * archive for more details.
9*4882a593Smuzhiyun *
10*4882a593Smuzhiyun * Written by Jim Law <jlaw@irispower.com>
11*4882a593Smuzhiyun *
12*4882a593Smuzhiyun * intended to replace:
13*4882a593Smuzhiyun *	memcpy in memcpy.c and
14*4882a593Smuzhiyun *	memmove in memmove.c
15*4882a593Smuzhiyun * ... in arch/microblaze/lib
16*4882a593Smuzhiyun *
17*4882a593Smuzhiyun *
18*4882a593Smuzhiyun * assly_fastcopy.S
19*4882a593Smuzhiyun *
20*4882a593Smuzhiyun * Attempt at quicker memcpy and memmove for MicroBlaze
21*4882a593Smuzhiyun *	Input :	Operand1 in Reg r5 - destination address
22*4882a593Smuzhiyun *		Operand2 in Reg r6 - source address
23*4882a593Smuzhiyun *		Operand3 in Reg r7 - number of bytes to transfer
24*4882a593Smuzhiyun *	Output: Result in Reg r3 - starting destination address
25*4882a593Smuzhiyun *
26*4882a593Smuzhiyun *
27*4882a593Smuzhiyun * Explanation:
28*4882a593Smuzhiyun *	Perform (possibly unaligned) copy of a block of memory
29*4882a593Smuzhiyun *	between mem locations with size of xfer spec'd in bytes
30*4882a593Smuzhiyun */
31*4882a593Smuzhiyun
32*4882a593Smuzhiyun#include <linux/linkage.h>
33*4882a593Smuzhiyun	.text
/*
 * memcpy - copy c bytes from s to d, walking upward through memory.
 *
 *	In:	r5 = d (destination), r6 = s (source), r7 = c (byte count)
 *	Out:	r3 = the value d had on entry
 *	Written as scratch: r4 (n), r8 (as), r9..r12 (t1, t2/offset, t3/h, t4/v);
 *	r5, r6 and r7 are consumed (advanced / counted down) by the copy.
 *	Returns through a_done with "rtsd r15, 8".
 *
 * Phases, in order:
 *	1. c < 4: go straight to the byte loop at a_xfer_end.
 *	2. Copy up to 3 leading bytes so that d becomes word aligned.
 *	3. While at least 32 bytes remain, copy whole 32-byte blocks.
 *	4. While at least 4 bytes remain, copy whole words.
 *	5. Copy whatever is left one byte at a time.
 *
 * In phases 3 and 4 an unaligned s is handled by loading only aligned words
 * from as = s & ~3 and assembling every stored word from two neighbouring
 * loads with bslli/bsrli/or.  The shift directions used put the higher-order
 * bytes of a register at the lower addresses, i.e. they are written for a
 * big-endian memory layout.
 *
 * The labels fast_memcpy_ascending and a_done are also branch targets of
 * memmove further down in this file.
 */
34*4882a593Smuzhiyun	.globl	memcpy
35*4882a593Smuzhiyun	.type  memcpy, @function
36*4882a593Smuzhiyun	.ent	memcpy
37*4882a593Smuzhiyun
38*4882a593Smuzhiyunmemcpy:
39*4882a593Smuzhiyunfast_memcpy_ascending:
40*4882a593Smuzhiyun	/* move d to return register as value of function */
41*4882a593Smuzhiyun	addi	r3, r5, 0
42*4882a593Smuzhiyun
43*4882a593Smuzhiyun	addi	r4, r0, 4	/* n = 4 */
44*4882a593Smuzhiyun	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
45*4882a593Smuzhiyun	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */
46*4882a593Smuzhiyun
47*4882a593Smuzhiyun	/* transfer first 0~3 bytes to get aligned dest address */
48*4882a593Smuzhiyun	andi	r4, r5, 3		/* n = d & 3 */
49*4882a593Smuzhiyun	/* if zero, destination already aligned */
50*4882a593Smuzhiyun	beqi	r4, a_dalign_done
51*4882a593Smuzhiyun	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
52*4882a593Smuzhiyun	rsubi	r4, r4, 4
53*4882a593Smuzhiyun	rsub	r7, r4, r7		/* c = c - n adjust c */
54*4882a593Smuzhiyun
/* byte loop: copies the n leading bytes counted above, one per pass */
55*4882a593Smuzhiyuna_xfer_first_loop:
56*4882a593Smuzhiyun	/* if no bytes left to transfer, transfer the bulk */
57*4882a593Smuzhiyun	beqi	r4, a_dalign_done
58*4882a593Smuzhiyun	lbui	r11, r6, 0		/* h = *s */
59*4882a593Smuzhiyun	sbi	r11, r5, 0		/* *d = h */
60*4882a593Smuzhiyun	addi	r6, r6, 1		/* s++ */
61*4882a593Smuzhiyun	addi	r5, r5, 1		/* d++ */
62*4882a593Smuzhiyun	brid	a_xfer_first_loop	/* loop */
63*4882a593Smuzhiyun	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */
64*4882a593Smuzhiyun
/* d is word aligned from here on; decide whether a 32-byte block fits */
65*4882a593Smuzhiyuna_dalign_done:
66*4882a593Smuzhiyun	addi	r4, r0, 32		/* n = 32 */
67*4882a593Smuzhiyun	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
68*4882a593Smuzhiyun	/* if n < 0, less than one block to transfer */
69*4882a593Smuzhiyun	blti	r4, a_block_done
70*4882a593Smuzhiyun
/* n = bytes to move as whole blocks; c keeps only the remainder (c & 31) */
71*4882a593Smuzhiyuna_block_xfer:
72*4882a593Smuzhiyun	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
73*4882a593Smuzhiyun	rsub	r7, r4, r7		/* c = c - n */
74*4882a593Smuzhiyun
75*4882a593Smuzhiyun	andi	r9, r6, 3		/* t1 = s & 3 */
76*4882a593Smuzhiyun	/* if temp != 0, unaligned transfers needed */
77*4882a593Smuzhiyun	bnei	r9, a_block_unaligned
78*4882a593Smuzhiyun
/* s and d both word aligned: 32 bytes per pass, as two groups of 4 words */
79*4882a593Smuzhiyuna_block_aligned:
80*4882a593Smuzhiyun	lwi	r9, r6, 0		/* t1 = *(s + 0) */
81*4882a593Smuzhiyun	lwi	r10, r6, 4		/* t2 = *(s + 4) */
82*4882a593Smuzhiyun	lwi	r11, r6, 8		/* t3 = *(s + 8) */
83*4882a593Smuzhiyun	lwi	r12, r6, 12		/* t4 = *(s + 12) */
84*4882a593Smuzhiyun	swi	r9, r5, 0		/* *(d + 0) = t1 */
85*4882a593Smuzhiyun	swi	r10, r5, 4		/* *(d + 4) = t2 */
86*4882a593Smuzhiyun	swi	r11, r5, 8		/* *(d + 8) = t3 */
87*4882a593Smuzhiyun	swi	r12, r5, 12		/* *(d + 12) = t4 */
88*4882a593Smuzhiyun	lwi	r9, r6, 16		/* t1 = *(s + 16) */
89*4882a593Smuzhiyun	lwi	r10, r6, 20		/* t2 = *(s + 20) */
90*4882a593Smuzhiyun	lwi	r11, r6, 24		/* t3 = *(s + 24) */
91*4882a593Smuzhiyun	lwi	r12, r6, 28		/* t4 = *(s + 28) */
92*4882a593Smuzhiyun	swi	r9, r5, 16		/* *(d + 16) = t1 */
93*4882a593Smuzhiyun	swi	r10, r5, 20		/* *(d + 20) = t2 */
94*4882a593Smuzhiyun	swi	r11, r5, 24		/* *(d + 24) = t3 */
95*4882a593Smuzhiyun	swi	r12, r5, 28		/* *(d + 28) = t4 */
96*4882a593Smuzhiyun	addi	r6, r6, 32		/* s = s + 32 */
97*4882a593Smuzhiyun	addi	r4, r4, -32		/* n = n - 32 */
98*4882a593Smuzhiyun	bneid	r4, a_block_aligned	/* while (n) loop */
99*4882a593Smuzhiyun	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
100*4882a593Smuzhiyun	bri	a_block_done
101*4882a593Smuzhiyun
/*
 * s not word aligned.  s itself is advanced past all blocks right away; the
 * three loops below walk the source through as (r8) only.  h (r11) carries
 * the not-yet-stored bytes of the previously loaded word into the next store.
 */
102*4882a593Smuzhiyuna_block_unaligned:
103*4882a593Smuzhiyun	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
104*4882a593Smuzhiyun	add	r6, r6, r4		/* s = s + n */
105*4882a593Smuzhiyun	lwi	r11, r8, 0		/* h = *(as + 0) */
106*4882a593Smuzhiyun
107*4882a593Smuzhiyun	addi	r9, r9, -1
108*4882a593Smuzhiyun	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
109*4882a593Smuzhiyun	addi	r9, r9, -1
110*4882a593Smuzhiyun	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */
111*4882a593Smuzhiyun
/* fall-through case: t1 was 3 => 3 byte offset (1 byte kept from each word) */
112*4882a593Smuzhiyuna_block_u3:
113*4882a593Smuzhiyun	bslli	r11, r11, 24	/* h = h << 24 */
114*4882a593Smuzhiyuna_bu3_loop:
115*4882a593Smuzhiyun	lwi	r12, r8, 4	/* v = *(as + 4) */
116*4882a593Smuzhiyun	bsrli	r9, r12, 8	/* t1 = v >> 8 */
117*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
118*4882a593Smuzhiyun	swi	r9, r5, 0	/* *(d + 0) = t1 */
119*4882a593Smuzhiyun	bslli	r11, r12, 24	/* h = v << 24 */
120*4882a593Smuzhiyun	lwi	r12, r8, 8	/* v = *(as + 8) */
121*4882a593Smuzhiyun	bsrli	r9, r12, 8	/* t1 = v >> 8 */
122*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
123*4882a593Smuzhiyun	swi	r9, r5, 4	/* *(d + 4) = t1 */
124*4882a593Smuzhiyun	bslli	r11, r12, 24	/* h = v << 24 */
125*4882a593Smuzhiyun	lwi	r12, r8, 12	/* v = *(as + 12) */
126*4882a593Smuzhiyun	bsrli	r9, r12, 8	/* t1 = v >> 8 */
127*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
128*4882a593Smuzhiyun	swi	r9, r5, 8	/* *(d + 8) = t1 */
129*4882a593Smuzhiyun	bslli	r11, r12, 24	/* h = v << 24 */
130*4882a593Smuzhiyun	lwi	r12, r8, 16	/* v = *(as + 16) */
131*4882a593Smuzhiyun	bsrli	r9, r12, 8	/* t1 = v >> 8 */
132*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
133*4882a593Smuzhiyun	swi	r9, r5, 12	/* *(d + 12) = t1 */
134*4882a593Smuzhiyun	bslli	r11, r12, 24	/* h = v << 24 */
135*4882a593Smuzhiyun	lwi	r12, r8, 20	/* v = *(as + 20) */
136*4882a593Smuzhiyun	bsrli	r9, r12, 8	/* t1 = v >> 8 */
137*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
138*4882a593Smuzhiyun	swi	r9, r5, 16	/* *(d + 16) = t1 */
139*4882a593Smuzhiyun	bslli	r11, r12, 24	/* h = v << 24 */
140*4882a593Smuzhiyun	lwi	r12, r8, 24	/* v = *(as + 24) */
141*4882a593Smuzhiyun	bsrli	r9, r12, 8	/* t1 = v >> 8 */
142*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
143*4882a593Smuzhiyun	swi	r9, r5, 20	/* *(d + 20) = t1 */
144*4882a593Smuzhiyun	bslli	r11, r12, 24	/* h = v << 24 */
145*4882a593Smuzhiyun	lwi	r12, r8, 28	/* v = *(as + 28) */
146*4882a593Smuzhiyun	bsrli	r9, r12, 8	/* t1 = v >> 8 */
147*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
148*4882a593Smuzhiyun	swi	r9, r5, 24	/* *(d + 24) = t1 */
149*4882a593Smuzhiyun	bslli	r11, r12, 24	/* h = v << 24 */
150*4882a593Smuzhiyun	lwi	r12, r8, 32	/* v = *(as + 32) */
151*4882a593Smuzhiyun	bsrli	r9, r12, 8	/* t1 = v >> 8 */
152*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
153*4882a593Smuzhiyun	swi	r9, r5, 28	/* *(d + 28) = t1 */
154*4882a593Smuzhiyun	bslli	r11, r12, 24	/* h = v << 24 */
155*4882a593Smuzhiyun	addi	r8, r8, 32	/* as = as + 32 */
156*4882a593Smuzhiyun	addi	r4, r4, -32	/* n = n - 32 */
157*4882a593Smuzhiyun	bneid	r4, a_bu3_loop	/* while (n) loop */
158*4882a593Smuzhiyun	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
159*4882a593Smuzhiyun	bri	a_block_done
160*4882a593Smuzhiyun
/* 1 byte offset: 3 bytes kept from each loaded word, 1 taken from the next */
161*4882a593Smuzhiyuna_block_u1:
162*4882a593Smuzhiyun	bslli	r11, r11, 8	/* h = h << 8 */
163*4882a593Smuzhiyuna_bu1_loop:
164*4882a593Smuzhiyun	lwi	r12, r8, 4	/* v = *(as + 4) */
165*4882a593Smuzhiyun	bsrli	r9, r12, 24	/* t1 = v >> 24 */
166*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
167*4882a593Smuzhiyun	swi	r9, r5, 0	/* *(d + 0) = t1 */
168*4882a593Smuzhiyun	bslli	r11, r12, 8	/* h = v << 8 */
169*4882a593Smuzhiyun	lwi	r12, r8, 8	/* v = *(as + 8) */
170*4882a593Smuzhiyun	bsrli	r9, r12, 24	/* t1 = v >> 24 */
171*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
172*4882a593Smuzhiyun	swi	r9, r5, 4	/* *(d + 4) = t1 */
173*4882a593Smuzhiyun	bslli	r11, r12, 8	/* h = v << 8 */
174*4882a593Smuzhiyun	lwi	r12, r8, 12	/* v = *(as + 12) */
175*4882a593Smuzhiyun	bsrli	r9, r12, 24	/* t1 = v >> 24 */
176*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
177*4882a593Smuzhiyun	swi	r9, r5, 8	/* *(d + 8) = t1 */
178*4882a593Smuzhiyun	bslli	r11, r12, 8	/* h = v << 8 */
179*4882a593Smuzhiyun	lwi	r12, r8, 16	/* v = *(as + 16) */
180*4882a593Smuzhiyun	bsrli	r9, r12, 24	/* t1 = v >> 24 */
181*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
182*4882a593Smuzhiyun	swi	r9, r5, 12	/* *(d + 12) = t1 */
183*4882a593Smuzhiyun	bslli	r11, r12, 8	/* h = v << 8 */
184*4882a593Smuzhiyun	lwi	r12, r8, 20	/* v = *(as + 20) */
185*4882a593Smuzhiyun	bsrli	r9, r12, 24	/* t1 = v >> 24 */
186*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
187*4882a593Smuzhiyun	swi	r9, r5, 16	/* *(d + 16) = t1 */
188*4882a593Smuzhiyun	bslli	r11, r12, 8	/* h = v << 8 */
189*4882a593Smuzhiyun	lwi	r12, r8, 24	/* v = *(as + 24) */
190*4882a593Smuzhiyun	bsrli	r9, r12, 24	/* t1 = v >> 24 */
191*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
192*4882a593Smuzhiyun	swi	r9, r5, 20	/* *(d + 20) = t1 */
193*4882a593Smuzhiyun	bslli	r11, r12, 8	/* h = v << 8 */
194*4882a593Smuzhiyun	lwi	r12, r8, 28	/* v = *(as + 28) */
195*4882a593Smuzhiyun	bsrli	r9, r12, 24	/* t1 = v >> 24 */
196*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
197*4882a593Smuzhiyun	swi	r9, r5, 24	/* *(d + 24) = t1 */
198*4882a593Smuzhiyun	bslli	r11, r12, 8	/* h = v << 8 */
199*4882a593Smuzhiyun	lwi	r12, r8, 32	/* v = *(as + 32) */
200*4882a593Smuzhiyun	bsrli	r9, r12, 24	/* t1 = v >> 24 */
201*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
202*4882a593Smuzhiyun	swi	r9, r5, 28	/* *(d + 28) = t1 */
203*4882a593Smuzhiyun	bslli	r11, r12, 8	/* h = v << 8 */
204*4882a593Smuzhiyun	addi	r8, r8, 32	/* as = as + 32 */
205*4882a593Smuzhiyun	addi	r4, r4, -32	/* n = n - 32 */
206*4882a593Smuzhiyun	bneid	r4, a_bu1_loop	/* while (n) loop */
207*4882a593Smuzhiyun	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
208*4882a593Smuzhiyun	bri	a_block_done
209*4882a593Smuzhiyun
/* 2 byte offset: 2 bytes kept from each loaded word, 2 taken from the next */
210*4882a593Smuzhiyuna_block_u2:
211*4882a593Smuzhiyun	bslli	r11, r11, 16	/* h = h << 16 */
212*4882a593Smuzhiyuna_bu2_loop:
213*4882a593Smuzhiyun	lwi	r12, r8, 4	/* v = *(as + 4) */
214*4882a593Smuzhiyun	bsrli	r9, r12, 16	/* t1 = v >> 16 */
215*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
216*4882a593Smuzhiyun	swi	r9, r5, 0	/* *(d + 0) = t1 */
217*4882a593Smuzhiyun	bslli	r11, r12, 16	/* h = v << 16 */
218*4882a593Smuzhiyun	lwi	r12, r8, 8	/* v = *(as + 8) */
219*4882a593Smuzhiyun	bsrli	r9, r12, 16	/* t1 = v >> 16 */
220*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
221*4882a593Smuzhiyun	swi	r9, r5, 4	/* *(d + 4) = t1 */
222*4882a593Smuzhiyun	bslli	r11, r12, 16	/* h = v << 16 */
223*4882a593Smuzhiyun	lwi	r12, r8, 12	/* v = *(as + 12) */
224*4882a593Smuzhiyun	bsrli	r9, r12, 16	/* t1 = v >> 16 */
225*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
226*4882a593Smuzhiyun	swi	r9, r5, 8	/* *(d + 8) = t1 */
227*4882a593Smuzhiyun	bslli	r11, r12, 16	/* h = v << 16 */
228*4882a593Smuzhiyun	lwi	r12, r8, 16	/* v = *(as + 16) */
229*4882a593Smuzhiyun	bsrli	r9, r12, 16	/* t1 = v >> 16 */
230*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
231*4882a593Smuzhiyun	swi	r9, r5, 12	/* *(d + 12) = t1 */
232*4882a593Smuzhiyun	bslli	r11, r12, 16	/* h = v << 16 */
233*4882a593Smuzhiyun	lwi	r12, r8, 20	/* v = *(as + 20) */
234*4882a593Smuzhiyun	bsrli	r9, r12, 16	/* t1 = v >> 16 */
235*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
236*4882a593Smuzhiyun	swi	r9, r5, 16	/* *(d + 16) = t1 */
237*4882a593Smuzhiyun	bslli	r11, r12, 16	/* h = v << 16 */
238*4882a593Smuzhiyun	lwi	r12, r8, 24	/* v = *(as + 24) */
239*4882a593Smuzhiyun	bsrli	r9, r12, 16	/* t1 = v >> 16 */
240*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
241*4882a593Smuzhiyun	swi	r9, r5, 20	/* *(d + 20) = t1 */
242*4882a593Smuzhiyun	bslli	r11, r12, 16	/* h = v << 16 */
243*4882a593Smuzhiyun	lwi	r12, r8, 28	/* v = *(as + 28) */
244*4882a593Smuzhiyun	bsrli	r9, r12, 16	/* t1 = v >> 16 */
245*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
246*4882a593Smuzhiyun	swi	r9, r5, 24	/* *(d + 24) = t1 */
247*4882a593Smuzhiyun	bslli	r11, r12, 16	/* h = v << 16 */
248*4882a593Smuzhiyun	lwi	r12, r8, 32	/* v = *(as + 32) */
249*4882a593Smuzhiyun	bsrli	r9, r12, 16	/* t1 = v >> 16 */
250*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
251*4882a593Smuzhiyun	swi	r9, r5, 28	/* *(d + 28) = t1 */
252*4882a593Smuzhiyun	bslli	r11, r12, 16	/* h = v << 16 */
253*4882a593Smuzhiyun	addi	r8, r8, 32	/* as = as + 32 */
254*4882a593Smuzhiyun	addi	r4, r4, -32	/* n = n - 32 */
255*4882a593Smuzhiyun	bneid	r4, a_bu2_loop	/* while (n) loop */
256*4882a593Smuzhiyun	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
257*4882a593Smuzhiyun
/* block phase finished (or skipped); the u2 loop above falls through to here */
258*4882a593Smuzhiyuna_block_done:
259*4882a593Smuzhiyun	addi	r4, r0, 4	/* n = 4 */
260*4882a593Smuzhiyun	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
261*4882a593Smuzhiyun	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */
262*4882a593Smuzhiyun
/*
 * Word phase: n counts down the bytes to move as whole words, while offset
 * (r10) counts up and indexes both source and destination.  d, s and c are
 * only corrected by the final offset afterwards, at a_word_done.
 */
263*4882a593Smuzhiyuna_word_xfer:
264*4882a593Smuzhiyun	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
265*4882a593Smuzhiyun	addi	r10, r0, 0		/* offset = 0 */
266*4882a593Smuzhiyun
267*4882a593Smuzhiyun	andi	r9, r6, 3		/* t1 = s & 3 */
268*4882a593Smuzhiyun	/* if temp != 0, unaligned transfers needed */
269*4882a593Smuzhiyun	bnei	r9, a_word_unaligned
270*4882a593Smuzhiyun
271*4882a593Smuzhiyuna_word_aligned:
272*4882a593Smuzhiyun	lw	r9, r6, r10		/* t1 = *(s+offset) */
273*4882a593Smuzhiyun	sw	r9, r5, r10		/* *(d+offset) = t1 */
274*4882a593Smuzhiyun	addi	r4, r4,-4		/* n = n - 4 */
275*4882a593Smuzhiyun	bneid	r4, a_word_aligned	/* loop */
276*4882a593Smuzhiyun	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */
277*4882a593Smuzhiyun
278*4882a593Smuzhiyun	bri	a_word_done
279*4882a593Smuzhiyun
/* same merge scheme as the unaligned block loops, one word per pass */
280*4882a593Smuzhiyuna_word_unaligned:
281*4882a593Smuzhiyun	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
282*4882a593Smuzhiyun	lwi	r11, r8, 0		/* h = *(as + 0) */
283*4882a593Smuzhiyun	addi	r8, r8, 4		/* as = as + 4 */
284*4882a593Smuzhiyun
285*4882a593Smuzhiyun	addi	r9, r9, -1
286*4882a593Smuzhiyun	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
287*4882a593Smuzhiyun	addi	r9, r9, -1
288*4882a593Smuzhiyun	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */
289*4882a593Smuzhiyun
/* fall-through case: t1 was 3 => 3 byte offset */
290*4882a593Smuzhiyuna_word_u3:
291*4882a593Smuzhiyun	bslli	r11, r11, 24	/* h = h << 24 */
292*4882a593Smuzhiyuna_wu3_loop:
293*4882a593Smuzhiyun	lw	r12, r8, r10	/* v = *(as + offset) */
294*4882a593Smuzhiyun	bsrli	r9, r12, 8	/* t1 = v >> 8 */
295*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
296*4882a593Smuzhiyun	sw	r9, r5, r10	/* *(d + offset) = t1 */
297*4882a593Smuzhiyun	bslli	r11, r12, 24	/* h = v << 24 */
298*4882a593Smuzhiyun	addi	r4, r4,-4	/* n = n - 4 */
299*4882a593Smuzhiyun	bneid	r4, a_wu3_loop	/* while (n) loop */
300*4882a593Smuzhiyun	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
301*4882a593Smuzhiyun
302*4882a593Smuzhiyun	bri	a_word_done
303*4882a593Smuzhiyun
304*4882a593Smuzhiyuna_word_u1:
305*4882a593Smuzhiyun	bslli	r11, r11, 8	/* h = h << 8 */
306*4882a593Smuzhiyuna_wu1_loop:
307*4882a593Smuzhiyun	lw	r12, r8, r10	/* v = *(as + offset) */
308*4882a593Smuzhiyun	bsrli	r9, r12, 24	/* t1 = v >> 24 */
309*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
310*4882a593Smuzhiyun	sw	r9, r5, r10	/* *(d + offset) = t1 */
311*4882a593Smuzhiyun	bslli	r11, r12, 8	/* h = v << 8 */
312*4882a593Smuzhiyun	addi	r4, r4,-4	/* n = n - 4 */
313*4882a593Smuzhiyun	bneid	r4, a_wu1_loop	/* while (n) loop */
314*4882a593Smuzhiyun	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
315*4882a593Smuzhiyun
316*4882a593Smuzhiyun	bri	a_word_done
317*4882a593Smuzhiyun
318*4882a593Smuzhiyuna_word_u2:
319*4882a593Smuzhiyun	bslli	r11, r11, 16	/* h = h << 16 */
320*4882a593Smuzhiyuna_wu2_loop:
321*4882a593Smuzhiyun	lw	r12, r8, r10	/* v = *(as + offset) */
322*4882a593Smuzhiyun	bsrli	r9, r12, 16	/* t1 = v >> 16 */
323*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
324*4882a593Smuzhiyun	sw	r9, r5, r10	/* *(d + offset) = t1 */
325*4882a593Smuzhiyun	bslli	r11, r12, 16	/* h = v << 16 */
326*4882a593Smuzhiyun	addi	r4, r4,-4	/* n = n - 4 */
327*4882a593Smuzhiyun	bneid	r4, a_wu2_loop	/* while (n) loop */
328*4882a593Smuzhiyun	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
329*4882a593Smuzhiyun
/* fold the bytes moved by the word phase back into d, s and c */
330*4882a593Smuzhiyuna_word_done:
331*4882a593Smuzhiyun	add	r5, r5, r10	/* d = d + offset */
332*4882a593Smuzhiyun	add	r6, r6, r10	/* s = s + offset */
333*4882a593Smuzhiyun	rsub	r7, r10, r7	/* c = c - offset */
334*4882a593Smuzhiyun
/* tail: copy the remaining c bytes one at a time */
335*4882a593Smuzhiyuna_xfer_end:
336*4882a593Smuzhiyuna_xfer_end_loop:
337*4882a593Smuzhiyun	beqi	r7, a_done		/* while (c) */
338*4882a593Smuzhiyun	lbui	r9, r6, 0		/* t1 = *s */
339*4882a593Smuzhiyun	addi	r6, r6, 1		/* s++ */
340*4882a593Smuzhiyun	sbi	r9, r5, 0		/* *d = t1 */
341*4882a593Smuzhiyun	addi	r7, r7, -1		/* c-- */
342*4882a593Smuzhiyun	brid	a_xfer_end_loop		/* loop */
343*4882a593Smuzhiyun	addi	r5, r5, 1		/* d++ (IN DELAY SLOT) */
344*4882a593Smuzhiyun
/* common return point; memmove's descending byte loop also exits here */
345*4882a593Smuzhiyuna_done:
346*4882a593Smuzhiyun	rtsd	r15, 8
347*4882a593Smuzhiyun	nop
348*4882a593Smuzhiyun
349*4882a593Smuzhiyun.size  memcpy, . - memcpy
350*4882a593Smuzhiyun.end memcpy
351*4882a593Smuzhiyun/*----------------------------------------------------------------------------*/
/*
 * memmove - copy c bytes from s to d, choosing the copy direction from the
 * relative position of the two pointers.
 *
 *	In:	r5 = d (destination), r6 = s (source), r7 = c (byte count)
 *	Out:	r3 = the value d had on entry
 *	Written as scratch: r4 (n), r8 (as), r9..r12 (t1..t4 / h / v);
 *	r5, r6 and r7 are consumed by the copy.
 *
 * The first two instructions compare d with s.  When the result is not
 * negative (s >= d in the unsigned compare) control jumps into memcpy at
 * fast_memcpy_ascending; otherwise the descending copy below runs.
 *
 * The descending copy first moves d and s to one past the end of their
 * buffers and then works downward through the same phases as the ascending
 * copy: up to 3 bytes until d is word aligned, 32-byte blocks, single words,
 * remaining bytes.  An unaligned s is again handled with aligned loads from
 * as = s & ~3 merged by bslli/bsrli/or; the shift directions are written for
 * a big-endian memory layout.  The routine returns through a_done, the
 * "rtsd r15, 8" at the end of memcpy.
 */
352*4882a593Smuzhiyun	.globl	memmove
353*4882a593Smuzhiyun	.type  memmove, @function
354*4882a593Smuzhiyun	.ent	memmove
355*4882a593Smuzhiyun
356*4882a593Smuzhiyunmemmove:
357*4882a593Smuzhiyun	cmpu	r4, r5, r6	/* n = s - d */
358*4882a593Smuzhiyun	bgei	r4,fast_memcpy_ascending
359*4882a593Smuzhiyun
360*4882a593Smuzhiyunfast_memcpy_descending:
361*4882a593Smuzhiyun	/* move d to return register as value of function */
362*4882a593Smuzhiyun	addi	r3, r5, 0
363*4882a593Smuzhiyun
	/* point d and s one past the last byte; every copy below pre-decrements */
364*4882a593Smuzhiyun	add	r5, r5, r7	/* d = d + c */
365*4882a593Smuzhiyun	add	r6, r6, r7	/* s = s + c */
366*4882a593Smuzhiyun
367*4882a593Smuzhiyun	addi	r4, r0, 4	/* n = 4 */
368*4882a593Smuzhiyun	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
369*4882a593Smuzhiyun	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */
370*4882a593Smuzhiyun
371*4882a593Smuzhiyun	/* transfer first 0~3 bytes to get aligned dest address */
372*4882a593Smuzhiyun	andi	r4, r5, 3		/* n = d & 3 */
373*4882a593Smuzhiyun	/* if zero, destination already aligned */
374*4882a593Smuzhiyun	beqi	r4,d_dalign_done
375*4882a593Smuzhiyun	rsub	r7, r4, r7		/* c = c - n adjust c */
376*4882a593Smuzhiyun
/* byte loop: copies the n trailing bytes counted above, one per pass */
377*4882a593Smuzhiyund_xfer_first_loop:
378*4882a593Smuzhiyun	/* if no bytes left to transfer, transfer the bulk */
379*4882a593Smuzhiyun	beqi	r4,d_dalign_done
380*4882a593Smuzhiyun	addi	r6, r6, -1		/* s-- */
381*4882a593Smuzhiyun	addi	r5, r5, -1		/* d-- */
382*4882a593Smuzhiyun	lbui	r11, r6, 0		/* h = *s */
383*4882a593Smuzhiyun	sbi	r11, r5, 0		/* *d = h */
384*4882a593Smuzhiyun	brid	d_xfer_first_loop	/* loop */
385*4882a593Smuzhiyun	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */
386*4882a593Smuzhiyun
/* d is word aligned from here on; decide whether a 32-byte block fits */
387*4882a593Smuzhiyund_dalign_done:
388*4882a593Smuzhiyun	addi	r4, r0, 32	/* n = 32 */
389*4882a593Smuzhiyun	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
390*4882a593Smuzhiyun	/* if n < 0, less than one block to transfer */
391*4882a593Smuzhiyun	blti	r4, d_block_done
392*4882a593Smuzhiyun
/* n = bytes to move as whole blocks; c keeps only the remainder (c & 31) */
393*4882a593Smuzhiyund_block_xfer:
394*4882a593Smuzhiyun	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
395*4882a593Smuzhiyun	rsub	r7, r4, r7		/* c = c - n */
396*4882a593Smuzhiyun
397*4882a593Smuzhiyun	andi	r9, r6, 3		/* t1 = s & 3 */
398*4882a593Smuzhiyun	/* if temp != 0, unaligned transfers needed */
399*4882a593Smuzhiyun	bnei	r9, d_block_unaligned
400*4882a593Smuzhiyun
/* s and d both word aligned: 32 bytes per pass, highest word first */
401*4882a593Smuzhiyund_block_aligned:
402*4882a593Smuzhiyun	addi	r6, r6, -32		/* s = s - 32 */
403*4882a593Smuzhiyun	addi	r5, r5, -32		/* d = d - 32 */
404*4882a593Smuzhiyun	lwi	r9, r6, 28		/* t1 = *(s + 28) */
405*4882a593Smuzhiyun	lwi	r10, r6, 24		/* t2 = *(s + 24) */
406*4882a593Smuzhiyun	lwi	r11, r6, 20		/* t3 = *(s + 20) */
407*4882a593Smuzhiyun	lwi	r12, r6, 16		/* t4 = *(s + 16) */
408*4882a593Smuzhiyun	swi	r9, r5, 28		/* *(d + 28) = t1 */
409*4882a593Smuzhiyun	swi	r10, r5, 24		/* *(d + 24) = t2 */
410*4882a593Smuzhiyun	swi	r11, r5, 20		/* *(d + 20) = t3 */
411*4882a593Smuzhiyun	swi	r12, r5, 16		/* *(d + 16) = t4 */
412*4882a593Smuzhiyun	lwi	r9, r6, 12		/* t1 = *(s + 12) */
413*4882a593Smuzhiyun	lwi	r10, r6, 8		/* t2 = *(s + 8) */
414*4882a593Smuzhiyun	lwi	r11, r6, 4		/* t3 = *(s + 4) */
415*4882a593Smuzhiyun	lwi	r12, r6, 0		/* t4 = *(s + 0) */
416*4882a593Smuzhiyun	swi	r9, r5, 12		/* *(d + 12) = t1 */
417*4882a593Smuzhiyun	swi	r10, r5, 8		/* *(d + 8) = t2 */
418*4882a593Smuzhiyun	swi	r11, r5, 4		/* *(d + 4) = t3 */
419*4882a593Smuzhiyun	addi	r4, r4, -32		/* n = n - 32 */
420*4882a593Smuzhiyun	bneid	r4, d_block_aligned	/* while (n) loop */
421*4882a593Smuzhiyun	swi	r12, r5, 0		/* *(d + 0) = t4 (IN DELAY SLOT) */
422*4882a593Smuzhiyun	bri	d_block_done
423*4882a593Smuzhiyun
/*
 * s not word aligned.  s itself is moved below all blocks right away; the
 * three loops below walk the source through as (r8) only.  h (r11) carries
 * the not-yet-stored bytes of the previously loaded word into the next store.
 */
424*4882a593Smuzhiyund_block_unaligned:
425*4882a593Smuzhiyun	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
426*4882a593Smuzhiyun	rsub	r6, r4, r6		/* s = s - n */
427*4882a593Smuzhiyun	lwi	r11, r8, 0		/* h = *(as + 0) */
428*4882a593Smuzhiyun
429*4882a593Smuzhiyun	addi	r9, r9, -1
430*4882a593Smuzhiyun	beqi	r9,d_block_u1		/* t1 was 1 => 1 byte offset */
431*4882a593Smuzhiyun	addi	r9, r9, -1
432*4882a593Smuzhiyun	beqi	r9,d_block_u2		/* t1 was 2 => 2 byte offset */
433*4882a593Smuzhiyun
/* fall-through case: t1 was 3 => 3 byte offset */
434*4882a593Smuzhiyund_block_u3:
435*4882a593Smuzhiyun	bsrli	r11, r11, 8	/* h = h >> 8 */
436*4882a593Smuzhiyund_bu3_loop:
437*4882a593Smuzhiyun	addi	r8, r8, -32	/* as = as - 32 */
438*4882a593Smuzhiyun	addi	r5, r5, -32	/* d = d - 32 */
439*4882a593Smuzhiyun	lwi	r12, r8, 28	/* v = *(as + 28) */
440*4882a593Smuzhiyun	bslli	r9, r12, 24	/* t1 = v << 24 */
441*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
442*4882a593Smuzhiyun	swi	r9, r5, 28	/* *(d + 28) = t1 */
443*4882a593Smuzhiyun	bsrli	r11, r12, 8	/* h = v >> 8 */
444*4882a593Smuzhiyun	lwi	r12, r8, 24	/* v = *(as + 24) */
445*4882a593Smuzhiyun	bslli	r9, r12, 24	/* t1 = v << 24 */
446*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
447*4882a593Smuzhiyun	swi	r9, r5, 24	/* *(d + 24) = t1 */
448*4882a593Smuzhiyun	bsrli	r11, r12, 8	/* h = v >> 8 */
449*4882a593Smuzhiyun	lwi	r12, r8, 20	/* v = *(as + 20) */
450*4882a593Smuzhiyun	bslli	r9, r12, 24	/* t1 = v << 24 */
451*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
452*4882a593Smuzhiyun	swi	r9, r5, 20	/* *(d + 20) = t1 */
453*4882a593Smuzhiyun	bsrli	r11, r12, 8	/* h = v >> 8 */
454*4882a593Smuzhiyun	lwi	r12, r8, 16	/* v = *(as + 16) */
455*4882a593Smuzhiyun	bslli	r9, r12, 24	/* t1 = v << 24 */
456*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
457*4882a593Smuzhiyun	swi	r9, r5, 16	/* *(d + 16) = t1 */
458*4882a593Smuzhiyun	bsrli	r11, r12, 8	/* h = v >> 8 */
459*4882a593Smuzhiyun	lwi	r12, r8, 12	/* v = *(as + 12) */
460*4882a593Smuzhiyun	bslli	r9, r12, 24	/* t1 = v << 24 */
461*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
462*4882a593Smuzhiyun	swi	r9, r5, 12	/* *(d + 12) = t1 */
463*4882a593Smuzhiyun	bsrli	r11, r12, 8	/* h = v >> 8 */
464*4882a593Smuzhiyun	lwi	r12, r8, 8	/* v = *(as + 8) */
465*4882a593Smuzhiyun	bslli	r9, r12, 24	/* t1 = v << 24 */
466*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
467*4882a593Smuzhiyun	swi	r9, r5, 8	/* *(d + 8) = t1 */
468*4882a593Smuzhiyun	bsrli	r11, r12, 8	/* h = v >> 8 */
469*4882a593Smuzhiyun	lwi	r12, r8, 4	/* v = *(as + 4) */
470*4882a593Smuzhiyun	bslli	r9, r12, 24	/* t1 = v << 24 */
471*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
472*4882a593Smuzhiyun	swi	r9, r5, 4	/* *(d + 4) = t1 */
473*4882a593Smuzhiyun	bsrli	r11, r12, 8	/* h = v >> 8 */
474*4882a593Smuzhiyun	lwi	r12, r8, 0	/* v = *(as + 0) */
475*4882a593Smuzhiyun	bslli	r9, r12, 24	/* t1 = v << 24 */
476*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
477*4882a593Smuzhiyun	swi	r9, r5, 0	/* *(d + 0) = t1 */
478*4882a593Smuzhiyun	addi	r4, r4, -32	/* n = n - 32 */
479*4882a593Smuzhiyun	bneid	r4, d_bu3_loop	/* while (n) loop */
480*4882a593Smuzhiyun	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
481*4882a593Smuzhiyun	bri	d_block_done
482*4882a593Smuzhiyun
483*4882a593Smuzhiyund_block_u1:
484*4882a593Smuzhiyun	bsrli	r11, r11, 24	/* h = h >> 24 */
485*4882a593Smuzhiyund_bu1_loop:
486*4882a593Smuzhiyun	addi	r8, r8, -32	/* as = as - 32 */
487*4882a593Smuzhiyun	addi	r5, r5, -32	/* d = d - 32 */
488*4882a593Smuzhiyun	lwi	r12, r8, 28	/* v = *(as + 28) */
489*4882a593Smuzhiyun	bslli	r9, r12, 8	/* t1 = v << 8 */
490*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
491*4882a593Smuzhiyun	swi	r9, r5, 28	/* *(d + 28) = t1 */
492*4882a593Smuzhiyun	bsrli	r11, r12, 24	/* h = v >> 24 */
493*4882a593Smuzhiyun	lwi	r12, r8, 24	/* v = *(as + 24) */
494*4882a593Smuzhiyun	bslli	r9, r12, 8	/* t1 = v << 8 */
495*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
496*4882a593Smuzhiyun	swi	r9, r5, 24	/* *(d + 24) = t1 */
497*4882a593Smuzhiyun	bsrli	r11, r12, 24	/* h = v >> 24 */
498*4882a593Smuzhiyun	lwi	r12, r8, 20	/* v = *(as + 20) */
499*4882a593Smuzhiyun	bslli	r9, r12, 8	/* t1 = v << 8 */
500*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
501*4882a593Smuzhiyun	swi	r9, r5, 20	/* *(d + 20) = t1 */
502*4882a593Smuzhiyun	bsrli	r11, r12, 24	/* h = v >> 24 */
503*4882a593Smuzhiyun	lwi	r12, r8, 16	/* v = *(as + 16) */
504*4882a593Smuzhiyun	bslli	r9, r12, 8	/* t1 = v << 8 */
505*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
506*4882a593Smuzhiyun	swi	r9, r5, 16	/* *(d + 16) = t1 */
507*4882a593Smuzhiyun	bsrli	r11, r12, 24	/* h = v >> 24 */
508*4882a593Smuzhiyun	lwi	r12, r8, 12	/* v = *(as + 12) */
509*4882a593Smuzhiyun	bslli	r9, r12, 8	/* t1 = v << 8 */
510*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
511*4882a593Smuzhiyun	swi	r9, r5, 12	/* *(d + 12) = t1 */
512*4882a593Smuzhiyun	bsrli	r11, r12, 24	/* h = v >> 24 */
513*4882a593Smuzhiyun	lwi	r12, r8, 8	/* v = *(as + 8) */
514*4882a593Smuzhiyun	bslli	r9, r12, 8	/* t1 = v << 8 */
515*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
516*4882a593Smuzhiyun	swi	r9, r5, 8	/* *(d + 8) = t1 */
517*4882a593Smuzhiyun	bsrli	r11, r12, 24	/* h = v >> 24 */
518*4882a593Smuzhiyun	lwi	r12, r8, 4	/* v = *(as + 4) */
519*4882a593Smuzhiyun	bslli	r9, r12, 8	/* t1 = v << 8 */
520*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
521*4882a593Smuzhiyun	swi	r9, r5, 4	/* *(d + 4) = t1 */
522*4882a593Smuzhiyun	bsrli	r11, r12, 24	/* h = v >> 24 */
523*4882a593Smuzhiyun	lwi	r12, r8, 0	/* v = *(as + 0) */
524*4882a593Smuzhiyun	bslli	r9, r12, 8	/* t1 = v << 8 */
525*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
526*4882a593Smuzhiyun	swi	r9, r5, 0	/* *(d + 0) = t1 */
527*4882a593Smuzhiyun	addi	r4, r4, -32	/* n = n - 32 */
528*4882a593Smuzhiyun	bneid	r4, d_bu1_loop	/* while (n) loop */
529*4882a593Smuzhiyun	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
530*4882a593Smuzhiyun	bri	d_block_done
531*4882a593Smuzhiyun
532*4882a593Smuzhiyund_block_u2:
533*4882a593Smuzhiyun	bsrli	r11, r11, 16	/* h = h >> 16 */
534*4882a593Smuzhiyund_bu2_loop:
535*4882a593Smuzhiyun	addi	r8, r8, -32	/* as = as - 32 */
536*4882a593Smuzhiyun	addi	r5, r5, -32	/* d = d - 32 */
537*4882a593Smuzhiyun	lwi	r12, r8, 28	/* v = *(as + 28) */
538*4882a593Smuzhiyun	bslli	r9, r12, 16	/* t1 = v << 16 */
539*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
540*4882a593Smuzhiyun	swi	r9, r5, 28	/* *(d + 28) = t1 */
541*4882a593Smuzhiyun	bsrli	r11, r12, 16	/* h = v >> 16 */
542*4882a593Smuzhiyun	lwi	r12, r8, 24	/* v = *(as + 24) */
543*4882a593Smuzhiyun	bslli	r9, r12, 16	/* t1 = v << 16 */
544*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
545*4882a593Smuzhiyun	swi	r9, r5, 24	/* *(d + 24) = t1 */
546*4882a593Smuzhiyun	bsrli	r11, r12, 16	/* h = v >> 16 */
547*4882a593Smuzhiyun	lwi	r12, r8, 20	/* v = *(as + 20) */
548*4882a593Smuzhiyun	bslli	r9, r12, 16	/* t1 = v << 16 */
549*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
550*4882a593Smuzhiyun	swi	r9, r5, 20	/* *(d + 20) = t1 */
551*4882a593Smuzhiyun	bsrli	r11, r12, 16	/* h = v >> 16 */
552*4882a593Smuzhiyun	lwi	r12, r8, 16	/* v = *(as + 16) */
553*4882a593Smuzhiyun	bslli	r9, r12, 16	/* t1 = v << 16 */
554*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
555*4882a593Smuzhiyun	swi	r9, r5, 16	/* *(d + 16) = t1 */
556*4882a593Smuzhiyun	bsrli	r11, r12, 16	/* h = v >> 16 */
557*4882a593Smuzhiyun	lwi	r12, r8, 12	/* v = *(as + 12) */
558*4882a593Smuzhiyun	bslli	r9, r12, 16	/* t1 = v << 16 */
559*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
560*4882a593Smuzhiyun	swi	r9, r5, 12	/* *(d + 12) = t1 */
561*4882a593Smuzhiyun	bsrli	r11, r12, 16	/* h = v >> 16 */
562*4882a593Smuzhiyun	lwi	r12, r8, 8	/* v = *(as + 8) */
563*4882a593Smuzhiyun	bslli	r9, r12, 16	/* t1 = v << 16 */
564*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
565*4882a593Smuzhiyun	swi	r9, r5, 8	/* *(d + 8) = t1 */
566*4882a593Smuzhiyun	bsrli	r11, r12, 16	/* h = v >> 16 */
567*4882a593Smuzhiyun	lwi	r12, r8, 4	/* v = *(as + 4) */
568*4882a593Smuzhiyun	bslli	r9, r12, 16	/* t1 = v << 16 */
569*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
570*4882a593Smuzhiyun	swi	r9, r5, 4	/* *(d + 4) = t1 */
571*4882a593Smuzhiyun	bsrli	r11, r12, 16	/* h = v >> 16 */
572*4882a593Smuzhiyun	lwi	r12, r8, 0	/* v = *(as + 0) */
573*4882a593Smuzhiyun	bslli	r9, r12, 16	/* t1 = v << 16 */
574*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
575*4882a593Smuzhiyun	swi	r9, r5, 0	/* *(d + 0) = t1 */
576*4882a593Smuzhiyun	addi	r4, r4, -32	/* n = n - 32 */
577*4882a593Smuzhiyun	bneid	r4, d_bu2_loop	/* while (n) loop */
578*4882a593Smuzhiyun	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */
579*4882a593Smuzhiyun
/* block phase finished (or skipped); the u2 loop above falls through to here */
580*4882a593Smuzhiyund_block_done:
581*4882a593Smuzhiyun	addi	r4, r0, 4	/* n = 4 */
582*4882a593Smuzhiyun	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
583*4882a593Smuzhiyun	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */
584*4882a593Smuzhiyun
/*
 * Word phase: d, s and c are all reduced by the whole word count up front;
 * n (r4) then serves as a descending byte offset into both buffers.
 */
585*4882a593Smuzhiyund_word_xfer:
586*4882a593Smuzhiyun	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
587*4882a593Smuzhiyun	rsub	r5, r4, r5		/* d = d - n */
588*4882a593Smuzhiyun	rsub	r6, r4, r6		/* s = s - n */
589*4882a593Smuzhiyun	rsub	r7, r4, r7		/* c = c - n */
590*4882a593Smuzhiyun
591*4882a593Smuzhiyun	andi	r9, r6, 3		/* t1 = s & 3 */
592*4882a593Smuzhiyun	/* if temp != 0, unaligned transfers needed */
593*4882a593Smuzhiyun	bnei	r9, d_word_unaligned
594*4882a593Smuzhiyun
595*4882a593Smuzhiyund_word_aligned:
596*4882a593Smuzhiyun	addi	r4, r4,-4		/* n = n - 4 */
597*4882a593Smuzhiyun	lw	r9, r6, r4		/* t1 = *(s+n) */
598*4882a593Smuzhiyun	bneid	r4, d_word_aligned	/* loop */
599*4882a593Smuzhiyun	sw	r9, r5, r4		/* *(d+n) = t1 (IN DELAY SLOT) */
600*4882a593Smuzhiyun
601*4882a593Smuzhiyun	bri	d_word_done
602*4882a593Smuzhiyun
/* same merge scheme as the unaligned block loops, one word per pass */
603*4882a593Smuzhiyund_word_unaligned:
604*4882a593Smuzhiyun	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
605*4882a593Smuzhiyun	lw	r11, r8, r4		/* h = *(as + n) */
606*4882a593Smuzhiyun
607*4882a593Smuzhiyun	addi	r9, r9, -1
608*4882a593Smuzhiyun	beqi	r9,d_word_u1		/* t1 was 1 => 1 byte offset */
609*4882a593Smuzhiyun	addi	r9, r9, -1
610*4882a593Smuzhiyun	beqi	r9,d_word_u2		/* t1 was 2 => 2 byte offset */
611*4882a593Smuzhiyun
/* fall-through case: t1 was 3 => 3 byte offset */
612*4882a593Smuzhiyund_word_u3:
613*4882a593Smuzhiyun	bsrli	r11, r11, 8	/* h = h >> 8 */
614*4882a593Smuzhiyund_wu3_loop:
615*4882a593Smuzhiyun	addi	r4, r4,-4	/* n = n - 4 */
616*4882a593Smuzhiyun	lw	r12, r8, r4	/* v = *(as + n) */
617*4882a593Smuzhiyun	bslli	r9, r12, 24	/* t1 = v << 24 */
618*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
619*4882a593Smuzhiyun	sw	r9, r5, r4	/* *(d + n) = t1 */
620*4882a593Smuzhiyun	bneid	r4, d_wu3_loop	/* while (n) loop */
621*4882a593Smuzhiyun	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
622*4882a593Smuzhiyun
623*4882a593Smuzhiyun	bri	d_word_done
624*4882a593Smuzhiyun
625*4882a593Smuzhiyund_word_u1:
626*4882a593Smuzhiyun	bsrli	r11, r11, 24	/* h = h >> 24 */
627*4882a593Smuzhiyund_wu1_loop:
628*4882a593Smuzhiyun	addi	r4, r4,-4	/* n = n - 4 */
629*4882a593Smuzhiyun	lw	r12, r8, r4	/* v = *(as + n) */
630*4882a593Smuzhiyun	bslli	r9, r12, 8	/* t1 = v << 8 */
631*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
632*4882a593Smuzhiyun	sw	r9, r5, r4	/* *(d + n) = t1 */
633*4882a593Smuzhiyun	bneid	r4, d_wu1_loop	/* while (n) loop */
634*4882a593Smuzhiyun	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
635*4882a593Smuzhiyun
636*4882a593Smuzhiyun	bri	d_word_done
637*4882a593Smuzhiyun
638*4882a593Smuzhiyund_word_u2:
639*4882a593Smuzhiyun	bsrli	r11, r11, 16	/* h = h >> 16 */
640*4882a593Smuzhiyund_wu2_loop:
641*4882a593Smuzhiyun	addi	r4, r4,-4	/* n = n - 4 */
642*4882a593Smuzhiyun	lw	r12, r8, r4	/* v = *(as + n) */
643*4882a593Smuzhiyun	bslli	r9, r12, 16	/* t1 = v << 16 */
644*4882a593Smuzhiyun	or	r9, r11, r9	/* t1 = h | t1 */
645*4882a593Smuzhiyun	sw	r9, r5, r4	/* *(d + n) = t1 */
646*4882a593Smuzhiyun	bneid	r4, d_wu2_loop	/* while (n) loop */
647*4882a593Smuzhiyun	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */
648*4882a593Smuzhiyun
/* nothing to fix up here: d, s and c were already adjusted at d_word_xfer */
649*4882a593Smuzhiyund_word_done:
650*4882a593Smuzhiyun
/* tail: copy the remaining c bytes one at a time, then leave via a_done */
651*4882a593Smuzhiyund_xfer_end:
652*4882a593Smuzhiyund_xfer_end_loop:
653*4882a593Smuzhiyun	beqi	r7, a_done		/* while (c) */
654*4882a593Smuzhiyun	addi	r6, r6, -1		/* s-- */
655*4882a593Smuzhiyun	lbui	r9, r6, 0		/* t1 = *s */
656*4882a593Smuzhiyun	addi	r5, r5, -1		/* d-- */
657*4882a593Smuzhiyun	sbi	r9, r5, 0		/* *d = t1 */
658*4882a593Smuzhiyun	brid	d_xfer_end_loop		/* loop */
659*4882a593Smuzhiyun	addi	r7, r7, -1		/* c-- (IN DELAY SLOT) */
660*4882a593Smuzhiyun
/*
 * NOTE(review): nothing in this file branches to d_done, and the loop above
 * ends in an unconditional brid, so this return sequence looks unreachable;
 * the descending copy actually returns through a_done.
 */
661*4882a593Smuzhiyund_done:
662*4882a593Smuzhiyun	rtsd	r15, 8
663*4882a593Smuzhiyun	nop
664*4882a593Smuzhiyun
665*4882a593Smuzhiyun.size  memmove, . - memmove
666*4882a593Smuzhiyun.end memmove
667