xref: /OK3568_Linux_fs/kernel/arch/mips/cavium-octeon/octeon-memcpy.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/*
2*4882a593Smuzhiyun * This file is subject to the terms and conditions of the GNU General Public
3*4882a593Smuzhiyun * License.  See the file "COPYING" in the main directory of this archive
4*4882a593Smuzhiyun * for more details.
5*4882a593Smuzhiyun *
6*4882a593Smuzhiyun * Unified implementation of memcpy, memmove and the __copy_user backend.
7*4882a593Smuzhiyun *
8*4882a593Smuzhiyun * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
9*4882a593Smuzhiyun * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
10*4882a593Smuzhiyun * Copyright (C) 2002 Broadcom, Inc.
11*4882a593Smuzhiyun *   memcpy/copy_user author: Mark Vandevoorde
12*4882a593Smuzhiyun *
13*4882a593Smuzhiyun * Mnemonic names for arguments to memcpy/__copy_user
14*4882a593Smuzhiyun */
15*4882a593Smuzhiyun
16*4882a593Smuzhiyun#include <asm/asm.h>
17*4882a593Smuzhiyun#include <asm/asm-offsets.h>
18*4882a593Smuzhiyun#include <asm/export.h>
19*4882a593Smuzhiyun#include <asm/regdef.h>
20*4882a593Smuzhiyun
21*4882a593Smuzhiyun#define dst a0
22*4882a593Smuzhiyun#define src a1
23*4882a593Smuzhiyun#define len a2
24*4882a593Smuzhiyun
25*4882a593Smuzhiyun/*
26*4882a593Smuzhiyun * Spec
27*4882a593Smuzhiyun *
28*4882a593Smuzhiyun * memcpy copies len bytes from src to dst and sets v0 to dst.
29*4882a593Smuzhiyun * It assumes that
30*4882a593Smuzhiyun *   - src and dst don't overlap
31*4882a593Smuzhiyun *   - src is readable
32*4882a593Smuzhiyun *   - dst is writable
33*4882a593Smuzhiyun * memcpy uses the standard calling convention
34*4882a593Smuzhiyun *
35*4882a593Smuzhiyun * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
36*4882a593Smuzhiyun * the number of uncopied bytes due to an exception caused by a read or write.
37*4882a593Smuzhiyun * __copy_user assumes that src and dst don't overlap, and that the call is
38*4882a593Smuzhiyun * implementing one of the following:
39*4882a593Smuzhiyun *   copy_to_user
40*4882a593Smuzhiyun *     - src is readable  (no exceptions when reading src)
41*4882a593Smuzhiyun *   copy_from_user
42*4882a593Smuzhiyun *     - dst is writable  (no exceptions when writing dst)
43*4882a593Smuzhiyun * __copy_user uses a non-standard calling convention; see
44*4882a593Smuzhiyun * arch/mips/include/asm/uaccess.h
45*4882a593Smuzhiyun *
46*4882a593Smuzhiyun * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
48*4882a593Smuzhiyun * leaking information to user mode programs.
49*4882a593Smuzhiyun */
50*4882a593Smuzhiyun
51*4882a593Smuzhiyun/*
52*4882a593Smuzhiyun * Implementation
53*4882a593Smuzhiyun */
54*4882a593Smuzhiyun
55*4882a593Smuzhiyun/*
56*4882a593Smuzhiyun * The exception handler for loads requires that:
57*4882a593Smuzhiyun *  1- AT contain the address of the byte just past the end of the source
58*4882a593Smuzhiyun *     of the copy,
59*4882a593Smuzhiyun *  2- src_entry <= src < AT, and
60*4882a593Smuzhiyun *  3- (dst - src) == (dst_entry - src_entry),
61*4882a593Smuzhiyun * The _entry suffix denotes values when __copy_user was called.
62*4882a593Smuzhiyun *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
64*4882a593Smuzhiyun * (2) is met by incrementing src by the number of bytes copied
65*4882a593Smuzhiyun * (3) is met by not doing loads between a pair of increments of dst and src
66*4882a593Smuzhiyun *
67*4882a593Smuzhiyun * The exception handlers for stores adjust len (if necessary) and return.
68*4882a593Smuzhiyun * These handlers do not need to overwrite any data.
69*4882a593Smuzhiyun *
70*4882a593Smuzhiyun * For __rmemcpy and memmove an exception is always a kernel bug, therefore
71*4882a593Smuzhiyun * they're not protected.
72*4882a593Smuzhiyun */
73*4882a593Smuzhiyun
/*
 * EXC(insn, addr, handler): emit one load/store and record a kernel
 * exception-table entry for it, so a fault at that instruction jumps
 * to 'handler' instead of raising an oops.  '9:' is a local label and
 * 'PTR 9b, handler' stores the (faulting PC, fixup PC) pair in
 * the __ex_table section.
 */
#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous

/*
 * Only the 64-bit kernel can make use of 64-bit registers.  Octeon is
 * always 64-bit, so the doubleword forms are used unconditionally.
 */

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8	/* bytes per copy unit (one doubleword) */
#define LOG_NBYTES 3	/* log2(NBYTES) */

/*
 * As we are sharing the code base with the mips32 tree (which uses the
 * o32 ABI register definitions), we need to redefine the register
 * definitions from the n64 ABI register naming to the o32 ABI register
 * naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

/*
 * Select the unaligned load/store halves for this endianness: LDFIRST/
 * STFIRST touch the part of the doubleword at the lower address.
 */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)		/* offset of unit's first byte */
#define REST(unit)  (FIRST(unit)+NBYTES-1)	/* offset of unit's last byte */
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)	/* low address bits that determine alignment */
137*4882a593Smuzhiyun
	.text
	.set	noreorder	# delay slots are filled by hand below
	.set	noat		# AT carries src end address for __copy_user

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 *
 * Register usage: a0=dst, a1=src, a2=len (see #defines above);
 * t0-t3 are data/scratch.  The s_exc_pNu handlers rely on exactly
 * N*NBYTES of stores being outstanding at the faulting store.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
EXPORT_SYMBOL(memcpy)
	move	v0, dst				/* return value */
__memcpy:
FEXPORT(__copy_user)
EXPORT_SYMBOL(__copy_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	#
	# Octeon doesn't care if the destination is unaligned. The hardware
	# can fix it faster than we can special case the assembly.
	#
	pref	0, 0(src)
	sltu	t0, len, NBYTES		# Check if < 1 word
	bnez	t0, copy_bytes_checklen
	 and	t0, src, ADDRMASK	# Check if src unaligned
	bnez	t0, src_unaligned
	 sltu	t0, len, 4*NBYTES	# Check if < 4 words
	bnez	t0, less_than_4units
	 sltu	t0, len, 8*NBYTES	# Check if < 8 words
	bnez	t0, less_than_8units
	 sltu	t0, len, 16*NBYTES	# Check if < 16 words
	bnez	t0, cleanup_both_aligned
	 sltu	t0, len, 128+1		# Check if len < 129
	bnez	t0, 1f			# Skip prefetch if len is too short
	 sltu	t0, len, 256+1		# Check if len < 257
	bnez	t0, 1f			# Skip prefetch if len is too short
	 pref	0, 128(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if there is more than 128 bytes left
2:	pref	0, 256(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if we can't prefetch anymore
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 16*NBYTES	# len is debited for the whole 16-unit
					# iteration up front; store handlers
					# credit back what was not copied
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p16u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p15u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p14u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p13u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p12u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p11u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p10u)
	ADD	src, src, 16*NBYTES
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p9u)
	ADD	dst, dst, 16*NBYTES
	# src/dst were advanced early, so the second half of the iteration
	# uses negative unit offsets; a load fault here must rewind first.
EXC(	LOAD	t0, UNIT(-8)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t1, UNIT(-7)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t2, UNIT(-6)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t3, UNIT(-5)(src),	l_exc_copy_rewind16)
EXC(	STORE	t0, UNIT(-8)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(-7)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(-6)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(-4)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t1, UNIT(-3)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t2, UNIT(-2)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t3, UNIT(-1)(src),	l_exc_copy_rewind16)
EXC(	STORE	t0, UNIT(-4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(-3)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(-2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(-1)(dst),	s_exc_p1u)
	sltu	t0, len, 256+1		# See if we can prefetch more
	beqz	t0, 2b
	 sltu	t0, len, 128		# See if we can loop more time
	beqz	t0, 1b
	 nop
	#
	# Jump here if there are less than 16*NBYTES left.
	#
cleanup_both_aligned:
	beqz	len, done
	 sltu	t0, len, 8*NBYTES
	bnez	t0, less_than_8units
	 nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p1u)
	ADD	src, src, 8*NBYTES
	beqz	len, done
	 ADD	dst, dst, 8*NBYTES
	#
	# Jump here if there are less than 8*NBYTES left.
	#
less_than_8units:
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	 nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	ADD	src, src, 4*NBYTES
	beqz	len, done
	 ADD	dst, dst, 4*NBYTES
	#
	# Jump here if there are less than 4*NBYTES left. This means
	# we may need to copy up to 3 NBYTES words.
	#
less_than_4units:
	sltu	t0, len, 1*NBYTES
	bnez	t0, copy_bytes_checklen
	 nop
	#
	# 1) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	 ADD	dst, dst, NBYTES
	#
	# 2) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	 ADD	dst, dst, NBYTES
	#
	# 3) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	ADD	src, src, NBYTES
	ADD	dst, dst, NBYTES
	b copy_bytes_checklen
EXC(	 STORE	t0, -8(dst),		s_exc_p1u)	# store in delay slot;
							# dst already advanced

src_unaligned:
#define rem t8
	SRL	t0, len, LOG_NBYTES+2	 # +2 for 4 units/iter
	beqz	t0, cleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)	 # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	bne	len, rem, 1b
	 ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	 and	rem, len, NBYTES-1  # rem = len % NBYTES
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bne	len, rem, 1b
	 ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	 nop
copy_bytes:
	/* 0 < len < NBYTES  */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	 sb	t0, N(dst), s_exc_p1)

	COPY_BYTE(0)
	COPY_BYTE(1)
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
	# At most NBYTES-1 == 7 bytes can reach here, so the 7th byte
	# (offset NBYTES-2 == 6) is necessarily the last one.
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	 sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	 nop
	END(memcpy)
378*4882a593Smuzhiyun
/*
 * Load-fault fixup handlers.  On entry $28 is the thread pointer;
 * THREAD_BUADDR holds the faulting (bad) address recorded by the
 * exception path, and AT still holds src_end from __copy_user's caller
 * (see the invariants in the header comment).
 */
l_exc_copy_rewind16:
	/* Rewind src and dst by 16*NBYTES for l_exc_copy: the faulting
	 * load used negative offsets after src/dst were advanced early. */
	SUB	src, src, 16*NBYTES
	SUB	dst, dst, 16*NBYTES
l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	 ADD	dst, dst, 1
l_exc:
	/* Final load fixup: len := src_end - bad_addr = bytes not copied */
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	SUB	len, AT, t0		# len number of uncopied bytes
	jr	ra
	 nop


/*
 * Store-fault fixup handlers.  s_exc_pNu is reached when the store of
 * unit N (counting down) faulted, with len already debited for the full
 * iteration: credit back the N units that were not stored and return.
 */
#define SEXC(n)				\
s_exc_p ## n ## u:			\
	jr	ra;			\
	 ADD	len, len, n*NBYTES

SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
	/* A single-byte sb faulted; that byte was already subtracted. */
	jr	ra
	 ADD	len, len, 1
s_exc:
	jr	ra
	 nop
438*4882a593Smuzhiyun
	.align	5
/*
 * memmove(a0=dst, a1=src, a2=len): overlap-safe copy.
 * If the ranges do not overlap, tail-jump to __memcpy; otherwise fall
 * through into __rmemcpy below (its first instruction fills the delay
 * slot of the final beqz).  Returns dst in v0.
 */
LEAF(memmove)
EXPORT_SYMBOL(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	 move	v0, a0				/* return value */
	beqz	a2, r_out
	END(memmove)
451*4882a593Smuzhiyun
	/* fall through to __rmemcpy */
/*
 * __rmemcpy(a0=dst, a1=src, a2=len): byte-at-a-time copy for
 * overlapping ranges; copies backwards when src < dst, forwards
 * otherwise.  Not exception-protected (kernel pointers only; a fault
 * here is a kernel bug).  Clears a2 on return.
 */
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	 sltu	t0, a1, a0	# NB: also the delay slot of memmove's beqz
	beqz	t0, r_end_bytes_up		# src >= dst
	 nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len

	# src < dst: copy backwards, from the last byte down
r_end_bytes:
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	bnez	a2, r_end_bytes
	 SUB	a0, a0, 0x1

r_out:
	jr	ra
	 move	a2, zero

	# src >= dst: copy forwards, from the first byte up
r_end_bytes_up:
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	bnez	a2, r_end_bytes_up
	 ADD	a0, a0, 0x1

	jr	ra
	 move	a2, zero
	END(__rmemcpy)
483