xref: /OK3568_Linux_fs/kernel/arch/sh/lib/memcpy-sh4.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0 */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * "memcpy" implementation of SuperH
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * Copyright (C) 1999  Niibe Yutaka
6*4882a593Smuzhiyun * Copyright (c) 2002  STMicroelectronics Ltd
7*4882a593Smuzhiyun *   Modified from memcpy.S and micro-optimised for SH4
8*4882a593Smuzhiyun *   Stuart Menefy (stuart.menefy@st.com)
9*4882a593Smuzhiyun *
10*4882a593Smuzhiyun */
11*4882a593Smuzhiyun#include <linux/linkage.h>
12*4882a593Smuzhiyun
13*4882a593Smuzhiyun/*
14*4882a593Smuzhiyun * void *memcpy(void *dst, const void *src, size_t n);
15*4882a593Smuzhiyun *
16*4882a593Smuzhiyun * It is assumed that there is no overlap between src and dst.
17*4882a593Smuzhiyun * If there is an overlap, then the results are undefined.
18*4882a593Smuzhiyun */
19*4882a593Smuzhiyun
20*4882a593Smuzhiyun	!
21*4882a593Smuzhiyun	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
22*4882a593Smuzhiyun	!
23*4882a593Smuzhiyun
24*4882a593Smuzhiyun	! Size is 16 or greater, and may have trailing bytes
25*4882a593Smuzhiyun
26*4882a593Smuzhiyun	.balign	32
	/*
	 * .Lcase1 - backwards copy selected by the dispatch code in memcpy
	 * when bit 0 of r5 (src - dst) is set and bit 1 is clear.
	 *
	 * On entry:
	 *   r0 = end-of-destination store pointer (stores use @-r0)
	 *   r4 = dst (start of destination)
	 *   r5 = src - dst, so @(r0,r5) addresses the source for r0
	 * Scratch: r1, r2, r3, r6, r7.
	 *
	 * Each loop pass loads one source long, merges it with the previously
	 * loaded long (kept in r7) using a 24-bit and an 8-bit shift, and
	 * stores one destination long.  Leftover bytes are then copied one at
	 * a time until r0 reaches r4.
	 */
27*4882a593Smuzhiyun.Lcase1:
28*4882a593Smuzhiyun	! Read a long word and write a long word at once
29*4882a593Smuzhiyun	! At the start of each iteration, r7 contains last long load
	/*
	 * r5 = (src - dst) - 1 for the priming load into r7, then
	 * (src - dst) - 5 for the loads inside the loop.
	 * r2 = r4 + 7 is the loop bound tested with cmp/hi.
	 */
30*4882a593Smuzhiyun	add	#-1,r5		!  79 EX
31*4882a593Smuzhiyun	mov	r4,r2		!   5 MT (0 cycles latency)
32*4882a593Smuzhiyun
33*4882a593Smuzhiyun	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
34*4882a593Smuzhiyun	add	#-4,r5		!  50 EX
35*4882a593Smuzhiyun
36*4882a593Smuzhiyun	add	#7,r2		!  79 EX
37*4882a593Smuzhiyun	!
	/*
	 * In both endian variants cmp/hi sets T while r0 > r2 (unsigned,
	 * sampled before the store).  bt/s loops on T; the store in its
	 * delay slot executes on every pass, including the last one.
	 */
38*4882a593Smuzhiyun#ifdef CONFIG_CPU_LITTLE_ENDIAN
39*4882a593Smuzhiyun	! 6 cycles, 4 bytes per iteration
	/* r3 = (previous long << 24) | (new long >> 8); r7 = new long */
40*4882a593Smuzhiyun3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
41*4882a593Smuzhiyun	mov	r7, r3		!   5 MT (latency=0)	! RQPO
42*4882a593Smuzhiyun
43*4882a593Smuzhiyun	cmp/hi	r2,r0		!  57 MT
44*4882a593Smuzhiyun	shll16	r3		! 103 EX
45*4882a593Smuzhiyun
46*4882a593Smuzhiyun	mov	r1,r6		!   5 MT (latency=0)
47*4882a593Smuzhiyun	shll8	r3		! 102 EX		! Oxxx
48*4882a593Smuzhiyun
49*4882a593Smuzhiyun	shlr8	r6		! 106 EX		! xNML
50*4882a593Smuzhiyun	mov	r1, r7		!   5 MT (latency=0)
51*4882a593Smuzhiyun
52*4882a593Smuzhiyun	or	r6,r3		!  82 EX		! ONML
53*4882a593Smuzhiyun	bt/s	3b		! 109 BR
54*4882a593Smuzhiyun
55*4882a593Smuzhiyun	 mov.l	r3,@-r0		!  30 LS
56*4882a593Smuzhiyun#else
	/* r3 = (previous long >> 24) | (new long << 8); r7 = new long */
57*4882a593Smuzhiyun3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
58*4882a593Smuzhiyun	mov	r7,r3		!   5 MT (latency=0)	! OPQR
59*4882a593Smuzhiyun
60*4882a593Smuzhiyun	cmp/hi	r2,r0		!  57 MT
61*4882a593Smuzhiyun	shlr16	r3		! 107 EX
62*4882a593Smuzhiyun
63*4882a593Smuzhiyun	shlr8	r3		! 106 EX		! xxxO
64*4882a593Smuzhiyun	mov	r1,r6		!   5 MT (latency=0)
65*4882a593Smuzhiyun
66*4882a593Smuzhiyun	shll8	r6		! 102 EX		! LMNx
67*4882a593Smuzhiyun	mov	r1,r7		!   5 MT (latency=0)
68*4882a593Smuzhiyun
69*4882a593Smuzhiyun	or	r6,r3		!  82 EX		! LMNO
70*4882a593Smuzhiyun	bt/s	3b		! 109 BR
71*4882a593Smuzhiyun
72*4882a593Smuzhiyun	 mov.l	r3,@-r0		!  30 LS
73*4882a593Smuzhiyun#endif
74*4882a593Smuzhiyun	! Finally, copy a byte at once, if necessary
	/*
	 * Byte tail: r5 goes back to (src - dst) - 1 and r2 becomes r4 + 1.
	 * Return at once if r0 already equals r4; otherwise loop 8: copies
	 * one byte per pass while r0 > r4 + 1 (tested before the store in
	 * the delay slot), which ends with r0 equal to r4.
	 */
75*4882a593Smuzhiyun
76*4882a593Smuzhiyun	add	#4,r5		!  50 EX
77*4882a593Smuzhiyun	cmp/eq	r4,r0		!  54 MT
78*4882a593Smuzhiyun
79*4882a593Smuzhiyun	add	#-6,r2		!  50 EX
80*4882a593Smuzhiyun	bt	9f		! 109 BR
81*4882a593Smuzhiyun
82*4882a593Smuzhiyun8:	cmp/hi	r2,r0		!  57 MT
83*4882a593Smuzhiyun	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
84*4882a593Smuzhiyun
85*4882a593Smuzhiyun	bt/s	8b		! 109 BR
86*4882a593Smuzhiyun
87*4882a593Smuzhiyun	 mov.b	r1,@-r0		!  29 LS
88*4882a593Smuzhiyun
89*4882a593Smuzhiyun9:	rts
90*4882a593Smuzhiyun	 nop
91*4882a593Smuzhiyun
92*4882a593Smuzhiyun
93*4882a593Smuzhiyun	!
94*4882a593Smuzhiyun	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
95*4882a593Smuzhiyun	!
96*4882a593Smuzhiyun
97*4882a593Smuzhiyun	! Size is 16 or greater, and may have trailing bytes
98*4882a593Smuzhiyun
99*4882a593Smuzhiyun	.balign	32
	/*
	 * .Lcase3 - backwards copy selected by the dispatch code in memcpy
	 * when bits 0 and 1 of r5 (src - dst) are both set.
	 *
	 * On entry:
	 *   r0 = end-of-destination store pointer (stores use @-r0)
	 *   r4 = dst (start of destination)
	 *   r5 = src - dst, so @(r0,r5) addresses the source for r0
	 * Scratch: r1, r2, r3, r6, r7.
	 *
	 * Same scheme as .Lcase1 but with the shift amounts swapped: the
	 * previously loaded long (r7) is shifted by 8 bits and the new one by
	 * 24 bits before they are merged and stored.
	 */
100*4882a593Smuzhiyun.Lcase3:
101*4882a593Smuzhiyun	! Read a long word and write a long word at once
102*4882a593Smuzhiyun	! At the start of each iteration, r7 contains last long load
	/*
	 * r5 = (src - dst) - 3 for the priming load into r7, then
	 * (src - dst) - 7 for the loads inside the loop.
	 * r2 = r4 + 7 is the loop bound tested with cmp/hi.
	 */
103*4882a593Smuzhiyun	add	#-3,r5		! 79 EX
104*4882a593Smuzhiyun	mov	r4,r2		!  5 MT (0 cycles latency)
105*4882a593Smuzhiyun
106*4882a593Smuzhiyun	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
107*4882a593Smuzhiyun	add	#-4,r5		! 50 EX
108*4882a593Smuzhiyun
109*4882a593Smuzhiyun	add	#7,r2		!  79 EX
110*4882a593Smuzhiyun	!
	/*
	 * In both endian variants cmp/hi sets T while r0 > r2 (unsigned,
	 * sampled before the store).  bt/s loops on T; the store in its
	 * delay slot executes on every pass, including the last one.
	 */
111*4882a593Smuzhiyun#ifdef CONFIG_CPU_LITTLE_ENDIAN
112*4882a593Smuzhiyun	! 6 cycles, 4 bytes per iteration
	/* r3 = (previous long << 8) | (new long >> 24); r7 = new long */
113*4882a593Smuzhiyun3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
114*4882a593Smuzhiyun	mov	r7, r3		!   5 MT (latency=0)	! RQPO
115*4882a593Smuzhiyun
116*4882a593Smuzhiyun	cmp/hi	r2,r0		!  57 MT
117*4882a593Smuzhiyun	shll8	r3		! 102 EX		! QPOx
118*4882a593Smuzhiyun
119*4882a593Smuzhiyun	mov	r1,r6		!   5 MT (latency=0)
120*4882a593Smuzhiyun	shlr16	r6		! 107 EX
121*4882a593Smuzhiyun
122*4882a593Smuzhiyun	shlr8	r6		! 106 EX		! xxxN
123*4882a593Smuzhiyun	mov	r1, r7		!   5 MT (latency=0)
124*4882a593Smuzhiyun
125*4882a593Smuzhiyun	or	r6,r3		!  82 EX		! QPON
126*4882a593Smuzhiyun	bt/s	3b		! 109 BR
127*4882a593Smuzhiyun
128*4882a593Smuzhiyun	 mov.l	r3,@-r0		!  30 LS
129*4882a593Smuzhiyun#else
	/*
	 * Big-endian variant: r7 is reloaded directly (no r1 temporary), so
	 * r3 must be taken from the old r7 before the load.
	 * r3 = (previous long >> 8) | (new long << 24).
	 */
130*4882a593Smuzhiyun3:	mov	r7,r3		! OPQR
131*4882a593Smuzhiyun	shlr8	r3		! xOPQ
132*4882a593Smuzhiyun	mov.l	@(r0,r5),r7	! KLMN
133*4882a593Smuzhiyun	mov	r7,r6
134*4882a593Smuzhiyun	shll16	r6
135*4882a593Smuzhiyun	shll8	r6		! Nxxx
136*4882a593Smuzhiyun	or	r6,r3		! NOPQ
137*4882a593Smuzhiyun	cmp/hi	r2,r0
138*4882a593Smuzhiyun	bt/s	3b
139*4882a593Smuzhiyun	 mov.l	r3,@-r0
140*4882a593Smuzhiyun#endif
141*4882a593Smuzhiyun
142*4882a593Smuzhiyun	! Finally, copy a byte at once, if necessary
	/*
	 * Byte tail: r5 goes from (src - dst) - 7 back to (src - dst) - 1 and
	 * r2 becomes r4 + 1.  Return at once if r0 already equals r4;
	 * otherwise loop 8: copies one byte per pass while r0 > r4 + 1
	 * (tested before the store in the delay slot).
	 */
143*4882a593Smuzhiyun
144*4882a593Smuzhiyun	add	#6,r5		!  50 EX
145*4882a593Smuzhiyun	cmp/eq	r4,r0		!  54 MT
146*4882a593Smuzhiyun
147*4882a593Smuzhiyun	add	#-6,r2		!  50 EX
148*4882a593Smuzhiyun	bt	9f		! 109 BR
149*4882a593Smuzhiyun
150*4882a593Smuzhiyun8:	cmp/hi	r2,r0		!  57 MT
151*4882a593Smuzhiyun	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
152*4882a593Smuzhiyun
153*4882a593Smuzhiyun	bt/s	8b		! 109 BR
154*4882a593Smuzhiyun
155*4882a593Smuzhiyun	 mov.b	r1,@-r0		!  29 LS
156*4882a593Smuzhiyun
157*4882a593Smuzhiyun9:	rts
158*4882a593Smuzhiyun	 nop
159*4882a593Smuzhiyun
160*4882a593SmuzhiyunENTRY(memcpy)
	/*
	 * Register use, as established by the code below:
	 *   r4 = dst, r5 = src, r6 = n on entry.
	 *   r0 = dst + n; every store pre-decrements it (@-r0), so the copy
	 *        proceeds from the end of the buffers towards the start.
	 *   r5 = src - dst, so that @(r0,r5) addresses the source location
	 *        that corresponds to r0.
	 * An instruction indented by one extra space sits in the delay slot
	 * of the delayed branch above it (bt/s, bf/s, bra, rts) and executes
	 * whether or not that branch is taken.
	 */
161*4882a593Smuzhiyun
162*4882a593Smuzhiyun	! Calculate the invariants which will be used in the remainder
163*4882a593Smuzhiyun	! of the code:
164*4882a593Smuzhiyun	!
165*4882a593Smuzhiyun	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
166*4882a593Smuzhiyun	!	         [ ...  ]                 [ ...  ]
167*4882a593Smuzhiyun	!	           :                        :
168*4882a593Smuzhiyun	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
169*4882a593Smuzhiyun	!
170*4882a593Smuzhiyun	!
171*4882a593Smuzhiyun
172*4882a593Smuzhiyun	! Short circuit the common case of src, dst and len being 32 bit aligned
173*4882a593Smuzhiyun	! and test for zero length move
174*4882a593Smuzhiyun
	/*
	 * r0 = n | dst | src: its two low bits are clear only when all three
	 * values are multiples of 4.  tst r6,r6 sets T for a zero length.
	 */
175*4882a593Smuzhiyun	mov	r6, r0		!   5 MT (0 cycle latency)
176*4882a593Smuzhiyun	or	r4, r0		!  82 EX
177*4882a593Smuzhiyun
178*4882a593Smuzhiyun	or	r5, r0		!  82 EX
179*4882a593Smuzhiyun	tst	r6, r6		!  86 MT
180*4882a593Smuzhiyun
	/*
	 * The tst #3 in the delay slot produces the T value consumed by the
	 * "bt/s .Lcase00" below; the mov/add/mov in between leave T alone.
	 */
181*4882a593Smuzhiyun	bt/s	99f		! 111 BR		(zero len)
182*4882a593Smuzhiyun	 tst	#3, r0		!  87 MT
183*4882a593Smuzhiyun
	/* r0 = dst + n; r1 = 16 is the "small copy" threshold used later */
184*4882a593Smuzhiyun	mov	r4, r0		!   5 MT (0 cycle latency)
185*4882a593Smuzhiyun	add	r6, r0		!  49 EX
186*4882a593Smuzhiyun
187*4882a593Smuzhiyun	mov	#16, r1		!   6 EX
188*4882a593Smuzhiyun	bt/s	.Lcase00	! 111 BR		(aligned)
189*4882a593Smuzhiyun
	/* delay slot: r5 = src - dst on both the taken and fall-through path */
190*4882a593Smuzhiyun	 sub	r4, r5		!  75 EX
191*4882a593Smuzhiyun
192*4882a593Smuzhiyun	! Arguments are not nicely long word aligned or zero len.
193*4882a593Smuzhiyun	! Check for small copies, and if so do a simple byte at a time copy.
194*4882a593Smuzhiyun	!
195*4882a593Smuzhiyun	! Deciding on an exact value of 'small' is not easy, as the point at which
196*4882a593Smuzhiyun	! using the optimised routines becomes worthwhile varies (these are the
197*4882a593Smuzhiyun	! cycle counts for different sizes using byte-at-a-time vs. optimised):
198*4882a593Smuzhiyun	!	size	byte-at-time	long	word	byte
199*4882a593Smuzhiyun	!	16	42		39-40	46-50	50-55
200*4882a593Smuzhiyun	!	24	58		43-44	54-58	62-67
201*4882a593Smuzhiyun	!	36	82		49-50	66-70	80-85
202*4882a593Smuzhiyun	! However the penalty for getting it 'wrong' is much higher for long word
203*4882a593Smuzhiyun	! aligned data (and this is more common), so use a value of 16.
204*4882a593Smuzhiyun
205*4882a593Smuzhiyun	cmp/gt	r6,r1		!  56 MT
	/*
	 * Small-copy path.  r1 was loaded with 16 before the aligned-case
	 * branch, so the compare above sets T when 16 > n (signed) and the
	 * bf/s below leaves for label 6 otherwise.  r5 becomes
	 * (src - dst) - 1 and r3 ends up as (src - dst) - 2, giving the
	 * load offsets for the two bytes stored per loop pass.
	 */
206*4882a593Smuzhiyun
207*4882a593Smuzhiyun	add	#-1,r5		!  50 EX
208*4882a593Smuzhiyun	bf/s	6f		! 108 BR		(not small)
209*4882a593Smuzhiyun
210*4882a593Smuzhiyun	 mov	r5, r3		!   5 MT (latency=0)
	/*
	 * shlr leaves n / 2 in r6 and the old bit 0 of n in T.
	 *   even n: enter the loop at 4: with the last source byte in r1.
	 *   odd n:  store that byte first; if n / 2 is zero (n was 1) that
	 *           was the whole copy, so leave through 98.
	 */
211*4882a593Smuzhiyun	shlr	r6		! 104 EX
212*4882a593Smuzhiyun
213*4882a593Smuzhiyun	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
214*4882a593Smuzhiyun	bf/s	4f		! 111 BR
215*4882a593Smuzhiyun
216*4882a593Smuzhiyun	 add	#-1,r3		!  50 EX
217*4882a593Smuzhiyun	tst	r6, r6		!  86 MT
218*4882a593Smuzhiyun
219*4882a593Smuzhiyun	bt/s	98f		! 110 BR
220*4882a593Smuzhiyun	 mov.b	r1,@-r0		!  29 LS
221*4882a593Smuzhiyun
222*4882a593Smuzhiyun	! 4 cycles, 2 bytes per iteration
	/* dt decrements r6 and sets T when it reaches zero, ending the loop */
223*4882a593Smuzhiyun3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
224*4882a593Smuzhiyun
225*4882a593Smuzhiyun4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
226*4882a593Smuzhiyun	dt	r6		!  67 EX
227*4882a593Smuzhiyun
228*4882a593Smuzhiyun	mov.b	r1,@-r0		!  29 LS
229*4882a593Smuzhiyun	bf/s	3b		! 111 BR
230*4882a593Smuzhiyun
231*4882a593Smuzhiyun	 mov.b	r2,@-r0		!  29 LS
	/* 98: return after a small copy; r0 is left as the loop ended it */
232*4882a593Smuzhiyun98:
233*4882a593Smuzhiyun	rts
234*4882a593Smuzhiyun	 nop
235*4882a593Smuzhiyun
	/* 99: zero-length return; the delay slot copies r4 (dst) into r0 */
236*4882a593Smuzhiyun99:	rts
237*4882a593Smuzhiyun	 mov	r4, r0
238*4882a593Smuzhiyun
239*4882a593Smuzhiyun	! Size is not small, so it is worthwhile looking for optimisations.
240*4882a593Smuzhiyun	! First align destination to a long word boundary.
241*4882a593Smuzhiyun	!
242*4882a593Smuzhiyun	! r5 = normal value -1
243*4882a593Smuzhiyun
244*4882a593Smuzhiyun6:	tst	#3, r0		!  87 MT
	/*
	 * 6: reached from the "not small" branch, i.e. when the length is
	 * not below 16 and the fully aligned fast path was not taken.
	 * r3 = r0 & 3 is the number of bytes loop 1: copies to bring the
	 * store pointer r0 down to a long boundary; r6 (bytes remaining) is
	 * reduced by one per byte.  The loop is skipped (bt/s 2f) when r0 is
	 * already long aligned.
	 */
245*4882a593Smuzhiyun        mov	#3, r3		!   6 EX
246*4882a593Smuzhiyun
247*4882a593Smuzhiyun	bt/s	2f		! 111 BR
248*4882a593Smuzhiyun	 and	r0,r3		!  78 EX
249*4882a593Smuzhiyun
250*4882a593Smuzhiyun	! 3 cycles, 1 byte per iteration
251*4882a593Smuzhiyun1:	dt	r3		!  67 EX
252*4882a593Smuzhiyun	mov.b	@(r0,r5),r1	!  19 LS (latency=2)
253*4882a593Smuzhiyun
254*4882a593Smuzhiyun	add	#-1, r6		!  79 EX
255*4882a593Smuzhiyun	bf/s	1b		! 109 BR
256*4882a593Smuzhiyun
257*4882a593Smuzhiyun	 mov.b	r1,@-r0		!  28 LS
258*4882a593Smuzhiyun
	/* undo the earlier "add #-1,r5": r5 is src - dst again */
259*4882a593Smuzhiyun2:	add	#1, r5		!  79 EX
260*4882a593Smuzhiyun
261*4882a593Smuzhiyun	! Now select the appropriate bulk transfer code based on relative
262*4882a593Smuzhiyun	! alignment of src and dst.
	/*
	 * tst #imm works on r0 only, so the store pointer is parked in r3
	 * while r0 holds src - dst, and is put back by the "mov r3, r0" in
	 * the delay slot of each dispatch branch (which also executes when
	 * that branch falls through to the bra after it).
	 *
	 *   (src - dst) & 3 == 0:  .Lcase0 if r6 < 64, .Lcase0b if r6 >= 64
	 *   (src - dst) & 3 == 2:  .Lcase2 if r6 < 64, .Lcase2b if r6 >= 64
	 *   (src - dst) & 3 == 1:  .Lcase1
	 *   (src - dst) & 3 == 3:  .Lcase3
	 */
263*4882a593Smuzhiyun
264*4882a593Smuzhiyun	mov	r0, r3		!   5 MT (latency=0)
265*4882a593Smuzhiyun
266*4882a593Smuzhiyun	mov	r5, r0		!   5 MT (latency=0)
267*4882a593Smuzhiyun	tst	#1, r0		!  87 MT
268*4882a593Smuzhiyun
269*4882a593Smuzhiyun	bf/s	1f		! 111 BR
270*4882a593Smuzhiyun	 mov	#64, r7		!   6 EX
271*4882a593Smuzhiyun
272*4882a593Smuzhiyun	! bit 0 clear
273*4882a593Smuzhiyun
	/* T = (r6 >= 64), signed; the delay-slot tst #2 then sets T for bit 1 */
274*4882a593Smuzhiyun	cmp/ge	r7, r6		!  55 MT
275*4882a593Smuzhiyun
276*4882a593Smuzhiyun	bt/s	2f		! 111 BR
277*4882a593Smuzhiyun	 tst	#2, r0		!  87 MT
278*4882a593Smuzhiyun
279*4882a593Smuzhiyun	! small
280*4882a593Smuzhiyun	bt/s	.Lcase0
281*4882a593Smuzhiyun	 mov	r3, r0
282*4882a593Smuzhiyun
283*4882a593Smuzhiyun	bra	.Lcase2
284*4882a593Smuzhiyun	 nop
285*4882a593Smuzhiyun
286*4882a593Smuzhiyun	! big
287*4882a593Smuzhiyun2:	bt/s	.Lcase0b
288*4882a593Smuzhiyun	 mov	r3, r0
289*4882a593Smuzhiyun
290*4882a593Smuzhiyun	bra	.Lcase2b
291*4882a593Smuzhiyun	 nop
292*4882a593Smuzhiyun
293*4882a593Smuzhiyun	! bit 0 set
294*4882a593Smuzhiyun1:	tst	#2, r0		! 87 MT
295*4882a593Smuzhiyun
296*4882a593Smuzhiyun	bt/s	.Lcase1
297*4882a593Smuzhiyun	 mov	r3, r0
298*4882a593Smuzhiyun
299*4882a593Smuzhiyun	bra	.Lcase3
300*4882a593Smuzhiyun	 nop
301*4882a593Smuzhiyun
302*4882a593Smuzhiyun
303*4882a593Smuzhiyun	!
304*4882a593Smuzhiyun	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
305*4882a593Smuzhiyun	!
306*4882a593Smuzhiyun
307*4882a593Smuzhiyun	! src, dst and size are all long word aligned
308*4882a593Smuzhiyun	! size is non-zero
309*4882a593Smuzhiyun
310*4882a593Smuzhiyun	.balign	32
	/*
	 * .Lcase00 - fast path taken from the memcpy entry code when src,
	 * dst and n are all multiples of 4 and n is non-zero.
	 *
	 * On entry: r0 = dst + n, r5 = src - dst, r6 = n.
	 * Scratch: r1, r2, r3.
	 *
	 * n >= 64 is handed to .Lcase00b with r5 already lowered by 4.
	 * Smaller sizes are copied here, two longs per loop pass, counted
	 * down with dt.
	 */
311*4882a593Smuzhiyun.Lcase00:
312*4882a593Smuzhiyun	mov	#64, r1		!   6 EX
313*4882a593Smuzhiyun	mov	r5, r3		!   5 MT (latency=0)
314*4882a593Smuzhiyun
	/* T = (64 > n), signed.  r5 = (src - dst) - 4 on both paths. */
315*4882a593Smuzhiyun	cmp/gt	r6, r1		!  56 MT
316*4882a593Smuzhiyun	add	#-4, r5		!  50 EX
317*4882a593Smuzhiyun
	/*
	 * bf has no delay slot, so the shifts below run only on the small
	 * path: r6 = n / 4 (longs), then r6 = n / 8 (pairs) with T holding
	 * the bit shifted out, i.e. whether the long count was odd.
	 */
318*4882a593Smuzhiyun	bf	.Lcase00b	! 108 BR		(big loop)
319*4882a593Smuzhiyun	shlr2	r6		! 105 EX
320*4882a593Smuzhiyun
321*4882a593Smuzhiyun	shlr	r6		! 104 EX
322*4882a593Smuzhiyun	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
323*4882a593Smuzhiyun
	/*
	 * Even count: enter the loop at 4: with the first long in r1.
	 * r3 = (src - dst) - 8 is the offset of the second long of a pair.
	 */
324*4882a593Smuzhiyun	bf/s	4f		! 111 BR
325*4882a593Smuzhiyun	 add	#-8, r3		!  50 EX
326*4882a593Smuzhiyun
	/* Odd count: store one long first; done if no pairs remain */
327*4882a593Smuzhiyun	tst	r6, r6		!  86 MT
328*4882a593Smuzhiyun	bt/s	5f		! 110 BR
329*4882a593Smuzhiyun
330*4882a593Smuzhiyun	 mov.l	r1,@-r0		!  30 LS
331*4882a593Smuzhiyun
332*4882a593Smuzhiyun	! 4 cycles, 2 long words per iteration
333*4882a593Smuzhiyun3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
334*4882a593Smuzhiyun
335*4882a593Smuzhiyun4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
336*4882a593Smuzhiyun	dt	r6		!  67 EX
337*4882a593Smuzhiyun
338*4882a593Smuzhiyun	mov.l	r1, @-r0	!  30 LS
339*4882a593Smuzhiyun	bf/s	3b		! 109 BR
340*4882a593Smuzhiyun
341*4882a593Smuzhiyun	 mov.l	r2, @-r0	!  30 LS
342*4882a593Smuzhiyun
343*4882a593Smuzhiyun5:	rts
344*4882a593Smuzhiyun	 nop
345*4882a593Smuzhiyun
346*4882a593Smuzhiyun
347*4882a593Smuzhiyun	! Size is 16 or greater and less than 64, but may have trailing bytes
348*4882a593Smuzhiyun
349*4882a593Smuzhiyun	.balign	32
	/*
	 * .Lcase0 - selected by the dispatch code in memcpy when the two low
	 * bits of r5 (src - dst) are clear and r6 is below 64.
	 *
	 * On entry:
	 *   r0 = store pointer, long aligned by the dispatch code
	 *   r4 = dst, r5 = src - dst, r6 = bytes still to copy
	 * Scratch: r1, r2, r3, r7.
	 *
	 * Copies two longs per loop pass while r0 > r4 + 11 (tested before
	 * the stores), then finishes with a byte loop.
	 */
350*4882a593Smuzhiyun.Lcase0:
351*4882a593Smuzhiyun	add	#-4, r5		!  50 EX
352*4882a593Smuzhiyun	mov	r4, r7		!   5 MT (latency=0)
353*4882a593Smuzhiyun
354*4882a593Smuzhiyun	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
355*4882a593Smuzhiyun	mov	#4, r2		!   6 EX
356*4882a593Smuzhiyun
	/*
	 * r7 = r4 + 11 is the loop bound.  tst r2,r6 sets T when bit 2 of
	 * the remaining length is clear; in that case the loop is entered
	 * at 4: with the first long already in r1.  Otherwise one long is
	 * stored first.  r3 = (src - dst) - 8 addresses the second long of
	 * each pair.
	 */
357*4882a593Smuzhiyun	add	#11, r7		!  50 EX
358*4882a593Smuzhiyun	tst	r2, r6		!  86 MT
359*4882a593Smuzhiyun
360*4882a593Smuzhiyun	mov	r5, r3		!   5 MT (latency=0)
361*4882a593Smuzhiyun	bt/s	4f		! 111 BR
362*4882a593Smuzhiyun
363*4882a593Smuzhiyun	 add	#-4, r3		!  50 EX
364*4882a593Smuzhiyun	mov.l	r1,@-r0		!  30 LS
365*4882a593Smuzhiyun
366*4882a593Smuzhiyun	! 4 cycles, 2 long words per iteration
367*4882a593Smuzhiyun3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
368*4882a593Smuzhiyun
369*4882a593Smuzhiyun4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
370*4882a593Smuzhiyun	cmp/hi	r7, r0
371*4882a593Smuzhiyun
372*4882a593Smuzhiyun	mov.l	r1, @-r0	!  30 LS
373*4882a593Smuzhiyun	bt/s	3b		! 109 BR
374*4882a593Smuzhiyun
375*4882a593Smuzhiyun	 mov.l	r2, @-r0	!  30 LS
376*4882a593Smuzhiyun
377*4882a593Smuzhiyun	! Copy the final 0-3 bytes
	/*
	 * r5 becomes (src - dst) - 1 for byte addressing and r7 becomes
	 * r4 + 1.  Return at once if r0 already equals r4; otherwise copy
	 * one byte per pass while r0 > r4 + 1 (tested before the store in
	 * the delay slot).
	 */
378*4882a593Smuzhiyun
379*4882a593Smuzhiyun	add	#3,r5		!  50 EX
380*4882a593Smuzhiyun
381*4882a593Smuzhiyun	cmp/eq	r0, r4		!  54 MT
382*4882a593Smuzhiyun	add	#-10, r7	!  50 EX
383*4882a593Smuzhiyun
384*4882a593Smuzhiyun	bt	9f		! 110 BR
385*4882a593Smuzhiyun
386*4882a593Smuzhiyun	! 3 cycles, 1 byte per iteration
387*4882a593Smuzhiyun1:	mov.b	@(r0,r5),r1	!  19 LS
388*4882a593Smuzhiyun	cmp/hi	r7,r0		!  57 MT
389*4882a593Smuzhiyun
390*4882a593Smuzhiyun	bt/s	1b		! 111 BR
391*4882a593Smuzhiyun	 mov.b	r1,@-r0		!  28 LS
392*4882a593Smuzhiyun
393*4882a593Smuzhiyun9:	rts
394*4882a593Smuzhiyun	 nop
395*4882a593Smuzhiyun
396*4882a593Smuzhiyun	! Size is at least 64 bytes, so will be going round the big loop at least once.
397*4882a593Smuzhiyun	!
398*4882a593Smuzhiyun	!   r2 = rounded up r4
399*4882a593Smuzhiyun	!   r3 = rounded down r0
400*4882a593Smuzhiyun
401*4882a593Smuzhiyun	.balign	32
	/*
	 * .Lcase0b / .Lcase00b - long-aligned copy of at least 64 bytes.
	 * .Lcase0b is entered from the dispatch code with r5 = src - dst and
	 * lowers it by 4; .Lcase00b is entered from .Lcase00, which has
	 * already done that.  r0 = store pointer, r4 = dst.
	 *
	 * Three phases, all working downwards from r0:
	 *   1. copy longs until r0 is a multiple of 32 (r3 = r0 & ~0x1f);
	 *   2. copy 32-byte blocks until the store pointer reaches
	 *      r2 = (r4 + 0x1f) & ~0x1f, with r8-r11 saved on the stack;
	 *   3. copy the remaining r2 - r4 bytes (fewer than 32) as longs
	 *      and then single bytes.
	 */
402*4882a593Smuzhiyun.Lcase0b:
403*4882a593Smuzhiyun	add	#-4, r5		!  50 EX
404*4882a593Smuzhiyun
405*4882a593Smuzhiyun.Lcase00b:
406*4882a593Smuzhiyun	mov	r0, r3		!   5 MT (latency=0)
407*4882a593Smuzhiyun	mov	#(~0x1f), r1	!   6 EX
408*4882a593Smuzhiyun
409*4882a593Smuzhiyun	and	r1, r3		!  78 EX
410*4882a593Smuzhiyun	mov	r4, r2		!   5 MT (latency=0)
411*4882a593Smuzhiyun
	/* T = r0 is already a multiple of 32: skip phase 1 */
412*4882a593Smuzhiyun	cmp/eq	r3, r0		!  54 MT
413*4882a593Smuzhiyun	add	#0x1f, r2	!  50 EX
414*4882a593Smuzhiyun
415*4882a593Smuzhiyun	bt/s	1f		! 110 BR
416*4882a593Smuzhiyun	 and	r1, r2		!  78 EX
417*4882a593Smuzhiyun
418*4882a593Smuzhiyun	! copy initial words until cache line aligned
	/*
	 * r1 = first source long, r6 = r5 - 4 = offset of the long below it.
	 * r3 is raised by 8 so that "cmp/eq r3, r0" in the loop is true on
	 * the pass whose two stores bring r0 down to the 32-byte boundary.
	 * If bit 2 of r0 is set, one long is stored on its own first; when
	 * that store already reaches the boundary ((r0 & 0x18) == 0 before
	 * it) the pair loop is skipped.
	 */
419*4882a593Smuzhiyun
420*4882a593Smuzhiyun	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
421*4882a593Smuzhiyun	tst	#4, r0		!  87 MT
422*4882a593Smuzhiyun
423*4882a593Smuzhiyun	mov	r5, r6		!   5 MT (latency=0)
424*4882a593Smuzhiyun	add	#-4, r6		!  50 EX
425*4882a593Smuzhiyun
426*4882a593Smuzhiyun	bt/s	4f		! 111 BR
427*4882a593Smuzhiyun	 add	#8, r3		!  50 EX
428*4882a593Smuzhiyun
429*4882a593Smuzhiyun	tst	#0x18, r0	!  87 MT
430*4882a593Smuzhiyun
431*4882a593Smuzhiyun	bt/s	1f		! 109 BR
432*4882a593Smuzhiyun	 mov.l	r1,@-r0		!  30 LS
433*4882a593Smuzhiyun
434*4882a593Smuzhiyun	! 4 cycles, 2 long words per iteration
435*4882a593Smuzhiyun3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
436*4882a593Smuzhiyun
437*4882a593Smuzhiyun4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
438*4882a593Smuzhiyun	cmp/eq	r3, r0		!  54 MT
439*4882a593Smuzhiyun
440*4882a593Smuzhiyun	mov.l	r1, @-r0	!  30 LS
441*4882a593Smuzhiyun	bf/s	3b		! 109 BR
442*4882a593Smuzhiyun
443*4882a593Smuzhiyun	 mov.l	r7, @-r0	!  30 LS
444*4882a593Smuzhiyun
445*4882a593Smuzhiyun	! Copy the cache line aligned blocks
446*4882a593Smuzhiyun	!
447*4882a593Smuzhiyun	! In use: r0, r2, r4, r5
448*4882a593Smuzhiyun	! Scratch: r1, r3, r6, r7
449*4882a593Smuzhiyun	!
450*4882a593Smuzhiyun	! We could do this with the four scratch registers, but if src
451*4882a593Smuzhiyun	! and dest hit the same cache line, this will thrash, so make
452*4882a593Smuzhiyun	! use of additional registers.
453*4882a593Smuzhiyun	!
454*4882a593Smuzhiyun	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
455*4882a593Smuzhiyun	!   r5:	 src (was r0+r5)
456*4882a593Smuzhiyun	!   r1:	 dest (was r0)
457*4882a593Smuzhiyun	! this can be reversed at the end, so we don't need to save any extra
458*4882a593Smuzhiyun	! state.
459*4882a593Smuzhiyun	!
	/*
	 * r8-r11 are pushed interleaved with the pointer setup.
	 * "add r0, r5" turns the offset into a real source pointer; with the
	 * -4 already applied and the further -0x1c it ends up 0x20 below the
	 * source location matching r0, i.e. at the start of the 32-byte
	 * source block copied by the first pass.
	 */
460*4882a593Smuzhiyun1:	mov.l	r8, @-r15	!  30 LS
461*4882a593Smuzhiyun	add	r0, r5		!  49 EX
462*4882a593Smuzhiyun
463*4882a593Smuzhiyun	mov.l	r9, @-r15	!  30 LS
464*4882a593Smuzhiyun	mov	r0, r1		!   5 MT (latency=0)
465*4882a593Smuzhiyun
466*4882a593Smuzhiyun	mov.l	r10, @-r15	!  30 LS
467*4882a593Smuzhiyun	add	#-0x1c, r5	!  50 EX
468*4882a593Smuzhiyun
469*4882a593Smuzhiyun	mov.l	r11, @-r15	!  30 LS
470*4882a593Smuzhiyun
471*4882a593Smuzhiyun	! 16 cycles, 32 bytes per iteration
	/*
	 * Eight longs are loaded before anything is stored.  r1 is lowered
	 * to the base of the destination block, the first long is written
	 * with movca.l (data taken from r0) and the other seven with
	 * displacement stores.  The loop ends when r1 equals r2; the last
	 * store sits in the bf/s delay slot.
	 */
472*4882a593Smuzhiyun2:	mov.l	@(0x00,r5),r0	! 18 LS (latency=2)
473*4882a593Smuzhiyun	add	#-0x20, r1	! 50 EX
474*4882a593Smuzhiyun	mov.l	@(0x04,r5),r3	! 18 LS (latency=2)
475*4882a593Smuzhiyun	mov.l	@(0x08,r5),r6	! 18 LS (latency=2)
476*4882a593Smuzhiyun	mov.l	@(0x0c,r5),r7	! 18 LS (latency=2)
477*4882a593Smuzhiyun	mov.l	@(0x10,r5),r8	! 18 LS (latency=2)
478*4882a593Smuzhiyun	mov.l	@(0x14,r5),r9	! 18 LS (latency=2)
479*4882a593Smuzhiyun	mov.l	@(0x18,r5),r10	! 18 LS (latency=2)
480*4882a593Smuzhiyun	mov.l	@(0x1c,r5),r11	! 18 LS (latency=2)
481*4882a593Smuzhiyun	movca.l	r0,@r1		! 40 LS (latency=3-7)
482*4882a593Smuzhiyun	mov.l	r3,@(0x04,r1)	! 33 LS
483*4882a593Smuzhiyun	mov.l	r6,@(0x08,r1)	! 33 LS
484*4882a593Smuzhiyun	mov.l	r7,@(0x0c,r1)	! 33 LS
485*4882a593Smuzhiyun
486*4882a593Smuzhiyun	mov.l	r8,@(0x10,r1)	! 33 LS
487*4882a593Smuzhiyun	add	#-0x20, r5	! 50 EX
488*4882a593Smuzhiyun
489*4882a593Smuzhiyun	mov.l	r9,@(0x14,r1)	! 33 LS
490*4882a593Smuzhiyun	cmp/eq	r2,r1		! 54 MT
491*4882a593Smuzhiyun
492*4882a593Smuzhiyun	mov.l	r10,@(0x18,r1)	!  33 LS
493*4882a593Smuzhiyun	bf/s	2b		! 109 BR
494*4882a593Smuzhiyun
495*4882a593Smuzhiyun	 mov.l	r11,@(0x1c,r1)	!  33 LS
496*4882a593Smuzhiyun
	/*
	 * Back to pointer/offset form: r0 = store pointer again and
	 * r5 = (src - dst) - 0x20.  The saved registers are popped in
	 * reverse order, interleaved with the "anything left?" test.
	 */
497*4882a593Smuzhiyun	mov	r1, r0		!   5 MT (latency=0)
498*4882a593Smuzhiyun
499*4882a593Smuzhiyun	mov.l	@r15+, r11	!  15 LS
500*4882a593Smuzhiyun	sub	r1, r5		!  75 EX
501*4882a593Smuzhiyun
502*4882a593Smuzhiyun	mov.l	@r15+, r10	!  15 LS
503*4882a593Smuzhiyun	cmp/eq	r4, r0		!  54 MT
504*4882a593Smuzhiyun
505*4882a593Smuzhiyun	bf/s	1f		! 109 BR
506*4882a593Smuzhiyun	 mov.l	 @r15+, r9	!  15 LS
507*4882a593Smuzhiyun
	/*
	 * The pop of r8 is both the delay slot of this rts (copy finished)
	 * and branch target 1: (bytes remain), so r8 is restored on either
	 * path and execution continues below only in the second case.
	 */
508*4882a593Smuzhiyun	rts
509*4882a593Smuzhiyun1:	 mov.l	@r15+, r8	!  15 LS
510*4882a593Smuzhiyun	sub	r4, r1		!  75 EX		(len remaining)
511*4882a593Smuzhiyun
512*4882a593Smuzhiyun	! number of trailing bytes is non-zero
513*4882a593Smuzhiyun	!
514*4882a593Smuzhiyun	! invariants restored (r5 already decremented by 4)
515*4882a593Smuzhiyun	! also r1=num bytes remaining
516*4882a593Smuzhiyun
	/*
	 * r2 = 4, r7 = r4 + 11 (long-pair loop bound, set in the delay
	 * slot), r5 = (src - dst) - 4.  Fewer than 4 bytes left goes
	 * straight to the byte tail at 5:.
	 */
517*4882a593Smuzhiyun	mov	#4, r2		!   6 EX
518*4882a593Smuzhiyun	mov	r4, r7		!   5 MT (latency=0)
519*4882a593Smuzhiyun
520*4882a593Smuzhiyun	add	#0x1c, r5	!  50 EX		(back to -4)
521*4882a593Smuzhiyun	cmp/hs	r2, r1		!  58 MT
522*4882a593Smuzhiyun
523*4882a593Smuzhiyun	bf/s	5f		! 108 BR
524*4882a593Smuzhiyun	 add	 #11, r7	!  50 EX
525*4882a593Smuzhiyun
	/*
	 * tst r2,r1 sets T when bit 2 of the remaining count is clear; the
	 * pair loop is then entered at 4: with the first long in r6 and
	 * r3 = r5 - 4 addressing the second long of each pair.
	 */
526*4882a593Smuzhiyun	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
527*4882a593Smuzhiyun	tst	r2, r1		!  86 MT
528*4882a593Smuzhiyun
529*4882a593Smuzhiyun	mov	r5, r3		!   5 MT (latency=0)
530*4882a593Smuzhiyun	bt/s	4f		! 111 BR
531*4882a593Smuzhiyun
532*4882a593Smuzhiyun	 add	#-4, r3		!  50 EX
	/*
	 * NOTE(review): r1 and r2 have not changed since the cmp/hs that
	 * guarded this path, so this compare appears to set T every time
	 * and the bt/s below always goes to 5: after storing one long.
	 * Whatever is left is then copied by the byte loop, which is still
	 * correct; confirm whether falling into loop 3: was ever intended.
	 */
533*4882a593Smuzhiyun	cmp/hs	r2, r1		!  58 MT
534*4882a593Smuzhiyun
535*4882a593Smuzhiyun	bt/s	5f		! 111 BR
536*4882a593Smuzhiyun	 mov.l	r6,@-r0		!  30 LS
537*4882a593Smuzhiyun
538*4882a593Smuzhiyun	! 4 cycles, 2 long words per iteration
539*4882a593Smuzhiyun3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
540*4882a593Smuzhiyun
541*4882a593Smuzhiyun4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
542*4882a593Smuzhiyun	cmp/hi	r7, r0
543*4882a593Smuzhiyun
544*4882a593Smuzhiyun	mov.l	r6, @-r0	!  30 LS
545*4882a593Smuzhiyun	bt/s	3b		! 109 BR
546*4882a593Smuzhiyun
547*4882a593Smuzhiyun	 mov.l	r2, @-r0	!  30 LS
548*4882a593Smuzhiyun
549*4882a593Smuzhiyun	! Copy the final 0-3 bytes
	/*
	 * r7 becomes r4 + 1.  bt has no delay slot, so "add #3,r5" (making
	 * r5 = (src - dst) - 1) runs only when bytes remain.  The byte loop
	 * then copies while r0 > r4 + 1, tested before the delay-slot store.
	 */
550*4882a593Smuzhiyun
551*4882a593Smuzhiyun5:	cmp/eq	r0, r4		!  54 MT
552*4882a593Smuzhiyun	add	#-10, r7	!  50 EX
553*4882a593Smuzhiyun
554*4882a593Smuzhiyun	bt	9f		! 110 BR
555*4882a593Smuzhiyun	add	#3,r5		!  50 EX
556*4882a593Smuzhiyun
557*4882a593Smuzhiyun	! 3 cycles, 1 byte per iteration
558*4882a593Smuzhiyun1:	mov.b	@(r0,r5),r1	!  19 LS
559*4882a593Smuzhiyun	cmp/hi	r7,r0		!  57 MT
560*4882a593Smuzhiyun
561*4882a593Smuzhiyun	bt/s	1b		! 111 BR
562*4882a593Smuzhiyun	 mov.b	r1,@-r0		!  28 LS
563*4882a593Smuzhiyun
564*4882a593Smuzhiyun9:	rts
565*4882a593Smuzhiyun	 nop
566*4882a593Smuzhiyun
567*4882a593Smuzhiyun	!
568*4882a593Smuzhiyun	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
569*4882a593Smuzhiyun	!
570*4882a593Smuzhiyun
571*4882a593Smuzhiyun	.balign	32
572*4882a593Smuzhiyun.Lcase2:
573*4882a593Smuzhiyun	! Size is 16 or greater and less than 64, but may have trailing bytes
574*4882a593Smuzhiyun
575*4882a593Smuzhiyun2:	mov	r5, r6		!   5 MT (latency=0)
	/*
	 * Body of .Lcase2, selected by the dispatch code in memcpy when bit 0
	 * of r5 (src - dst) is clear, bit 1 is set and r6 is below 64.
	 *
	 * On entry: r0 = long-aligned store pointer, r4 = dst,
	 * r5 = src - dst.  Scratch: r1, r2, r3, r6.
	 *
	 * r5 becomes (src - dst) - 2 and r6 becomes (src - dst) - 4, the
	 * offsets of the two 16-bit words copied per loop pass.  The loop
	 * runs while r0 > r4 + 7 (r2), tested before the stores.  What is
	 * left is handed to label 10: further down the file, which expects
	 * r5 to be (src - dst) - 2.
	 */
576*4882a593Smuzhiyun	add	#-2,r5		!  50 EX
577*4882a593Smuzhiyun
578*4882a593Smuzhiyun	mov	r4,r2		!   5 MT (latency=0)
579*4882a593Smuzhiyun	add	#-4,r6		!  50 EX
580*4882a593Smuzhiyun
581*4882a593Smuzhiyun	add	#7,r2		!  50 EX
582*4882a593Smuzhiyun3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
583*4882a593Smuzhiyun
584*4882a593Smuzhiyun	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
585*4882a593Smuzhiyun	cmp/hi	r2,r0		!  57 MT
586*4882a593Smuzhiyun
587*4882a593Smuzhiyun	mov.w	r1,@-r0		!  29 LS
588*4882a593Smuzhiyun	bt/s	3b		! 111 BR
589*4882a593Smuzhiyun
590*4882a593Smuzhiyun	 mov.w	r3,@-r0		!  29 LS
591*4882a593Smuzhiyun
	/* continue with the short-word / byte tail shared with .Lcase2b */
592*4882a593Smuzhiyun	bra	10f
593*4882a593Smuzhiyun	 nop
594*4882a593Smuzhiyun
595*4882a593Smuzhiyun
596*4882a593Smuzhiyun	.balign	32
	/*
	 * .Lcase2b - selected by the dispatch code in memcpy when bit 0 of
	 * r5 (src - dst) is clear, bit 1 is set and r6 is at least 64.
	 *
	 * On entry: r0 = long-aligned store pointer, r4 = dst,
	 * r5 = src - dst.
	 *
	 * Three phases, all working downwards from r0:
	 *   1. copy 16-bit words until r0 is a multiple of 32;
	 *   2. copy 32-byte blocks until the store pointer reaches
	 *      r2 = (r4 + 0x1f) & ~0x1f, with r8-r12 saved on the stack;
	 *      destination longs are assembled from halves of neighbouring
	 *      source longs with xtrct;
	 *   3. finish with 16-bit words and at most one byte (label 10:,
	 *      also the exit path of .Lcase2).
	 */
597*4882a593Smuzhiyun.Lcase2b:
598*4882a593Smuzhiyun	! Size is at least 64 bytes, so will be going round the big loop at least once.
599*4882a593Smuzhiyun	!
600*4882a593Smuzhiyun	!   r2 = rounded up r4
601*4882a593Smuzhiyun	!   r3 = rounded down r0
602*4882a593Smuzhiyun
603*4882a593Smuzhiyun	mov	r0, r3		!   5 MT (latency=0)
604*4882a593Smuzhiyun	mov	#(~0x1f), r1	!   6 EX
605*4882a593Smuzhiyun
606*4882a593Smuzhiyun	and	r1, r3		!  78 EX
607*4882a593Smuzhiyun	mov	r4, r2		!   5 MT (latency=0)
608*4882a593Smuzhiyun
	/*
	 * T = r0 is already a multiple of 32 (skip phase 1).  The add to r5
	 * does not touch T and applies on both paths: r5 = (src - dst) - 2.
	 */
609*4882a593Smuzhiyun	cmp/eq	r3, r0		!  54 MT
610*4882a593Smuzhiyun	add	#0x1f, r2	!  50 EX
611*4882a593Smuzhiyun
612*4882a593Smuzhiyun	add	#-2, r5		!  50 EX
613*4882a593Smuzhiyun	bt/s	1f		! 110 BR
614*4882a593Smuzhiyun	 and	r1, r2		!  78 EX
615*4882a593Smuzhiyun
616*4882a593Smuzhiyun	! Copy a short word one at a time until we are cache line aligned
617*4882a593Smuzhiyun	!   Normal values: r0, r2, r3, r4
618*4882a593Smuzhiyun	!   Unused: r1, r6, r7
619*4882a593Smuzhiyun	!   Mod: r5 (=r5-2)
620*4882a593Smuzhiyun	!
	/*
	 * r3 is raised by 2 so that "cmp/eq r3,r0" is true on the pass whose
	 * delay-slot store brings r0 down to the 32-byte boundary.
	 */
621*4882a593Smuzhiyun	add	#2, r3		!  50 EX
622*4882a593Smuzhiyun
623*4882a593Smuzhiyun2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
624*4882a593Smuzhiyun	cmp/eq	r3,r0		!  54 MT
625*4882a593Smuzhiyun
626*4882a593Smuzhiyun	bf/s	2b		! 111 BR
627*4882a593Smuzhiyun
628*4882a593Smuzhiyun	 mov.w	r1,@-r0		!  29 LS
629*4882a593Smuzhiyun
630*4882a593Smuzhiyun	! Copy the cache line aligned blocks
631*4882a593Smuzhiyun	!
632*4882a593Smuzhiyun	! In use: r0, r2, r4, r5 (=r5-2)
633*4882a593Smuzhiyun	! Scratch: r1, r3, r6, r7
634*4882a593Smuzhiyun	!
635*4882a593Smuzhiyun	! We could do this with the four scratch registers, but if src
636*4882a593Smuzhiyun	! and dest hit the same cache line, this will thrash, so make
637*4882a593Smuzhiyun	! use of additional registers.
638*4882a593Smuzhiyun	!
639*4882a593Smuzhiyun	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
640*4882a593Smuzhiyun	!   r5:	 src (was r0+r5)
641*4882a593Smuzhiyun	!   r1:	 dest (was r0)
642*4882a593Smuzhiyun	! this can be reversed at the end, so we don't need to save any extra
643*4882a593Smuzhiyun	! state.
644*4882a593Smuzhiyun	!
	/*
	 * r8-r12 are pushed interleaved with the pointer setup.
	 * "add r0, r5" turns the offset into a real source pointer; with the
	 * -2 already applied and the further -0x1e it ends up 0x20 below the
	 * source location matching r0.
	 */
645*4882a593Smuzhiyun1:	mov.l	r8, @-r15	!  30 LS
646*4882a593Smuzhiyun	add	r0, r5		!  49 EX
647*4882a593Smuzhiyun
648*4882a593Smuzhiyun	mov.l	r9, @-r15	!  30 LS
649*4882a593Smuzhiyun	mov	r0, r1		!   5 MT (latency=0)
650*4882a593Smuzhiyun
651*4882a593Smuzhiyun	mov.l	r10, @-r15	!  30 LS
652*4882a593Smuzhiyun	add	#-0x1e, r5	!  50 EX
653*4882a593Smuzhiyun
654*4882a593Smuzhiyun	mov.l	r11, @-r15	!  30 LS
655*4882a593Smuzhiyun
656*4882a593Smuzhiyun	mov.l	r12, @-r15	!  30 LS
657*4882a593Smuzhiyun
658*4882a593Smuzhiyun	! 17 cycles, 32 bytes per iteration
659*4882a593Smuzhiyun#ifdef CONFIG_CPU_LITTLE_ENDIAN
	/*
	 * Little endian: one 16-bit word, seven longs and a final 16-bit
	 * word are read with post-increment (32 bytes), then r5 is pulled
	 * back by 0x40 for a net step of -0x20.  "xtrct Rm, Rn" leaves
	 * (Rm << 16) | (Rn >> 16) in Rn, so each destination long is built
	 * from the upper half of one register and the lower half of the
	 * next.  r1 is lowered to the block base first; the first long is
	 * stored with movca.l (data taken from r0).
	 */
660*4882a593Smuzhiyun2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
661*4882a593Smuzhiyun	add	#-0x20, r1	!  50 EX
662*4882a593Smuzhiyun
663*4882a593Smuzhiyun	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK
664*4882a593Smuzhiyun
665*4882a593Smuzhiyun	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
666*4882a593Smuzhiyun	shll16	r0		! 103 EX			JI..
667*4882a593Smuzhiyun
668*4882a593Smuzhiyun	mov.l	@r5+, r7	!  15 LS (latency=2)
669*4882a593Smuzhiyun	xtrct	r3, r0		!  48 EX			LKJI
670*4882a593Smuzhiyun
671*4882a593Smuzhiyun	mov.l	@r5+, r8	!  15 LS (latency=2)
672*4882a593Smuzhiyun	xtrct	r6, r3		!  48 EX			PONM
673*4882a593Smuzhiyun
674*4882a593Smuzhiyun	mov.l	@r5+, r9	!  15 LS (latency=2)
675*4882a593Smuzhiyun	xtrct	r7, r6		!  48 EX
676*4882a593Smuzhiyun
677*4882a593Smuzhiyun	mov.l	@r5+, r10	!  15 LS (latency=2)
678*4882a593Smuzhiyun	xtrct	r8, r7		!  48 EX
679*4882a593Smuzhiyun
680*4882a593Smuzhiyun	mov.l	@r5+, r11	!  15 LS (latency=2)
681*4882a593Smuzhiyun	xtrct	r9, r8		!  48 EX
682*4882a593Smuzhiyun
683*4882a593Smuzhiyun	mov.w	@r5+, r12	!  15 LS (latency=2)
684*4882a593Smuzhiyun	xtrct	r10, r9		!  48 EX
685*4882a593Smuzhiyun
686*4882a593Smuzhiyun	movca.l	r0,@r1		!  40 LS (latency=3-7)
687*4882a593Smuzhiyun	xtrct	r11, r10	!  48 EX
688*4882a593Smuzhiyun
689*4882a593Smuzhiyun	mov.l	r3, @(0x04,r1)	!  33 LS
690*4882a593Smuzhiyun	xtrct	r12, r11	!  48 EX
691*4882a593Smuzhiyun
692*4882a593Smuzhiyun	mov.l	r6, @(0x08,r1)	!  33 LS
693*4882a593Smuzhiyun
694*4882a593Smuzhiyun	mov.l	r7, @(0x0c,r1)	!  33 LS
695*4882a593Smuzhiyun
696*4882a593Smuzhiyun	mov.l	r8, @(0x10,r1)	!  33 LS
697*4882a593Smuzhiyun	add	#-0x40, r5	!  50 EX
698*4882a593Smuzhiyun
699*4882a593Smuzhiyun	mov.l	r9, @(0x14,r1)	!  33 LS
700*4882a593Smuzhiyun	cmp/eq	r2,r1		!  54 MT
701*4882a593Smuzhiyun
702*4882a593Smuzhiyun	mov.l	r10, @(0x18,r1)	!  33 LS
703*4882a593Smuzhiyun	bf/s	2b		! 109 BR
704*4882a593Smuzhiyun
705*4882a593Smuzhiyun	 mov.l	r11, @(0x1c,r1)	!  33 LS
706*4882a593Smuzhiyun#else
	/*
	 * Big endian: the loads use fixed displacements and run from the
	 * top of the source block downwards (r5 is lowered by 2 after the
	 * first word and by 0x1e later, net -0x20).  r1 is first lowered by
	 * 4 so that movca.l writes the highest long of the destination
	 * block, then by 0x1c to the block base for the remaining stores,
	 * which use descending displacements.
	 */
707*4882a593Smuzhiyun2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
708*4882a593Smuzhiyun	add	#-2, r5		!  50 EX
709*4882a593Smuzhiyun
710*4882a593Smuzhiyun	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
711*4882a593Smuzhiyun	add	#-4, r1		!  50 EX
712*4882a593Smuzhiyun
713*4882a593Smuzhiyun	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
714*4882a593Smuzhiyun	shll16	r0		! 103 EX
715*4882a593Smuzhiyun
716*4882a593Smuzhiyun	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
717*4882a593Smuzhiyun	xtrct	r3, r0		!  48 EX
718*4882a593Smuzhiyun
719*4882a593Smuzhiyun	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
720*4882a593Smuzhiyun	xtrct	r6, r3		!  48 EX
721*4882a593Smuzhiyun
722*4882a593Smuzhiyun	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
723*4882a593Smuzhiyun	xtrct	r7, r6		!  48 EX
724*4882a593Smuzhiyun
725*4882a593Smuzhiyun	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
726*4882a593Smuzhiyun	xtrct	r8, r7		!  48 EX
727*4882a593Smuzhiyun
728*4882a593Smuzhiyun	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
729*4882a593Smuzhiyun	xtrct	r9, r8		!  48 EX
730*4882a593Smuzhiyun
731*4882a593Smuzhiyun	mov.l   @(0x00,r5), r12 !  18 LS (latency=2)
732*4882a593Smuzhiyun    	xtrct	r10, r9		!  48 EX
733*4882a593Smuzhiyun
734*4882a593Smuzhiyun	movca.l	r0,@r1		!  40 LS (latency=3-7)
735*4882a593Smuzhiyun	add	#-0x1c, r1	!  50 EX
736*4882a593Smuzhiyun
737*4882a593Smuzhiyun	mov.l	r3, @(0x18,r1)	!  33 LS
738*4882a593Smuzhiyun	xtrct	r11, r10	!  48 EX
739*4882a593Smuzhiyun
740*4882a593Smuzhiyun	mov.l	r6, @(0x14,r1)	!  33 LS
741*4882a593Smuzhiyun	xtrct	r12, r11	!  48 EX
742*4882a593Smuzhiyun
743*4882a593Smuzhiyun	mov.l	r7, @(0x10,r1)	!  33 LS
744*4882a593Smuzhiyun
745*4882a593Smuzhiyun	mov.l	r8, @(0x0c,r1)	!  33 LS
746*4882a593Smuzhiyun	add	#-0x1e, r5	!  50 EX
747*4882a593Smuzhiyun
748*4882a593Smuzhiyun	mov.l	r9, @(0x08,r1)	!  33 LS
749*4882a593Smuzhiyun	cmp/eq	r2,r1		!  54 MT
750*4882a593Smuzhiyun
751*4882a593Smuzhiyun	mov.l	r10, @(0x04,r1)	!  33 LS
752*4882a593Smuzhiyun	bf/s	2b		! 109 BR
753*4882a593Smuzhiyun
754*4882a593Smuzhiyun	 mov.l	r11, @(0x00,r1)	!  33 LS
755*4882a593Smuzhiyun#endif
756*4882a593Smuzhiyun
	/*
	 * Back to pointer/offset form: r0 = store pointer again and
	 * r5 = (src - dst) - 0x20.  r12-r8 are popped in reverse order,
	 * interleaved with the "anything left?" test.
	 */
757*4882a593Smuzhiyun	mov.l	@r15+, r12
758*4882a593Smuzhiyun	mov	r1, r0		!   5 MT (latency=0)
759*4882a593Smuzhiyun
760*4882a593Smuzhiyun	mov.l	@r15+, r11	!  15 LS
761*4882a593Smuzhiyun	sub	r1, r5		!  75 EX
762*4882a593Smuzhiyun
763*4882a593Smuzhiyun	mov.l	@r15+, r10	!  15 LS
764*4882a593Smuzhiyun	cmp/eq	r4, r0		!  54 MT
765*4882a593Smuzhiyun
766*4882a593Smuzhiyun	bf/s	1f		! 109 BR
767*4882a593Smuzhiyun	 mov.l	 @r15+, r9	!  15 LS
768*4882a593Smuzhiyun
	/*
	 * The pop of r8 is both the delay slot of this rts (copy finished)
	 * and branch target 1: (bytes remain), so r8 is restored on either
	 * path.
	 */
769*4882a593Smuzhiyun	rts
770*4882a593Smuzhiyun1:	 mov.l	@r15+, r8	!  15 LS
771*4882a593Smuzhiyun
	/* r5 = (src - dst) - 2, as label 10: requires */
772*4882a593Smuzhiyun	add	#0x1e, r5	!  50 EX
773*4882a593Smuzhiyun
774*4882a593Smuzhiyun	! Finish off a short word at a time
775*4882a593Smuzhiyun	! r5 must be invariant - 2
	/*
	 * Skip the word loop unless r0 > r4 + 1.  Inside the loop the bound
	 * is r4 + 3 (r2 raised by 2 in the delay slot) and is tested before
	 * the delay-slot store of each word.
	 */
776*4882a593Smuzhiyun10:	mov	r4,r2		!   5 MT (latency=0)
777*4882a593Smuzhiyun	add	#1,r2		!  50 EX
778*4882a593Smuzhiyun
779*4882a593Smuzhiyun	cmp/hi	r2, r0		!  57 MT
780*4882a593Smuzhiyun	bf/s	1f		! 109 BR
781*4882a593Smuzhiyun
782*4882a593Smuzhiyun	 add	#2, r2		!  50 EX
783*4882a593Smuzhiyun
784*4882a593Smuzhiyun3:	mov.w	@(r0,r5),r1	!  20 LS
785*4882a593Smuzhiyun	cmp/hi	r2,r0		!  57 MT
786*4882a593Smuzhiyun
787*4882a593Smuzhiyun	bt/s	3b		! 109 BR
788*4882a593Smuzhiyun
789*4882a593Smuzhiyun	 mov.w	r1,@-r0		!  29 LS
790*4882a593Smuzhiyun1:
791*4882a593Smuzhiyun
792*4882a593Smuzhiyun	!
793*4882a593Smuzhiyun	! Finally, copy the last byte if necessary
	/*
	 * If r0 equals r4 nothing is left: return through the nearest
	 * earlier "9: rts".  Otherwise r5 (raised to (src - dst) - 1 in the
	 * delay slot) addresses the final byte, which is stored in the
	 * delay slot of the rts.
	 */
794*4882a593Smuzhiyun	cmp/eq	r4,r0		!  54 MT
795*4882a593Smuzhiyun	bt/s	9b
796*4882a593Smuzhiyun	 add	#1,r5
797*4882a593Smuzhiyun	mov.b	@(r0,r5),r1
798*4882a593Smuzhiyun	rts
799*4882a593Smuzhiyun	 mov.b	r1,@-r0
800*4882a593Smuzhiyun
801