/* xref: /OK3568_Linux_fs/kernel/arch/csky/abiv1/memcpy.S (revision 4882a59341e53eb6f0b4789bf948001014eff981) */
/* SPDX-License-Identifier: GPL-2.0 */
// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.

#include <linux/linkage.h>

/*
 * GET_FRONT_BITS rx, y
 *
 * Shift register \rx in place by the immediate bit count \y: logical right
 * when __cskyLE__ is defined, logical left otherwise.
 *
 * memcpy applies this to the source word it loaded previously.  The shift
 * discards the bytes of that word which were already consumed and moves the
 * remaining bytes to the end of the register that is stored at the lowest
 * address, i.e. the front of the destination word being assembled.
 *
 *   rx  register to shift (modified)
 *   y   shift amount in bits; memcpy passes 8, 16 or 24
 */
.macro	GET_FRONT_BITS rx y
#ifdef	__cskyLE__
	lsri	\rx, \y
#else
	lsli	\rx, \y
#endif
.endm

/*
 * GET_AFTER_BITS rx, y
 *
 * Counterpart of GET_FRONT_BITS: shift register \rx in place by the
 * immediate bit count \y, logical left when __cskyLE__ is defined, logical
 * right otherwise.
 *
 * memcpy applies this to the source word it has just loaded.  The shift
 * moves the leading bytes of that word to the end of the register that is
 * stored at the highest address, so that an "or" with the GET_FRONT_BITS
 * result yields one complete destination word.
 *
 *   rx  register to shift (modified)
 *   y   shift amount in bits; memcpy passes 24, 16 or 8 (32 minus the
 *       amount given to the matching GET_FRONT_BITS)
 */
.macro	GET_AFTER_BITS rx y
#ifdef	__cskyLE__
	lsli	\rx, \y
#else
	lsri	\rx, \y
#endif
.endm

/*
 * void *memcpy(void *dest, const void *src, size_t n);
 *
 * Register use, as established by the code below:
 *   r2          dest on entry; never written, so it still holds dest when
 *               the routine returns
 *   r3          source cursor
 *   r4          number of bytes still to copy
 *   r7          destination cursor (working copy of r2)
 *   r1, r5, r6  scratch
 *   r8 - r11    additional scratch for the 16-byte loops; spilled to the
 *               stack before those loops and reloaded after them
 *
 * Strategy:
 *   - n < 4: plain byte loop.
 *   - dest not word aligned: copy single bytes until it is.
 *   - dest and src word aligned: 16-byte blocks, then words, then bytes.
 *   - dest aligned, src off by 1, 2 or 3 bytes: read aligned source words
 *     and splice every destination word out of two neighbouring source
 *     words with GET_FRONT_BITS / GET_AFTER_BITS.  While doing so r3 runs
 *     ahead of the real source position by (4 - misalignment) bytes and is
 *     rewound before the trailing bytes are copied.
 *   - When the two buffers are less than n bytes apart the unaligned paths
 *     hand the whole job to the byte loop.
 */
ENTRY(memcpy)
	mov	r7, r2			/* r7 = dest cursor, r2 stays intact */
	cmplti	r4, 4
	bt	.L_copy_by_byte		/* fewer than 4 bytes */
	mov	r6, r2
	andi	r6, 3			/* r6 = dest & 3 */
	cmpnei	r6, 0
	jbt	.L_dest_not_aligned
	mov	r6, r3
	andi	r6, 3			/* r6 = src & 3 */
	cmpnei	r6, 0
	jbt	.L_dest_aligned_but_src_not_aligned

	/* dest and src are both word aligned from here on. */
.L0:
	cmplti	r4, 16
	jbt	.L_aligned_and_len_less_16bytes
	subi	sp, 8			/* r8 is needed as a third data register */
	stw	r8, (sp, 0)
.L_aligned_and_len_larger_16bytes:
	ldw	r1, (r3, 0)
	ldw	r5, (r3, 4)
	ldw	r8, (r3, 8)
	stw	r1, (r7, 0)
	ldw	r1, (r3, 12)		/* r1 is free again after the store above */
	stw	r5, (r7, 4)
	stw	r8, (r7, 8)
	stw	r1, (r7, 12)
	subi	r4, 16
	addi	r3, 16
	addi	r7, 16
	cmplti	r4, 16
	jbf	.L_aligned_and_len_larger_16bytes
	ldw	r8, (sp, 0)
	addi	sp, 8
	cmpnei	r4, 0
	jbf	.L_return		/* length was a multiple of 16 */

.L_aligned_and_len_less_16bytes:
	cmplti	r4, 4
	bt	.L_copy_by_byte
.L1:					/* one word per iteration */
	ldw	r1, (r3, 0)
	stw	r1, (r7, 0)
	subi	r4, 4
	addi	r3, 4
	addi	r7, 4
	cmplti	r4, 4
	jbf	.L1
	br	.L_copy_by_byte

.L_return:
	rts

.L_copy_by_byte:			/* r4 bytes from r3 to r7, r4 may be 0 */
	cmpnei	r4, 0
	jbf	.L_return
.L4:
	ldb	r1, (r3, 0)
	stb	r1, (r7, 0)
	addi	r3, 1
	addi	r7, 1
	decne	r4			/* loop until the count reaches zero */
	jbt	.L4
	rts

/*
 * If dest is not aligned, just copying some bytes makes the dest align.
 * After that, we judge whether the src is aligned.
 */
.L_dest_not_aligned:
	/* Buffers closer together than n bytes: do everything by byte. */
	mov	r5, r3
	rsub	r5, r5, r7
	abs	r5, r5			/* r5 = distance between src and dest */
	cmplt	r5, r4
	bt	.L_copy_by_byte
	mov	r5, r7
	sub	r5, r3			/* r5 = dest - src */
	cmphs	r5, r4
	bf	.L_copy_by_byte
	mov	r5, r6			/* r5 = dest & 3, counted up to 4 below */
.L5:
	ldb	r1, (r3, 0)		/* makes the dest align. */
	stb	r1, (r7, 0)
	addi	r5, 1
	subi	r4, 1
	addi	r3, 1
	addi	r7, 1
	cmpnei	r5, 4
	jbt	.L5
	cmplti	r4, 4
	jbt	.L_copy_by_byte
	mov	r6, r3			/* judge whether the src is aligned. */
	andi	r6, 3
	cmpnei	r6, 0
	jbf	.L0

/* Judge the number of misaligned, 1, 2, 3? (r6 = src & 3) */
.L_dest_aligned_but_src_not_aligned:
	/* Buffers closer together than n bytes: do everything by byte. */
	mov	r5, r3
	rsub	r5, r5, r7
	abs	r5, r5
	cmplt	r5, r4
	bt	.L_copy_by_byte
	bclri	r3, 0			/* round r3 down to a word boundary */
	bclri	r3, 1
	ldw	r1, (r3, 0)		/* r1 = word holding the first source bytes */
	addi	r3, 4			/* r3 = next aligned source word */
	cmpnei	r6, 2
	bf	.L_dest_aligned_but_src_not_aligned_2bytes
	cmpnei	r6, 3
	bf	.L_dest_aligned_but_src_not_aligned_3bytes

.L_dest_aligned_but_src_not_aligned_1byte:
	/*
	 * Conservative distance check: use the byte loop when dest - r3 is
	 * below n (unsigned).  r3 already points 3 bytes past the real source
	 * position here, so the bail-out must go through .L13, which rewinds
	 * r3 before entering the byte loop.
	 */
	mov	r5, r7
	sub	r5, r3
	cmphs	r5, r4
	bf	.L13
	cmplti	r4, 16
	bf	.L11
.L10:					/* If the len is less than 16 bytes */
	GET_FRONT_BITS r1 8		/* 3 unconsumed bytes of the previous word */
	mov	r5, r1
	ldw	r6, (r3, 0)
	mov	r1, r6			/* keep the raw word for the next round */
	GET_AFTER_BITS r6 24		/* first byte of the new word */
	or	r5, r6
	stw	r5, (r7, 0)
	subi	r4, 4
	addi	r3, 4
	addi	r7, 4
	cmplti	r4, 4
	bf	.L10
.L13:
	subi	r3, 3			/* back to the real source position */
	br	.L_copy_by_byte
.L11:
	subi	sp, 16
	stw	r8, (sp, 0)
	stw	r9, (sp, 4)
	stw	r10, (sp, 8)
	stw	r11, (sp, 12)
.L12:					/* 16 bytes per iteration, r1 = previous word */
	ldw	r5, (r3, 0)
	ldw	r11, (r3, 4)
	ldw	r8, (r3, 8)
	ldw	r9, (r3, 12)

	GET_FRONT_BITS r1 8		/* shift direction depends on endianness */
	mov	r10, r5
	GET_AFTER_BITS r5 24
	or	r5, r1

	GET_FRONT_BITS r10 8
	mov	r1, r11
	GET_AFTER_BITS r11 24
	or	r11, r10

	GET_FRONT_BITS r1 8
	mov	r10, r8
	GET_AFTER_BITS r8 24
	or	r8, r1

	GET_FRONT_BITS r10 8
	mov	r1, r9			/* last raw word carries into the next round */
	GET_AFTER_BITS r9 24
	or	r9, r10

	stw	r5, (r7, 0)
	stw	r11, (r7, 4)
	stw	r8, (r7, 8)
	stw	r9, (r7, 12)
	subi	r4, 16
	addi	r3, 16
	addi	r7, 16
	cmplti	r4, 16
	jbf	.L12
	ldw	r8, (sp, 0)
	ldw	r9, (sp, 4)
	ldw	r10, (sp, 8)
	ldw	r11, (sp, 12)
	addi	sp, 16
	cmplti	r4, 4
	bf	.L10			/* at least one more full word */
	subi	r3, 3			/* back to the real source position */
	br	.L_copy_by_byte

.L_dest_aligned_but_src_not_aligned_2bytes:
	cmplti	r4, 16
	bf	.L21
.L20:					/* one word per iteration, r1 = previous word */
	GET_FRONT_BITS r1 16		/* 2 unconsumed bytes of the previous word */
	mov	r5, r1
	ldw	r6, (r3, 0)
	mov	r1, r6
	GET_AFTER_BITS r6 16		/* first 2 bytes of the new word */
	or	r5, r6
	stw	r5, (r7, 0)
	subi	r4, 4
	addi	r3, 4
	addi	r7, 4
	cmplti	r4, 4
	bf	.L20
	subi	r3, 2			/* back to the real source position */
	br	.L_copy_by_byte

.L21:	/* n >= 16 */
	subi	sp, 16
	stw	r8, (sp, 0)
	stw	r9, (sp, 4)
	stw	r10, (sp, 8)
	stw	r11, (sp, 12)

.L22:					/* 16 bytes per iteration, r1 = previous word */
	ldw	r5, (r3, 0)
	ldw	r11, (r3, 4)
	ldw	r8, (r3, 8)
	ldw	r9, (r3, 12)

	GET_FRONT_BITS r1 16
	mov	r10, r5
	GET_AFTER_BITS r5 16
	or	r5, r1

	GET_FRONT_BITS r10 16
	mov	r1, r11
	GET_AFTER_BITS r11 16
	or	r11, r10

	GET_FRONT_BITS r1 16
	mov	r10, r8
	GET_AFTER_BITS r8 16
	or	r8, r1

	GET_FRONT_BITS r10 16
	mov	r1, r9			/* last raw word carries into the next round */
	GET_AFTER_BITS r9 16
	or	r9, r10

	stw	r5, (r7, 0)
	stw	r11, (r7, 4)
	stw	r8, (r7, 8)
	stw	r9, (r7, 12)
	subi	r4, 16
	addi	r3, 16
	addi	r7, 16
	cmplti	r4, 16
	jbf	.L22
	ldw	r8, (sp, 0)
	ldw	r9, (sp, 4)
	ldw	r10, (sp, 8)
	ldw	r11, (sp, 12)
	addi	sp, 16
	cmplti	r4, 4
	bf	.L20			/* at least one more full word */
	subi	r3, 2			/* back to the real source position */
	br	.L_copy_by_byte


.L_dest_aligned_but_src_not_aligned_3bytes:
	cmplti	r4, 16
	bf	.L31
.L30:					/* one word per iteration, r1 = previous word */
	GET_FRONT_BITS r1 24		/* 1 unconsumed byte of the previous word */
	mov	r5, r1
	ldw	r6, (r3, 0)
	mov	r1, r6
	GET_AFTER_BITS r6 8		/* first 3 bytes of the new word */
	or	r5, r6
	stw	r5, (r7, 0)
	subi	r4, 4
	addi	r3, 4
	addi	r7, 4
	cmplti	r4, 4
	bf	.L30
	subi	r3, 1			/* back to the real source position */
	br	.L_copy_by_byte
.L31:
	subi	sp, 16
	stw	r8, (sp, 0)
	stw	r9, (sp, 4)
	stw	r10, (sp, 8)
	stw	r11, (sp, 12)
.L32:					/* 16 bytes per iteration, r1 = previous word */
	ldw	r5, (r3, 0)
	ldw	r11, (r3, 4)
	ldw	r8, (r3, 8)
	ldw	r9, (r3, 12)

	GET_FRONT_BITS r1 24
	mov	r10, r5
	GET_AFTER_BITS r5 8
	or	r5, r1

	GET_FRONT_BITS r10 24
	mov	r1, r11
	GET_AFTER_BITS r11 8
	or	r11, r10

	GET_FRONT_BITS r1 24
	mov	r10, r8
	GET_AFTER_BITS r8 8
	or	r8, r1

	GET_FRONT_BITS r10 24
	mov	r1, r9			/* last raw word carries into the next round */
	GET_AFTER_BITS r9 8
	or	r9, r10

	stw	r5, (r7, 0)
	stw	r11, (r7, 4)
	stw	r8, (r7, 8)
	stw	r9, (r7, 12)
	subi	r4, 16
	addi	r3, 16
	addi	r7, 16
	cmplti	r4, 16
	jbf	.L32
	ldw	r8, (sp, 0)
	ldw	r9, (sp, 4)
	ldw	r10, (sp, 8)
	ldw	r11, (sp, 12)
	addi	sp, 16
	cmplti	r4, 4
	bf	.L30			/* at least one more full word */
	subi	r3, 1			/* back to the real source position */
	br	.L_copy_by_byte
ENDPROC(memcpy)