xref: /OK3568_Linux_fs/kernel/arch/alpha/lib/ev6-memcpy.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0 */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * arch/alpha/lib/ev6-memcpy.S
4*4882a593Smuzhiyun * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
5*4882a593Smuzhiyun *
6*4882a593Smuzhiyun * Reasonably optimized memcpy() routine for the Alpha 21264
7*4882a593Smuzhiyun *
8*4882a593Smuzhiyun *	- bulk data is accessed as aligned quadwords (ldq/stq, ldq_u for a
9*4882a593Smuzhiyun *	  misaligned source); leading and trailing bytes use ldbu/stb
10*4882a593Smuzhiyun *
11*4882a593Smuzhiyun * Much of the information about 21264 scheduling/coding comes from:
12*4882a593Smuzhiyun *	Compiler Writer's Guide for the Alpha 21264
13*4882a593Smuzhiyun *	abbreviated as 'CWG' in other comments here
14*4882a593Smuzhiyun *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
15*4882a593Smuzhiyun * Scheduling notation:
16*4882a593Smuzhiyun *	E	- either cluster
17*4882a593Smuzhiyun *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
18*4882a593Smuzhiyun *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
19*4882a593Smuzhiyun *
20*4882a593Smuzhiyun * Temp usage notes:
21*4882a593Smuzhiyun *	$1-$7		- scratch
22*4882a593Smuzhiyun */
23*4882a593Smuzhiyun#include <asm/export.h>
24*4882a593Smuzhiyun	.set noreorder
25*4882a593Smuzhiyun	.set noat
26*4882a593Smuzhiyun
27*4882a593Smuzhiyun	.align	4
/*
 * memcpy - copy $18 bytes from the source at $17 to the destination at $16.
 *
 * Inputs:
 *	$16	- destination pointer
 *	$17	- source pointer
 *	$18	- byte count, tested as a signed value; a count <= 0 copies
 *		  nothing
 * Output:
 *	$0	- the original destination pointer
 * Registers written:
 *	$0-$7 and $16-$18.  The frame size declared by .frame is 0 and the
 *	routine returns through $26.
 *
 * Strategy:
 *	1. Source and dest have the same address mod 8: copy bytes until both
 *	   are 8-byte aligned, then copy quadwords.  When 128 or more bytes
 *	   remain at that point, dest is first brought to 64-byte alignment
 *	   and an unrolled loop moves 64 bytes per trip, issuing a wh64 hint
 *	   for the destination.
 *	2. Addresses differ mod 8: copy bytes until dest is 8-byte aligned,
 *	   then read the source with ldq_u, merge neighbouring quadwords with
 *	   extql/extqh/bis and store aligned quadwords.
 *	Both paths finish with a byte-at-a-time tail.
 *
 * The instructions are laid out in groups of four with padding nops; the
 * letters in the trailing comments use the scheduling notation explained at
 * the top of this file.
 */
28*4882a593Smuzhiyun	.globl memcpy
29*4882a593Smuzhiyun	.ent memcpy
30*4882a593Smuzhiyunmemcpy:
31*4882a593Smuzhiyun	.frame $30,0,$26,0
32*4882a593Smuzhiyun	.prologue 0
33*4882a593Smuzhiyun
34*4882a593Smuzhiyun	mov	$16, $0			# E : copy dest to return
35*4882a593Smuzhiyun	ble	$18, $nomoredata	# U : done with the copy?
36*4882a593Smuzhiyun	xor	$16, $17, $1		# E : are source and dest alignments the same?
37*4882a593Smuzhiyun	and	$1, 7, $1		# E : are they the same mod 8?
38*4882a593Smuzhiyun
39*4882a593Smuzhiyun	bne	$1, $misaligned		# U : Nope - gotta do this the slow way
40*4882a593Smuzhiyun	/* source and dest are same mod 8 address */
41*4882a593Smuzhiyun	and	$16, 7, $1		# E : Are both 0mod8?
42*4882a593Smuzhiyun	beq	$1, $both_0mod8		# U : Yes
43*4882a593Smuzhiyun	nop				# E :
44*4882a593Smuzhiyun
45*4882a593Smuzhiyun	/*
46*4882a593Smuzhiyun	 * source and dest are same misalignment.  move a byte at a time
47*4882a593Smuzhiyun	 * until a 0mod8 alignment for both is reached.
48*4882a593Smuzhiyun	 * At least one byte more to move
49*4882a593Smuzhiyun	 */
50*4882a593Smuzhiyun
51*4882a593Smuzhiyun$head_align:
	/*
	 * The count test comes before the alignment test, so a short copy
	 * can finish inside this loop without ever reaching 0mod8.
	 */
52*4882a593Smuzhiyun	ldbu	$1, 0($17)		# L : grab a byte
53*4882a593Smuzhiyun	subq	$18, 1, $18		# E : count--
54*4882a593Smuzhiyun	addq	$17, 1, $17		# E : src++
55*4882a593Smuzhiyun	stb	$1, 0($16)		# L : store the byte
56*4882a593Smuzhiyun	addq	$16, 1, $16		# E : dest++
57*4882a593Smuzhiyun	and	$16, 7, $1		# E : Are we at 0mod8 yet?
58*4882a593Smuzhiyun	ble	$18, $nomoredata	# U : done with the copy?
59*4882a593Smuzhiyun	bne	$1, $head_align		# U :
60*4882a593Smuzhiyun
	/*
	 * Both pointers are 8-byte aligned from here on.  With 127 bytes or
	 * fewer left the unrolled loop is skipped entirely.
	 */
61*4882a593Smuzhiyun$both_0mod8:
62*4882a593Smuzhiyun	cmple	$18, 127, $1		# E : Can we unroll the loop?
63*4882a593Smuzhiyun	bne	$1, $no_unroll		# U :
64*4882a593Smuzhiyun	and	$16, 63, $1		# E : get mod64 alignment
65*4882a593Smuzhiyun	beq	$1, $do_unroll		# U : no single quads to fiddle
66*4882a593Smuzhiyun
	/*
	 * At least 128 bytes remain here.  Copy single quads until dest is
	 * 64-byte aligned; that takes at most seven quads (56 bytes), so the
	 * count cannot run out inside this loop and is not tested.
	 */
67*4882a593Smuzhiyun$single_head_quad:
68*4882a593Smuzhiyun	ldq	$1, 0($17)		# L : get 8 bytes
69*4882a593Smuzhiyun	subq	$18, 8, $18		# E : count -= 8
70*4882a593Smuzhiyun	addq	$17, 8, $17		# E : src += 8
71*4882a593Smuzhiyun	nop				# E :
72*4882a593Smuzhiyun
73*4882a593Smuzhiyun	stq	$1, 0($16)		# L : store
74*4882a593Smuzhiyun	addq	$16, 8, $16		# E : dest += 8
75*4882a593Smuzhiyun	and	$16, 63, $1		# E : get mod64 alignment
76*4882a593Smuzhiyun	bne	$1, $single_head_quad	# U : still not fully aligned
77*4882a593Smuzhiyun
	/*
	 * $7 is the address handed to wh64: the 64-byte block following the
	 * one the first trip writes.  The count is tested again because the
	 * head quads above may have taken it below 128.
	 */
78*4882a593Smuzhiyun$do_unroll:
79*4882a593Smuzhiyun	addq	$16, 64, $7		# E : Initial (+1 trip) wh64 address
80*4882a593Smuzhiyun	cmple	$18, 127, $1		# E : Can we go through the unrolled loop?
81*4882a593Smuzhiyun	bne	$1, $tail_quads		# U : Nope
82*4882a593Smuzhiyun	nop				# E :
83*4882a593Smuzhiyun
	/*
	 * One trip copies 64 bytes as two 32-byte halves, using $6, $4, $5
	 * and $3 as data registers.
	 *
	 * wh64 address bookkeeping within a trip:
	 *	$7 += 64	- normal target for the next trip, one block
	 *			  ahead of what that trip writes
	 *	$1 = $16 + 64	- taken before $16 is advanced, i.e. the block
	 *			  the next trip itself writes
	 *	$2 = $18 - 192	- negative when fewer than two further trips
	 *			  remain after this one
	 * cmovlt then replaces $7 with $1 when $2 is negative, so the next
	 * wh64 does not point beyond the data that will still be written.
	 * NOTE(review): the fallback presumably exists because a wh64 aimed
	 * past the end of the destination would be unsafe - confirm against
	 * the Alpha architecture manual.
	 *
	 * The loop repeats while more than 63 bytes remain.
	 */
84*4882a593Smuzhiyun$unroll_body:
85*4882a593Smuzhiyun	wh64	($7)			# L1 : memory subsystem hint: 64 bytes at
86*4882a593Smuzhiyun					# ($7) are about to be over-written
87*4882a593Smuzhiyun	ldq	$6, 0($17)		# L0 : bytes 0..7
88*4882a593Smuzhiyun	nop				# E :
89*4882a593Smuzhiyun	nop				# E :
90*4882a593Smuzhiyun
91*4882a593Smuzhiyun	ldq	$4, 8($17)		# L : bytes 8..15
92*4882a593Smuzhiyun	ldq	$5, 16($17)		# L : bytes 16..23
93*4882a593Smuzhiyun	addq	$7, 64, $7		# E : Update next wh64 address
94*4882a593Smuzhiyun	nop				# E :
95*4882a593Smuzhiyun
96*4882a593Smuzhiyun	ldq	$3, 24($17)		# L : bytes 24..31
97*4882a593Smuzhiyun	addq	$16, 64, $1		# E : fallback value for wh64
98*4882a593Smuzhiyun	nop				# E :
99*4882a593Smuzhiyun	nop				# E :
100*4882a593Smuzhiyun
101*4882a593Smuzhiyun	addq	$17, 32, $17		# E : src += 32 bytes
102*4882a593Smuzhiyun	stq	$6, 0($16)		# L : bytes 0..7
103*4882a593Smuzhiyun	nop				# E :
104*4882a593Smuzhiyun	nop				# E :
105*4882a593Smuzhiyun
106*4882a593Smuzhiyun	stq	$4, 8($16)		# L : bytes 8..15
107*4882a593Smuzhiyun	stq	$5, 16($16)		# L : bytes 16..23
108*4882a593Smuzhiyun	subq	$18, 192, $2		# E : At least two more trips to go?
109*4882a593Smuzhiyun	nop				# E :
110*4882a593Smuzhiyun
111*4882a593Smuzhiyun	stq	$3, 24($16)		# L : bytes 24..31
112*4882a593Smuzhiyun	addq	$16, 32, $16		# E : dest += 32 bytes
113*4882a593Smuzhiyun	nop				# E :
114*4882a593Smuzhiyun	nop				# E :
115*4882a593Smuzhiyun
	/* Second 32-byte half; $17 was already advanced past the first. */
116*4882a593Smuzhiyun	ldq	$6, 0($17)		# L : bytes 0..7
117*4882a593Smuzhiyun	ldq	$4, 8($17)		# L : bytes 8..15
118*4882a593Smuzhiyun	cmovlt	$2, $1, $7		# E : Latency 2, extra map slot - Use
119*4882a593Smuzhiyun					# fallback wh64 address if < 2 more trips
120*4882a593Smuzhiyun	nop				# E :
121*4882a593Smuzhiyun
122*4882a593Smuzhiyun	ldq	$5, 16($17)		# L : bytes 16..23
123*4882a593Smuzhiyun	ldq	$3, 24($17)		# L : bytes 24..31
124*4882a593Smuzhiyun	addq	$16, 32, $16		# E : dest += 32
125*4882a593Smuzhiyun	subq	$18, 64, $18		# E : count -= 64
126*4882a593Smuzhiyun
	/* dest was advanced before these stores, hence the negative offsets. */
127*4882a593Smuzhiyun	addq	$17, 32, $17		# E : src += 32
128*4882a593Smuzhiyun	stq	$6, -32($16)		# L : bytes 0..7
129*4882a593Smuzhiyun	stq	$4, -24($16)		# L : bytes 8..15
130*4882a593Smuzhiyun	cmple	$18, 63, $1		# E : At least one more trip?
131*4882a593Smuzhiyun
132*4882a593Smuzhiyun	stq	$5, -16($16)		# L : bytes 16..23
133*4882a593Smuzhiyun	stq	$3, -8($16)		# L : bytes 24..31
134*4882a593Smuzhiyun	nop				# E :
135*4882a593Smuzhiyun	beq	$1, $unroll_body	# U : yes - go around again
136*4882a593Smuzhiyun
	/*
	 * At most 127 bytes remain, both pointers still 8-byte aligned.
	 * While in the quad loop $18 is kept biased by -8, so its sign alone
	 * tells whether another full quad is left; $less_than_8 removes the
	 * bias again.
	 */
137*4882a593Smuzhiyun$tail_quads:
138*4882a593Smuzhiyun$no_unroll:
139*4882a593Smuzhiyun	.align 4
140*4882a593Smuzhiyun	subq	$18, 8, $18		# E : At least a quad left?
141*4882a593Smuzhiyun	blt	$18, $less_than_8	# U : Nope
142*4882a593Smuzhiyun	nop				# E :
143*4882a593Smuzhiyun	nop				# E :
144*4882a593Smuzhiyun
145*4882a593Smuzhiyun$move_a_quad:
146*4882a593Smuzhiyun	ldq	$1, 0($17)		# L : fetch 8
147*4882a593Smuzhiyun	subq	$18, 8, $18		# E : count -= 8
148*4882a593Smuzhiyun	addq	$17, 8, $17		# E : src += 8
149*4882a593Smuzhiyun	nop				# E :
150*4882a593Smuzhiyun
151*4882a593Smuzhiyun	stq	$1, 0($16)		# L : store 8
152*4882a593Smuzhiyun	addq	$16, 8, $16		# E : dest += 8
153*4882a593Smuzhiyun	bge	$18, $move_a_quad	# U :
154*4882a593Smuzhiyun	nop				# E :
155*4882a593Smuzhiyun
	/* After the bias is removed, 0..7 bytes remain. */
156*4882a593Smuzhiyun$less_than_8:
157*4882a593Smuzhiyun	.align 4
158*4882a593Smuzhiyun	addq	$18, 8, $18		# E : add back for trailing bytes
159*4882a593Smuzhiyun	ble	$18, $nomoredata	# U : All-done
160*4882a593Smuzhiyun	nop				# E :
161*4882a593Smuzhiyun	nop				# E :
162*4882a593Smuzhiyun
163*4882a593Smuzhiyun	/* Trailing bytes */
164*4882a593Smuzhiyun$tail_bytes:
165*4882a593Smuzhiyun	subq	$18, 1, $18		# E : count--
166*4882a593Smuzhiyun	ldbu	$1, 0($17)		# L : fetch a byte
167*4882a593Smuzhiyun	addq	$17, 1, $17		# E : src++
168*4882a593Smuzhiyun	nop				# E :
169*4882a593Smuzhiyun
170*4882a593Smuzhiyun	stb	$1, 0($16)		# L : store a byte
171*4882a593Smuzhiyun	addq	$16, 1, $16		# E : dest++
172*4882a593Smuzhiyun	bgt	$18, $tail_bytes	# U : more to be done?
173*4882a593Smuzhiyun	nop				# E :
174*4882a593Smuzhiyun
175*4882a593Smuzhiyun	/* branching to exit takes 3 extra cycles, so replicate exit here */
176*4882a593Smuzhiyun	ret	$31, ($26), 1		# L0 :
177*4882a593Smuzhiyun	nop				# E :
178*4882a593Smuzhiyun	nop				# E :
179*4882a593Smuzhiyun	nop				# E :
180*4882a593Smuzhiyun
	/*
	 * Source and dest differ mod 8.  $16 has not been modified on the way
	 * here, so $0 still equals dest.  The working dest pointer moves to
	 * $4 because $16 is reused as a data register in $mis_quad.
	 */
181*4882a593Smuzhiyun$misaligned:
182*4882a593Smuzhiyun	mov	$0, $4			# E : dest temp
183*4882a593Smuzhiyun	and	$0, 7, $1		# E : dest alignment mod8
184*4882a593Smuzhiyun	beq	$1, $dest_0mod8		# U : life doesnt totally suck
185*4882a593Smuzhiyun	nop
186*4882a593Smuzhiyun
	/*
	 * Byte copy until dest is 8-byte aligned.  The count is tested at the
	 * top of every trip, so the copy may end before alignment is reached.
	 */
187*4882a593Smuzhiyun$aligndest:
188*4882a593Smuzhiyun	ble	$18, $nomoredata	# U :
189*4882a593Smuzhiyun	ldbu	$1, 0($17)		# L : fetch a byte
190*4882a593Smuzhiyun	subq	$18, 1, $18		# E : count--
191*4882a593Smuzhiyun	addq	$17, 1, $17		# E : src++
192*4882a593Smuzhiyun
193*4882a593Smuzhiyun	stb	$1, 0($4)		# L : store it
194*4882a593Smuzhiyun	addq	$4, 1, $4		# E : dest++
195*4882a593Smuzhiyun	and	$4, 7, $1		# E : dest 0mod8 yet?
196*4882a593Smuzhiyun	bne	$1, $aligndest		# U : go until we are aligned.
197*4882a593Smuzhiyun
198*4882a593Smuzhiyun	/* Source has unknown alignment, but dest is known to be 0mod8 */
	/*
	 * As in the aligned quad loop, $18 is biased by -8 from here until
	 * $misalign_tail.  The seed load below runs only when the blt falls
	 * through, i.e. when at least one full quad is still to be copied.
	 */
199*4882a593Smuzhiyun$dest_0mod8:
200*4882a593Smuzhiyun	subq	$18, 8, $18		# E : At least a quad left?
201*4882a593Smuzhiyun	blt	$18, $misalign_tail	# U : Nope
202*4882a593Smuzhiyun	ldq_u	$3, 0($17)		# L : seed (rotating load) of 8 bytes
203*4882a593Smuzhiyun	nop				# E :
204*4882a593Smuzhiyun
	/*
	 * $3 holds the quad loaded on the previous trip (or the seed) and $16
	 * receives the following one.  extql/extqh pick the wanted bytes out
	 * of each, bis merges them into the quad stored at the aligned dest,
	 * and $16 is then carried over in $3 for the next trip.
	 *
	 * Statement order matters: both ext instructions take their shift
	 * from $17 and must execute before $17 is advanced.
	 * NOTE(review): this relies on the usual Alpha unaligned-load idiom,
	 * in which ldq_u and the ext instructions look only at the low three
	 * address bits - confirm against the architecture manual.
	 */
205*4882a593Smuzhiyun$mis_quad:
206*4882a593Smuzhiyun	ldq_u	$16, 8($17)		# L : Fetch next 8
207*4882a593Smuzhiyun	extql	$3, $17, $3		# U : masking
208*4882a593Smuzhiyun	extqh	$16, $17, $1		# U : masking
209*4882a593Smuzhiyun	bis	$3, $1, $1		# E : merged bytes to store
210*4882a593Smuzhiyun
211*4882a593Smuzhiyun	subq	$18, 8, $18		# E : count -= 8
212*4882a593Smuzhiyun	addq	$17, 8, $17		# E : src += 8
213*4882a593Smuzhiyun	stq	$1, 0($4)		# L : store 8 (aligned)
214*4882a593Smuzhiyun	mov	$16, $3			# E : "rotate" source data
215*4882a593Smuzhiyun
216*4882a593Smuzhiyun	addq	$4, 8, $4		# E : dest += 8
217*4882a593Smuzhiyun	bge	$18, $mis_quad		# U : More quads to move
218*4882a593Smuzhiyun	nop
219*4882a593Smuzhiyun	nop
220*4882a593Smuzhiyun
	/* Remove the -8 bias; 0..7 bytes remain and are copied one by one. */
221*4882a593Smuzhiyun$misalign_tail:
222*4882a593Smuzhiyun	addq	$18, 8, $18		# E : account for tail stuff
223*4882a593Smuzhiyun	ble	$18, $nomoredata	# U :
224*4882a593Smuzhiyun	nop
225*4882a593Smuzhiyun	nop
226*4882a593Smuzhiyun
227*4882a593Smuzhiyun$misalign_byte:
228*4882a593Smuzhiyun	ldbu	$1, 0($17)		# L : fetch 1
229*4882a593Smuzhiyun	subq	$18, 1, $18		# E : count--
230*4882a593Smuzhiyun	addq	$17, 1, $17		# E : src++
231*4882a593Smuzhiyun	nop				# E :
232*4882a593Smuzhiyun
233*4882a593Smuzhiyun	stb	$1, 0($4)		# L : store
234*4882a593Smuzhiyun	addq	$4, 1, $4		# E : dest++
235*4882a593Smuzhiyun	bgt	$18, $misalign_byte	# U : more to go?
236*4882a593Smuzhiyun	nop
237*4882a593Smuzhiyun
238*4882a593Smuzhiyun
	/* Common exit.  $0 was set to the original dest on entry. */
239*4882a593Smuzhiyun$nomoredata:
240*4882a593Smuzhiyun	ret	$31, ($26), 1		# L0 :
241*4882a593Smuzhiyun	nop				# E :
242*4882a593Smuzhiyun	nop				# E :
243*4882a593Smuzhiyun	nop				# E :
244*4882a593Smuzhiyun
245*4882a593Smuzhiyun	.end memcpy
246*4882a593Smuzhiyun	EXPORT_SYMBOL(memcpy)
247*4882a593Smuzhiyun
248*4882a593Smuzhiyun/* For backwards module compatibility: __memcpy is a global alias for memcpy.  */
249*4882a593Smuzhiyun__memcpy = memcpy
250*4882a593Smuzhiyun.globl __memcpy
251