/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_user.S
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Copy to/from user space, handling exceptions as we go..  This
 * isn't exactly pretty.
 *
 * This is essentially the same as "memcpy()", but with a few twists.
 * Notably, we have to make sure that $0 is always up-to-date and
 * contains the right "bytes left to copy" value (and that it is updated
 * only _after_ a successful copy). There is also some rather minor
 * exception setup stuff..
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 */
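
/*
 * Reading the per-instruction annotations below: the four dot/letter
 * columns in comments such as "# .. .. .. E" appear to mark which of the
 * four issue slots of a fetch quadpack an instruction is expected to
 * occupy and which (sub)cluster it issues to, and a trailing summary such
 * as "U L U L" gives the slotting of the quadpack as a whole.
 */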

#include <asm/export.h>
/* Allow an exception for an insn; exit if we get one.  */
#define EXI(x,y...)			\
	99: x,##y;			\
	.section __ex_table,"a";	\
	.long 99b - .;			\
	lda $31, $exitin-99b($31);	\
	.previous

#define EXO(x,y...)			\
	99: x,##y;			\
	.section __ex_table,"a";	\
	.long 99b - .;			\
	lda $31, $exitout-99b($31);	\
	.previous
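
/*
 * Each EXI/EXO expansion adds an entry to the __ex_table section: the
 * first longword records the location of the instruction that may fault,
 * and the lda that follows encodes the fixup, with a displacement
 * (relative to the faulting instruction) that points at $exitin for a
 * faulting load or $exitout for a faulting store.  If a fault is taken,
 * execution resumes at that label with $0 still holding the number of
 * bytes not yet copied, as described in the header comment.
 */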

	.set noat
	.align 4
	.globl __copy_user
	.ent __copy_user
				# Pipeline info: Slotting & Comments
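/*
 * Arguments (standard Alpha calling convention):
 *	$16 - destination address
 *	$17 - source address
 *	$18 - byte count
 * Returns in $0 the number of bytes NOT copied (0 on complete success),
 * which is why $18 is copied into $0 up front and $0 is decremented only
 * after each successful store.
 */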
__copy_user:
	.prologue 0
	mov $18, $0		# .. .. .. E
	subq $18, 32, $1	# .. .. E. ..	: Is this going to be a small copy?
	nop			# .. E  .. ..
	beq $18, $zerolength	# U  .. .. ..	: U L U L

	and $16,7,$3		# .. .. .. E	: is leading dest misalignment
	ble $1, $onebyteloop	# .. .. U  ..	: 1st branch : small amount of data
	beq $3, $destaligned	# .. U  .. ..	: 2nd (one cycle fetcher stall)
	subq $3, 8, $3		# E  .. .. ..	: L U U L : trip counter
/*
 * The fetcher stall also hides the 1 cycle cross-cluster stall for $3 (L --> U)
 * This loop aligns the destination a byte at a time
 * We know we have at least one trip through this loop
 */
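/*
 * Example: with a destination whose low bits are ...101 (dest & 7 == 5),
 * $3 starts at 5 - 8 = -3, so the loop below runs 8 - 5 = 3 times and
 * leaves the destination quadword-aligned before the bulk copy.
 */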
$aligndest:
	EXI( ldbu $1,0($17) )	# .. .. .. L	: Keep loads separate from stores
	addq $16,1,$16		# .. .. E  ..	: Section 3.8 in the CWG
	addq $3,1,$3		# .. E  .. ..	:
	nop			# E  .. .. ..	: U L U L

/*
 * the -1 is to compensate for the inc($16) done in a previous quadpack
 * which allows us zero dependencies within either quadpack in the loop
 */
	EXO( stb $1,-1($16) )	# .. .. .. L	:
	addq $17,1,$17		# .. .. E  ..	: Section 3.8 in the CWG
	subq $0,1,$0		# .. E  .. ..	:
	bne $3, $aligndest	# U  .. .. ..	: U L U L

/*
 * If we fell through into here, we have a minimum of 33 - 7 bytes
 * If we arrived via branch, we have a minimum of 32 bytes
 */
$destaligned:
	and $17,7,$1		# .. .. .. E	: Check _current_ source alignment
	bic $0,7,$4		# .. .. E  ..	: number bytes as a quadword loop
	EXI( ldq_u $3,0($17) )	# .. L  .. ..	: Forward fetch for fallthrough code
	beq $1,$quadaligned	# U  .. .. ..	: U L U L

/*
 * In the worst case, we've just executed an ldq_u here from 0($17)
 * and we'll repeat it once if we take the branch
 */

/* Misaligned quadword loop - not unrolled.  Leave it that way. */
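/*
 * Each trip merges two consecutive unaligned source quadwords into one
 * aligned destination quadword: extql extracts the bytes of the first
 * ldq_u result that belong in the low end of the destination, extqh
 * extracts the remaining bytes from the second, and bis ORs the two
 * halves together before the stq.
 */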
$misquad:
	EXI( ldq_u $2,8($17) )	# .. .. .. L	:
	subq $4,8,$4		# .. .. E  ..	:
	extql $3,$17,$3		# .. U  .. ..	:
	extqh $2,$17,$1		# U  .. .. ..	: U U L L

	bis $3,$1,$1		# .. .. .. E	:
	EXO( stq $1,0($16) )	# .. .. L  ..	:
	addq $17,8,$17		# .. E  .. ..	:
	subq $0,8,$0		# E  .. .. ..	: U L L U

	addq $16,8,$16		# .. .. .. E	:
	bis $2,$2,$3		# .. .. E  ..	:
	nop			# .. E  .. ..	:
	bne $4,$misquad		# U  .. .. ..	: U L U L

	nop			# .. .. .. E
	nop			# .. .. E  ..
	nop			# .. E  .. ..
	beq $0,$zerolength	# U  .. .. ..	: U L U L

/* We know we have at least one trip through the byte loop */
	EXI ( ldbu $2,0($17) )	# .. .. .. L	: No loads in the same quad
	addq $16,1,$16		# .. .. E  ..	: as the store (Section 3.8 in CWG)
	nop			# .. E  .. ..	:
	br $31, $dirtyentry	# L0 .. .. ..	: L U U L
/* Do the trailing byte loop load, then hop into the store part of the loop */

/*
 * A minimum of (33 - 7) bytes to do a quad at a time.
 * Based upon the usage context, it's worth the effort to unroll this loop
 * $0 - number of bytes to be moved
 * $4 - number of bytes to move as quadwords
 * $16 is current destination address
 * $17 is current source address
 */
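/*
 * The 4x-unrolled loop is entered only when at least 32 bytes are to be
 * moved as quadwords; smaller amounts drop straight into the simple
 * one-quadword-per-trip loop at $onequad.
 */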
$quadaligned:
	subq	$4, 32, $2	# .. .. .. E	: do not unroll for small stuff
	nop			# .. .. E  ..
	nop			# .. E  .. ..
	blt	$2, $onequad	# U  .. .. ..	: U L U L

/*
 * There is a significant assumption here that the source and destination
 * addresses differ by more than 32 bytes.  In this particular case, a
 * sparsity of registers further bounds this to be a minimum of 8 bytes.
 * But if this isn't met, then the output result will be incorrect.
 * Furthermore, due to a lack of available registers, we really can't
 * unroll this to be an 8x loop (which would enable us to use the wh64
 * memory hint instruction).
 */
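/*
 * Each trip through $unroll4 moves 32 bytes as two load/store pairs of
 * 16 bytes each, keeping the loads and the stores in separate quadpacks
 * (the CWG Section 3.8 rule already applied in the byte loops above).
 */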
$unroll4:
	EXI( ldq $1,0($17) )	# .. .. .. L
	EXI( ldq $2,8($17) )	# .. .. L  ..
	subq	$4,32,$4	# .. E  .. ..
	nop			# E  .. .. ..	: U U L L

	addq	$17,16,$17	# .. .. .. E
	EXO( stq $1,0($16) )	# .. .. L  ..
	EXO( stq $2,8($16) )	# .. L  .. ..
	subq	$0,16,$0	# E  .. .. ..	: U L L U

	addq	$16,16,$16	# .. .. .. E
	EXI( ldq $1,0($17) )	# .. .. L  ..
	EXI( ldq $2,8($17) )	# .. L  .. ..
	subq	$4, 32, $3	# E  .. .. ..	: U U L L : is there enough for another trip?

	EXO( stq $1,0($16) )	# .. .. .. L
	EXO( stq $2,8($16) )	# .. .. L  ..
	subq	$0,16,$0	# .. E  .. ..
	addq	$17,16,$17	# E  .. .. ..	: U L L U

	nop			# .. .. .. E
	nop			# .. .. E  ..
	addq	$16,16,$16	# .. E  .. ..
	bgt	$3,$unroll4	# U  .. .. ..	: U L U L

	nop
	nop
	nop
	beq	$4, $noquads

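/*
 * Tail of the quadword-aligned path: copy any remaining whole quadwords
 * one at a time ($4 still counts the bytes to be moved as quadwords).
 */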
$onequad:
	EXI( ldq $1,0($17) )
	subq	$4,8,$4
	addq	$17,8,$17
	nop

	EXO( stq $1,0($16) )
	subq	$0,8,$0
	addq	$16,8,$16
	bne	$4,$onequad

$noquads:
	nop
	nop
	nop
	beq $0,$zerolength

/*
 * For small copies (or the tail of a larger copy), do a very simple byte loop.
 * There's no point in doing a lot of complex alignment calculations to try
 * to do quadword stuff for a small amount of data.
 *	$0 - remaining number of bytes left to copy
 *	$16 - current dest addr
 *	$17 - current source addr
 */

$onebyteloop:
	EXI ( ldbu $2,0($17) )	# .. .. .. L	: No loads in the same quad
	addq $16,1,$16		# .. .. E  ..	: as the store (Section 3.8 in CWG)
	nop			# .. E  .. ..	:
	nop			# E  .. .. ..	: U L U L

$dirtyentry:
/*
 * the -1 is to compensate for the inc($16) done in a previous quadpack
 * which allows us zero dependencies within either quadpack in the loop
 */
	EXO ( stb $2,-1($16) )	# .. .. .. L	:
	addq $17,1,$17		# .. .. E  ..	: quadpack as the load
	subq $0,1,$0		# .. E  .. ..	: change count _after_ copy
	bgt $0,$onebyteloop	# U  .. .. ..	: U L U L

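/*
 * All paths end up here: $zerolength for a zero-byte request, and
 * $exitin/$exitout as the fixup targets for a faulting load (EXI) or
 * store (EXO).  In every case $0 already holds the number of bytes not
 * copied, which is the value returned to the caller.
 */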
$zerolength:
$exitin:
$exitout:			# Destination for exception recovery(?)
	nop			# .. .. .. E
	nop			# .. .. E  ..
	nop			# .. E  .. ..
	ret $31,($26),1		# L0 .. .. ..	: L U L U

	.end __copy_user
	EXPORT_SYMBOL(__copy_user)