/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version  contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * The algorithm for the leading and trailing quadwords remains the same,
 * however the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points: __memset
 * and __memset16 to permit better scheduling to eliminate the stalling
 * encountered during the mask replication.
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 */
#include <asm/export.h>
	.set noat		# assembler must not use $at; hand-written code controls it
	.set noreorder		# instruction scheduling is done by hand; do not reorder
.text
	.globl memset
	.globl __memset
	.globl ___memset
	.globl __memset16
	.globl __constant_c_memset
/*
	 * void *___memset(void *dest, int c, size_t n)
	 * In:  $16 = dest, $17 = fill byte (low 8 bits; replicated to all
	 *      8 bytes of a quadword below), $18 = byte count
	 * Out: $0  = dest, returned unchanged
	 * The instruction sequence is hand-scheduled for the EV6 pipeline
	 * (see the E/U/L cluster annotations); do not reorder it.
	 */
	.ent ___memset
.align 5
___memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 * Do this later.
	 */
	and $17,255,$1		# E : 00000000000000ch
	insbl $17,1,$2		# U : 000000000000ch00
	bis $16,$16,$0		# E : return value
	ble $18,end_b		# U : zero length requested?

	addq $18,$16,$6		# E : max address to write to
	bis	$1,$2,$17	# E : 000000000000chch
	insbl	$1,2,$3		# U : 0000000000ch0000
	insbl	$1,3,$4		# U : 00000000ch000000

	or	$3,$4,$3	# E : 00000000chch0000
	inswl	$17,4,$5	# U : 0000chch00000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?
	inswl	$17,6,$2	# U : chch000000000000

	or	$17,$3,$17	# E : 00000000chchchch
	or	$2,$5,$2	# E : chchchch00000000
	bic	$1,7,$1		# E : fit within a single quadword?
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : chchchchchchchch
	beq	$1,within_quad_b # U :
	nop			# E :
	beq	$3,aligned_b	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_b	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_b	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_b	# U :

$alignmod64_b:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_b # U :

$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued on for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_b:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_b	# U :

	nop
	nop
	nop
	beq	$3, no_quad_b	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_b		# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_b		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_b:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_b:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end ___memset
	EXPORT_SYMBOL(___memset)
234*4882a593Smuzhiyun
/*
	 * This is the original body of code, prior to replication and
	 * rescheduling.  Leave it here, as there may be calls to this
	 * entry point.
	 *
	 * void *__constant_c_memset(void *dest, unsigned long c, size_t n)
	 * In:  $16 = dest, $17 = fill pattern stored as a full quadword
	 *      (the caller supplies the byte already replicated across all
	 *      8 bytes -- no replication is done here), $18 = byte count
	 * Out: $0  = dest, returned unchanged
	 */
.align 4
	.ent __constant_c_memset
__constant_c_memset:
	.frame $30,0,$26,0
	.prologue 0

	addq $18,$16,$6		# E : max address to write to
	bis $16,$16,$0		# E : return value
	xor $16,$6,$1		# E : will complete write be within one quadword?
	ble $18,end		# U : zero length requested?

	bic $1,7,$1		# E : fit within a single quadword
	beq $1,within_one_quad	# U :
	and $16,7,$3		# E : Target addr misalignment
	beq $3,aligned		# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad		# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign	# U :

$alignmod64:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64	# U :

$bigalign:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued on for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64	# U :

	nop
	nop
	nop
	beq	$3, no_quad	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop		# U : more?

no_quad:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_one_quad:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __constant_c_memset
	EXPORT_SYMBOL(__constant_c_memset)
412*4882a593Smuzhiyun
/*
	 * This is a replicant of the __constant_c_memset code, rescheduled
	 * to mask stalls.  Note that entry point names also had to change
	 *
	 * void *__memset16(void *dest, unsigned short c, size_t n)
	 * In:  $16 = dest, $17 = 16-bit fill value (low 16 bits; replicated
	 *      to all four words of a quadword below), $18 = byte count
	 * Out: $0  = dest, returned unchanged
	 */
	.align 5
	.ent __memset16

__memset16:
	.frame $30,0,$26,0
	.prologue 0

	inswl $17,0,$5		# U : 000000000000c1c2
	inswl $17,2,$2		# U : 00000000c1c20000
	bis $16,$16,$0		# E : return value
	addq	$18,$16,$6	# E : max address to write to

	ble $18, end_w		# U : zero length requested?
	inswl	$17,4,$3	# U : 0000c1c200000000
	inswl	$17,6,$4	# U : c1c2000000000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?

	or	$2,$5,$2	# E : 00000000c1c2c1c2
	or	$3,$4,$17	# E : c1c2c1c200000000
	bic	$1,7,$1		# E : fit within a single quadword
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : c1c2c1c2c1c2c1c2
	beq $1,within_quad_w	# U :
	nop
	beq $3,aligned_w	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_w:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_w	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_w	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_w	# U :

$alignmod64_w:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_w	# U :

$bigalign_w:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued on for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_w:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_w	# U :

	nop
	nop
	nop
	beq	$3, no_quad_w	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_w:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_w		# U : more?

no_quad_w:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_w		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_w:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_w:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :

	.end __memset16
	EXPORT_SYMBOL(__memset16)
601*4882a593Smuzhiyun
/* memset and __memset are aliases for the EV6-scheduled ___memset. */
memset = ___memset
__memset = ___memset
	EXPORT_SYMBOL(memset)
	EXPORT_SYMBOL(__memset)
606