/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow efficiently,
	but for some it doesn't, including store queue overflows.  These
	cause a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles but I was not able to get it to run that fast -- the initial
	conditions were such that I could not reach this optimum rate on
	(chaotic) EV6.  I wrote the code such that everything would issue
	in order.
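
	For example, the main loop below pads each aligned fetch quad with
	unops; a quad that carries only two useful instructions looks like:

		ldq	$0,0($17)
		ldq	$1,8($17)
		unop
		unop

	so one fetch cycle delivers two valid instructions instead of four.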

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on random aligned pages, there
	is a 25% chance that the pages will be at the same dcache indices.
	Without care, this results in many nasty memory traps.
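
	(The 25% figure assumes EV6's 64KB two-way dcache and the Alpha's
	8KB pages: each way covers 32KB, i.e. four page-sized "colors",
	so two random pages share a color with probability 8KB/32KB = 1/4.)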

	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.
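
	Concretely, in the loop below the wh64 pointer ($19) runs ten
	64-byte lines ahead of the store pointer, while the read prefetch
	(ldl $31,320($17)) runs only five lines ahead of the loads.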

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take the
	most advantage of EV6's bandwidth.

	I also prefetched the read stream.  Note that adding the read prefetch
	forced me to add another cycle to the innermost kernel -- up to 11
	from the original 8 cycles per iteration.  We could improve performance
	further by unrolling the loop and doing multiple prefetches per cycle.
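
	For scale: an 8KB page is 128 64-byte cache lines; the main loop
	below copies 118 of them and the cleanup loop handles the final 10.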

   I think that the code below will be very robust and fast for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case.  */

#include <asm/export.h>
	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0

	/* Prefetch 5 read cache lines; write-hint 10 cache lines.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
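	/* 118 iterations: 128 lines in an 8KB page, less the 10 lines
	   handled by the cleanup loop below.  */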
	lda	$18,118
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)
	nop

	/* Main prefetching/write-hinting loop.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)	/* read prefetch, 5 lines ahead */
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	wh64	($19)		/* write-hint, 10 lines ahead */
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.  */
	lda	$18,10		/* 10 lines remain for the cleanup loop */
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	ret
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)