/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow efficiently,
	but for some it doesn't.  Including store queue overflows.  It causes
	a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles but I was not able to get it to run that fast -- the initial
	conditions were such that I could not reach this optimum rate on
	(chaotic) EV6.  I wrote the code such that everything would issue
	in order.

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on random aligned pages, there
	is a 25% chance that the pages will be at the same dcache indices.
	This results in many nasty memory traps without care.

	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take the
	most advantage of EV6's bandwidth.

	I also prefetched the read stream. Note that adding the read prefetch
	forced me to add another cycle to the inner-most kernel - up to 11
	from the original 8 cycles per iteration.  We could improve performance
	further by unrolling the loop and doing multiple prefetches per cycle.

   I think that the code below will be very robust and fast code for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case.  */

#include <asm/export.h>

/*
 * copy_page
 *
 * Copies 128 blocks of 64 bytes (8192 bytes in total): 118 blocks in
 * the main loop and the final 10 in the cleanup loop.  Every load uses
 * an address based on $17 and every store an address based on $16, so
 * $17 is the source pointer and $16 the destination pointer.
 *
 * NOTE(review): presumably $16/$17 are the first two argument registers
 * (to, from) of the C-level copy_page() and both pages are page-aligned,
 * as the notes above assume -- confirm against the C prototype.
 *
 * Register usage, as visible in the body:
 *	$16	destination cursor, advanced by 64 per block
 *	$17	source cursor, advanced by 64 per block
 *	$18	remaining-block counter for the current loop
 *	$19	address of the next destination block to write-hint
 *	$0-$7	the eight quadwords of the block being copied
 *		($1 and $2 also serve as scratch addresses in the preamble)
 *
 * On exit $16, $17 and $19 have each advanced by 8192 bytes and
 * $0-$7 and $18 are overwritten.  No stack accesses appear in the body.
 *
 * The instructions are laid out in groups of four, separated by blank
 * lines; the unop/nop fillers are the "aeration" described above, so
 * their number and position are deliberate and must not be "tidied".
 */
	.text
	/* NOTE(review): presumably 2^4 = 16-byte alignment, i.e. one
	   four-instruction fetch group -- confirm with the assembler docs.  */
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0

	/* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
	/* The ldl-into-$31 instructions are the read prefetches (source
	   blocks 0..4); the wh64 instructions are the write hints
	   (destination blocks 0..9).  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118			/* main loop handles 118 blocks */
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)		/* next block to write-hint: block 10 */
	nop

	/* Main prefetching/write-hinting loop.  */
	/* Per iteration: load one 64-byte block into $0-$7, read-prefetch
	   the source block 5 ahead, write-hint the destination block 10
	   ahead, then store the block.  Eleven groups of four instructions.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)		/* read prefetch, 5 blocks (320 bytes) ahead */
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	wh64	($19)			/* write hint, 10 blocks ahead of $16 */
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18		/* one fewer block left in this loop */
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17		/* advance source cursor */
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19		/* advance write-hint cursor */
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16		/* advance destination cursor */
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.  */
	/* $17 now points at block 118.  The loop's 5-ahead prefetch reached
	   block 122 on its last pass, so offsets 320..576 cover the
	   remaining blocks 123..127.  */
	lda	$18,10			/* cleanup loop handles the last 10 blocks */
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  */
	/* Nothing is left to hint or prefetch here: destination blocks
	   0..127 were write-hinted by the preamble and the main loop, and
	   the source blocks were prefetched just above.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18		/* one fewer block left in this loop */
	stq	$1,8($16)
	addq	$17,64,$17		/* advance source cursor */

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16		/* advance destination cursor */
	bne	$18, 2b

	/* NOTE(review): the three fillers after ret presumably pad the
	   routine out to a full group of four instructions -- confirm.  */
	ret
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)