1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0 */ 2*4882a593Smuzhiyun/* 3*4882a593Smuzhiyun * arch/alpha/lib/ev6-copy_user.S 4*4882a593Smuzhiyun * 5*4882a593Smuzhiyun * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> 6*4882a593Smuzhiyun * 7*4882a593Smuzhiyun * Copy to/from user space, handling exceptions as we go.. This 8*4882a593Smuzhiyun * isn't exactly pretty. 9*4882a593Smuzhiyun * 10*4882a593Smuzhiyun * This is essentially the same as "memcpy()", but with a few twists. 11*4882a593Smuzhiyun * Notably, we have to make sure that $0 is always up-to-date and 12*4882a593Smuzhiyun * contains the right "bytes left to copy" value (and that it is updated 13*4882a593Smuzhiyun * only _after_ a successful copy). There is also some rather minor 14*4882a593Smuzhiyun * exception setup stuff.. 15*4882a593Smuzhiyun * 16*4882a593Smuzhiyun * Much of the information about 21264 scheduling/coding comes from: 17*4882a593Smuzhiyun * Compiler Writer's Guide for the Alpha 21264 18*4882a593Smuzhiyun * abbreviated as 'CWG' in other comments here 19*4882a593Smuzhiyun * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html 20*4882a593Smuzhiyun * Scheduling notation: 21*4882a593Smuzhiyun * E - either cluster 22*4882a593Smuzhiyun * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 23*4882a593Smuzhiyun * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 24*4882a593Smuzhiyun */ 25*4882a593Smuzhiyun 26*4882a593Smuzhiyun#include <asm/export.h> 27*4882a593Smuzhiyun/* Allow an exception for an insn; exit if we get one. */ 28*4882a593Smuzhiyun#define EXI(x,y...) \ 29*4882a593Smuzhiyun 99: x,##y; \ 30*4882a593Smuzhiyun .section __ex_table,"a"; \ 31*4882a593Smuzhiyun .long 99b - .; \ 32*4882a593Smuzhiyun lda $31, $exitin-99b($31); \ 33*4882a593Smuzhiyun .previous 34*4882a593Smuzhiyun 35*4882a593Smuzhiyun#define EXO(x,y...) \ 36*4882a593Smuzhiyun 99: x,##y; \ 37*4882a593Smuzhiyun .section __ex_table,"a"; \ 38*4882a593Smuzhiyun .long 99b - .; \ 39*4882a593Smuzhiyun lda $31, $exitout-99b($31); \ 40*4882a593Smuzhiyun .previous 41*4882a593Smuzhiyun 42*4882a593Smuzhiyun .set noat 43*4882a593Smuzhiyun .align 4 44*4882a593Smuzhiyun .globl __copy_user 45*4882a593Smuzhiyun .ent __copy_user 46*4882a593Smuzhiyun # Pipeline info: Slotting & Comments 47*4882a593Smuzhiyun__copy_user: 48*4882a593Smuzhiyun .prologue 0 49*4882a593Smuzhiyun mov $18, $0 # .. .. .. E 50*4882a593Smuzhiyun subq $18, 32, $1 # .. .. E. .. : Is this going to be a small copy? 51*4882a593Smuzhiyun nop # .. E .. .. 52*4882a593Smuzhiyun beq $18, $zerolength # U .. .. .. : U L U L 53*4882a593Smuzhiyun 54*4882a593Smuzhiyun and $16,7,$3 # .. .. .. E : is leading dest misalignment 55*4882a593Smuzhiyun ble $1, $onebyteloop # .. .. U .. : 1st branch : small amount of data 56*4882a593Smuzhiyun beq $3, $destaligned # .. U .. .. : 2nd (one cycle fetcher stall) 57*4882a593Smuzhiyun subq $3, 8, $3 # E .. .. .. : L U U L : trip counter 58*4882a593Smuzhiyun/* 59*4882a593Smuzhiyun * The fetcher stall also hides the 1 cycle cross-cluster stall for $3 (L --> U) 60*4882a593Smuzhiyun * This loop aligns the destination a byte at a time 61*4882a593Smuzhiyun * We know we have at least one trip through this loop 62*4882a593Smuzhiyun */ 63*4882a593Smuzhiyun$aligndest: 64*4882a593Smuzhiyun EXI( ldbu $1,0($17) ) # .. .. .. L : Keep loads separate from stores 65*4882a593Smuzhiyun addq $16,1,$16 # .. .. E .. : Section 3.8 in the CWG 66*4882a593Smuzhiyun addq $3,1,$3 # .. E .. .. : 67*4882a593Smuzhiyun nop # E .. .. .. : U L U L 68*4882a593Smuzhiyun 69*4882a593Smuzhiyun/* 70*4882a593Smuzhiyun * the -1 is to compensate for the inc($16) done in a previous quadpack 71*4882a593Smuzhiyun * which allows us zero dependencies within either quadpack in the loop 72*4882a593Smuzhiyun */ 73*4882a593Smuzhiyun EXO( stb $1,-1($16) ) # .. .. .. L : 74*4882a593Smuzhiyun addq $17,1,$17 # .. .. E .. : Section 3.8 in the CWG 75*4882a593Smuzhiyun subq $0,1,$0 # .. E .. .. : 76*4882a593Smuzhiyun bne $3, $aligndest # U .. .. .. : U L U L 77*4882a593Smuzhiyun 78*4882a593Smuzhiyun/* 79*4882a593Smuzhiyun * If we fell through into here, we have a minimum of 33 - 7 bytes 80*4882a593Smuzhiyun * If we arrived via branch, we have a minimum of 32 bytes 81*4882a593Smuzhiyun */ 82*4882a593Smuzhiyun$destaligned: 83*4882a593Smuzhiyun and $17,7,$1 # .. .. .. E : Check _current_ source alignment 84*4882a593Smuzhiyun bic $0,7,$4 # .. .. E .. : number bytes as a quadword loop 85*4882a593Smuzhiyun EXI( ldq_u $3,0($17) ) # .. L .. .. : Forward fetch for fallthrough code 86*4882a593Smuzhiyun beq $1,$quadaligned # U .. .. .. : U L U L 87*4882a593Smuzhiyun 88*4882a593Smuzhiyun/* 89*4882a593Smuzhiyun * In the worst case, we've just executed an ldq_u here from 0($17) 90*4882a593Smuzhiyun * and we'll repeat it once if we take the branch 91*4882a593Smuzhiyun */ 92*4882a593Smuzhiyun 93*4882a593Smuzhiyun/* Misaligned quadword loop - not unrolled. Leave it that way. */ 94*4882a593Smuzhiyun$misquad: 95*4882a593Smuzhiyun EXI( ldq_u $2,8($17) ) # .. .. .. L : 96*4882a593Smuzhiyun subq $4,8,$4 # .. .. E .. : 97*4882a593Smuzhiyun extql $3,$17,$3 # .. U .. .. : 98*4882a593Smuzhiyun extqh $2,$17,$1 # U .. .. .. : U U L L 99*4882a593Smuzhiyun 100*4882a593Smuzhiyun bis $3,$1,$1 # .. .. .. E : 101*4882a593Smuzhiyun EXO( stq $1,0($16) ) # .. .. L .. : 102*4882a593Smuzhiyun addq $17,8,$17 # .. E .. .. : 103*4882a593Smuzhiyun subq $0,8,$0 # E .. .. .. : U L L U 104*4882a593Smuzhiyun 105*4882a593Smuzhiyun addq $16,8,$16 # .. .. .. E : 106*4882a593Smuzhiyun bis $2,$2,$3 # .. .. E .. : 107*4882a593Smuzhiyun nop # .. E .. .. : 108*4882a593Smuzhiyun bne $4,$misquad # U .. .. .. : U L U L 109*4882a593Smuzhiyun 110*4882a593Smuzhiyun nop # .. .. .. E 111*4882a593Smuzhiyun nop # .. .. E .. 112*4882a593Smuzhiyun nop # .. E .. .. 113*4882a593Smuzhiyun beq $0,$zerolength # U .. .. .. : U L U L 114*4882a593Smuzhiyun 115*4882a593Smuzhiyun/* We know we have at least one trip through the byte loop */ 116*4882a593Smuzhiyun EXI ( ldbu $2,0($17) ) # .. .. .. L : No loads in the same quad 117*4882a593Smuzhiyun addq $16,1,$16 # .. .. E .. : as the store (Section 3.8 in CWG) 118*4882a593Smuzhiyun nop # .. E .. .. : 119*4882a593Smuzhiyun br $31, $dirtyentry # L0 .. .. .. : L U U L 120*4882a593Smuzhiyun/* Do the trailing byte loop load, then hop into the store part of the loop */ 121*4882a593Smuzhiyun 122*4882a593Smuzhiyun/* 123*4882a593Smuzhiyun * A minimum of (33 - 7) bytes to do a quad at a time. 124*4882a593Smuzhiyun * Based upon the usage context, it's worth the effort to unroll this loop 125*4882a593Smuzhiyun * $0 - number of bytes to be moved 126*4882a593Smuzhiyun * $4 - number of bytes to move as quadwords 127*4882a593Smuzhiyun * $16 is current destination address 128*4882a593Smuzhiyun * $17 is current source address 129*4882a593Smuzhiyun */ 130*4882a593Smuzhiyun$quadaligned: 131*4882a593Smuzhiyun subq $4, 32, $2 # .. .. .. E : do not unroll for small stuff 132*4882a593Smuzhiyun nop # .. .. E .. 133*4882a593Smuzhiyun nop # .. E .. .. 134*4882a593Smuzhiyun blt $2, $onequad # U .. .. .. : U L U L 135*4882a593Smuzhiyun 136*4882a593Smuzhiyun/* 137*4882a593Smuzhiyun * There is a significant assumption here that the source and destination 138*4882a593Smuzhiyun * addresses differ by more than 32 bytes. In this particular case, a 139*4882a593Smuzhiyun * sparsity of registers further bounds this to be a minimum of 8 bytes. 140*4882a593Smuzhiyun * But if this isn't met, then the output result will be incorrect. 141*4882a593Smuzhiyun * Furthermore, due to a lack of available registers, we really can't 142*4882a593Smuzhiyun * unroll this to be an 8x loop (which would enable us to use the wh64 143*4882a593Smuzhiyun * instruction memory hint instruction). 144*4882a593Smuzhiyun */ 145*4882a593Smuzhiyun$unroll4: 146*4882a593Smuzhiyun EXI( ldq $1,0($17) ) # .. .. .. L 147*4882a593Smuzhiyun EXI( ldq $2,8($17) ) # .. .. L .. 148*4882a593Smuzhiyun subq $4,32,$4 # .. E .. .. 149*4882a593Smuzhiyun nop # E .. .. .. : U U L L 150*4882a593Smuzhiyun 151*4882a593Smuzhiyun addq $17,16,$17 # .. .. .. E 152*4882a593Smuzhiyun EXO( stq $1,0($16) ) # .. .. L .. 153*4882a593Smuzhiyun EXO( stq $2,8($16) ) # .. L .. .. 154*4882a593Smuzhiyun subq $0,16,$0 # E .. .. .. : U L L U 155*4882a593Smuzhiyun 156*4882a593Smuzhiyun addq $16,16,$16 # .. .. .. E 157*4882a593Smuzhiyun EXI( ldq $1,0($17) ) # .. .. L .. 158*4882a593Smuzhiyun EXI( ldq $2,8($17) ) # .. L .. .. 159*4882a593Smuzhiyun subq $4, 32, $3 # E .. .. .. : U U L L : is there enough for another trip? 160*4882a593Smuzhiyun 161*4882a593Smuzhiyun EXO( stq $1,0($16) ) # .. .. .. L 162*4882a593Smuzhiyun EXO( stq $2,8($16) ) # .. .. L .. 163*4882a593Smuzhiyun subq $0,16,$0 # .. E .. .. 164*4882a593Smuzhiyun addq $17,16,$17 # E .. .. .. : U L L U 165*4882a593Smuzhiyun 166*4882a593Smuzhiyun nop # .. .. .. E 167*4882a593Smuzhiyun nop # .. .. E .. 168*4882a593Smuzhiyun addq $16,16,$16 # .. E .. .. 169*4882a593Smuzhiyun bgt $3,$unroll4 # U .. .. .. : U L U L 170*4882a593Smuzhiyun 171*4882a593Smuzhiyun nop 172*4882a593Smuzhiyun nop 173*4882a593Smuzhiyun nop 174*4882a593Smuzhiyun beq $4, $noquads 175*4882a593Smuzhiyun 176*4882a593Smuzhiyun$onequad: 177*4882a593Smuzhiyun EXI( ldq $1,0($17) ) 178*4882a593Smuzhiyun subq $4,8,$4 179*4882a593Smuzhiyun addq $17,8,$17 180*4882a593Smuzhiyun nop 181*4882a593Smuzhiyun 182*4882a593Smuzhiyun EXO( stq $1,0($16) ) 183*4882a593Smuzhiyun subq $0,8,$0 184*4882a593Smuzhiyun addq $16,8,$16 185*4882a593Smuzhiyun bne $4,$onequad 186*4882a593Smuzhiyun 187*4882a593Smuzhiyun$noquads: 188*4882a593Smuzhiyun nop 189*4882a593Smuzhiyun nop 190*4882a593Smuzhiyun nop 191*4882a593Smuzhiyun beq $0,$zerolength 192*4882a593Smuzhiyun 193*4882a593Smuzhiyun/* 194*4882a593Smuzhiyun * For small copies (or the tail of a larger copy), do a very simple byte loop. 195*4882a593Smuzhiyun * There's no point in doing a lot of complex alignment calculations to try to 196*4882a593Smuzhiyun * to quadword stuff for a small amount of data. 197*4882a593Smuzhiyun * $0 - remaining number of bytes left to copy 198*4882a593Smuzhiyun * $16 - current dest addr 199*4882a593Smuzhiyun * $17 - current source addr 200*4882a593Smuzhiyun */ 201*4882a593Smuzhiyun 202*4882a593Smuzhiyun$onebyteloop: 203*4882a593Smuzhiyun EXI ( ldbu $2,0($17) ) # .. .. .. L : No loads in the same quad 204*4882a593Smuzhiyun addq $16,1,$16 # .. .. E .. : as the store (Section 3.8 in CWG) 205*4882a593Smuzhiyun nop # .. E .. .. : 206*4882a593Smuzhiyun nop # E .. .. .. : U L U L 207*4882a593Smuzhiyun 208*4882a593Smuzhiyun$dirtyentry: 209*4882a593Smuzhiyun/* 210*4882a593Smuzhiyun * the -1 is to compensate for the inc($16) done in a previous quadpack 211*4882a593Smuzhiyun * which allows us zero dependencies within either quadpack in the loop 212*4882a593Smuzhiyun */ 213*4882a593Smuzhiyun EXO ( stb $2,-1($16) ) # .. .. .. L : 214*4882a593Smuzhiyun addq $17,1,$17 # .. .. E .. : quadpack as the load 215*4882a593Smuzhiyun subq $0,1,$0 # .. E .. .. : change count _after_ copy 216*4882a593Smuzhiyun bgt $0,$onebyteloop # U .. .. .. : U L U L 217*4882a593Smuzhiyun 218*4882a593Smuzhiyun$zerolength: 219*4882a593Smuzhiyun$exitin: 220*4882a593Smuzhiyun$exitout: # Destination for exception recovery(?) 221*4882a593Smuzhiyun nop # .. .. .. E 222*4882a593Smuzhiyun nop # .. .. E .. 223*4882a593Smuzhiyun nop # .. E .. .. 224*4882a593Smuzhiyun ret $31,($26),1 # L0 .. .. .. : L U L U 225*4882a593Smuzhiyun 226*4882a593Smuzhiyun .end __copy_user 227*4882a593Smuzhiyun EXPORT_SYMBOL(__copy_user) 228