/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-divide.S
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Alpha division..
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand.  The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divq/__remq: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divl/__reml: signed 32-bit
 *
 * (The names are generated below by func(qu)/func(q)/func(lu)/func(l),
 * with DIV selecting the __div or __rem prefix and INTSIZE selecting
 * the 32-bit variants.)
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache.. Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *	$3 - tmp1: candidate new modulus (modulus - divisor)
 *	$4 - tmp2: candidate new quotient (quotient + mask), DIV builds only
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <asm/export.h>
/* NOTE(review): halt is defined here but not referenced in the code below. */
#define halt .long 0

/*
 * Select function type and registers
 *
 * DIV builds return the quotient in $27 and keep the running modulus
 * in $2; remainder builds swap the two.  GETSIGN(x) leaves in x a value
 * whose sign is the sign the signed result must have: the xor of both
 * operands for a quotient, the dividend itself for a remainder.
 * STACK is the frame size; DIV builds need one extra save slot for tmp2.
 */
#define mask $0
#define divisor $1
#define compare $28
#define tmp1 $3
#define tmp2 $4

#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 * (LONGIFY zero-extends the low 32 bits, SLONGIFY sign-extends them).
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif

/*
 * ufunction: unsigned divide (DIV) or remainder (!DIV).
 *
 * In:   $24 = dividend, $25 = divisor, $23 = return address.
 * Out:  $27 = quotient (DIV) or remainder (!DIV).
 * Saves and restores $0, $1, $2, tmp1 and (DIV only) tmp2 on the stack;
 * $28 is used as scratch.
 *
 * A zero divisor branches straight to the exit path at label 9, so the
 * value returned is whatever $27 was initialised to: 0 for a quotient,
 * the (zero-extended, for INTSIZE) dividend for a remainder.
 */
.set noat
.align 4
.globl ufunction
.ent ufunction
ufunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0

	/*
	 * Label 7 is also entered from sfunction (via "bge $28,7b") once
	 * sfunction has allocated its own STACK-sized frame.
	 */
7:	stq	$1, 0($30)		# L :
	bis	$25,$25,divisor		# E :
	stq	$2, 8($30)		# L : L U L U

	bis	$24,$24,modulus		# E :
	stq	$0,16($30)		# L :
	bis	$31,$31,quotient	# E :
	LONGIFY(divisor)		# E : U L L U

	stq	tmp1,24($30)		# L :
	LONGIFY(modulus)		# E :
	bis	$31,1,mask		# E :
	DIV_ONLY(stq tmp2,32($30))	# L : L U U L

	beq	divisor, 9f			/* div by zero */
	/*
	 * In spite of the DIV_ONLY being either a non-instruction
	 * or an actual stq, the addition of the .align directive
	 * below ensures that label 1 is going to be nicely aligned
	 */

	.align	4
#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 */
1:	cmpult	divisor,modulus,compare	# E :
	s8addq	divisor,$31,divisor	# E :
	s8addq	mask,$31,mask		# E :
	bne	compare,1b		# U : U L U L
#else
	/*
	 * 64-bit case: shift one bit at a time, and stop as soon as the
	 * top bit of the divisor is set so the next doubling cannot
	 * overflow.
	 */
1:	cmpult	divisor,modulus,compare	# E :
	nop				# E :
	nop				# E :
	blt     divisor, 2f		# U : U L U L

	addq	divisor,divisor,divisor	# E :
	addq	mask,mask,mask		# E :
	unop				# E :
	bne	compare,1b		# U : U L U L
#endif

	/* ok, start to go right again.. */
2:
	/*
	 * Keep things nicely bundled... use a nop instead of not
	 * having an instruction for DIV_ONLY
	 */
#ifdef DIV
	DIV_ONLY(addq quotient,mask,tmp2) # E :
#else
	nop				# E :
#endif
	srl	mask,1,mask		# U :
	cmpule	divisor,modulus,compare	# E :
	subq	modulus,divisor,tmp1	# E :

	/*
	 * If divisor <= modulus, commit the speculatively computed
	 * quotient (tmp2) and modulus (tmp1); loop until mask shifts
	 * out to zero.
	 */
#ifdef DIV
	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	srl	divisor,1,divisor	# U :
	nop				# E : L U L U

	nop				# E :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L U L
#else
	srl	divisor,1,divisor	# U :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L L U
#endif

	/* Common exit: restore saved registers, pop the frame, return. */
9:	ldq	$1, 0($30)		# L :
	ldq	$2, 8($30)		# L :
	nop				# E :
	nop				# E : U U L L

	ldq	$0,16($30)		# L :
	ldq	tmp1,24($30)		# L :
	nop				# E :
	nop				# E :

#ifdef DIV
	DIV_ONLY(ldq tmp2,32($30))	# L :
#else
	nop				# E :
#endif
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	ufunction
EXPORT_SYMBOL(ufunction)

/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 * 	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 *
 * In:   $24 = dividend, $25 = divisor, $23 = return address.
 * Out:  $27 = quotient (DIV) or remainder (!DIV).
 *
 * If neither operand has its sign bit set, control jumps directly to
 * label 7 inside ufunction, reusing the frame allocated here.  Otherwise
 * the original $24, $25, $23 and tmp1 are saved, ufunction is called on
 * the absolute values, and the result is negated when GETSIGN yields a
 * negative value.
 */
.align 4
.globl	sfunction
.ent	sfunction
sfunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0
	/* Fast path: the OR of both operands is non-negative. */
	bis	$24,$25,$28		# E :
	SLONGIFY($28)			# E :
	bge	$28,7b			# U :

	stq	$24,0($30)		# L :
	subq	$31,$24,$28		# E :
	stq	$25,8($30)		# L :
	nop				# E : U L U L

	cmovlt	$24,$28,$24	/* abs($24) */	# E : Latency 2, extra map slot
	nop				# E : as part of the cmov
	stq	$23,16($30)		# L :
	subq	$31,$25,$28		# E : U L U L

	stq	tmp1,24($30)		# L :
	cmovlt	$25,$28,$25	/* abs($25) */	# E : Latency 2, extra map slot
	nop				# E :
	bsr	$23,ufunction		# L0: L U L U

	/*
	 * Reload the original operands so GETSIGN can derive the result
	 * sign, and precompute the negated result in tmp1.
	 */
	ldq	$24,0($30)		# L :
	ldq	$25,8($30)		# L :
	GETSIGN($28)			# E :
	subq	$31,$27,tmp1		# E : U U L L

	SLONGIFY($28)			# E :
	ldq	$23,16($30)		# L :
	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
	nop				# E : U L L U : as part of the cmov

	ldq	tmp1,24($30)		# L :
	nop				# E : as part of the cmov
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	sfunction
EXPORT_SYMBOL(sfunction)