/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/divide.S
 *
 * (C) 1995 Linus Torvalds
 *
 * Alpha division..
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand.  The compiler expects the functions
 *
 *	__divqu/__remqu: unsigned 64-bit divide/remainder
 *	__divq/__remq:   signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divl/__reml:   signed 32-bit
 *
 * This one source produces each pair (unsigned + signed) depending on
 * two preprocessor symbols:
 *
 *	DIV	- defined: build the __div* pair, else the __rem* pair
 *	INTSIZE	- defined: build the 32-bit (l/lu) pair, else the 64-bit
 *		  (q/qu) pair
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache.. Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *	$3 - tmp1: candidate "modulus - divisor"; negated result in the
 *	     signed wrapper
 *	$4 - tmp2: candidate "quotient + mask" (DIV builds only)
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 */

#include <asm/export.h>
/* NOTE: "halt" is not referenced anywhere in this file. */
#define halt .long 0

/*
 * Select function type and registers
 */
#define mask	$0
#define divisor	$1
#define compare $28
#define tmp1	$3
#define tmp2	$4

/*
 * Whichever of quotient/modulus is the wanted result lives in $27 (the
 * return register); the other one lives in the scratch register $2.
 *
 * GETSIGN(x) leaves in x a value whose sign bit is the sign the result
 * must have: sign(dividend) ^ sign(divisor) for a quotient, and the
 * sign of the dividend for a remainder.
 *
 * STACK is the frame size.  The divide variant saves one more register
 * (tmp2, at offset 32) than the remainder variant.
 *
 * NOTE: MOD_ONLY is defined for symmetry but is not used in this file.
 */
#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 *
 * LONGIFY(x)  zero-extends the low 32 bits of x (keeps bytes 0-3).
 * SLONGIFY(x) sign-extends the low 32 bits of x.
 * Both expand to nothing in the 64-bit build.
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif

/*
 * ufunction: unsigned divide / remainder.
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address
 * Out:	$27 = quotient (DIV) or remainder (otherwise)
 * Clobbers $28; $0-$3 (and $4 for DIV) are saved and restored on the
 * stack frame.
 *
 * Frame layout (offsets from $30 after the subq):
 *	 0: $1	 8: $2	16: $0	24: tmp1	32: tmp2 (DIV only)
 *
 * Division by zero is not trapped: the code branches straight to the
 * exit with quotient = 0 and modulus = dividend, so the divide variant
 * returns 0 and the remainder variant returns the (LONGIFY'd) dividend.
 *
 * Label 7 is a secondary entry point: the signed wrapper branches here
 * (after allocating the same-sized frame itself) when neither operand
 * has its sign bit set.
 */
.set noat
.align	3
.globl	ufunction
.ent	ufunction
ufunction:
	subq	$30,STACK,$30
	.frame	$30,STACK,$23
	.prologue 0

7:	stq	$1, 0($30)
	bis	$25,$25,divisor
	stq	$2, 8($30)
	bis	$24,$24,modulus
	stq	$0,16($30)
	bis	$31,$31,quotient
	LONGIFY(divisor)
	stq	tmp1,24($30)
	LONGIFY(modulus)
	bis	$31,1,mask
	DIV_ONLY(stq tmp2,32($30))
	beq	divisor, 9f			/* div by zero */

#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 */
1:	cmpult	divisor,modulus,compare
	s8addq	divisor,$31,divisor
	s8addq	mask,$31,mask
	bne	compare,1b
#else
	/*
	 * 64-bit case: shift one bit at a time, and stop as soon as
	 * the divisor's top bit is set so it cannot be shifted out.
	 */
1:	cmpult	divisor,modulus,compare
	blt	divisor, 2f
	addq	divisor,divisor,divisor
	addq	mask,mask,mask
	bne	compare,1b
	unop
#endif

	/* ok, start to go right again.. */
	/*
	 * For each bit position in mask (high to low): if the shifted
	 * divisor fits into the running modulus, subtract it and (DIV
	 * only) add the bit to the quotient.  tmp1/tmp2 hold the
	 * speculative new values; the cmovne's commit them.
	 */
2:	DIV_ONLY(addq quotient,mask,tmp2)
	srl	mask,1,mask
	cmpule	divisor,modulus,compare
	subq	modulus,divisor,tmp1
	DIV_ONLY(cmovne compare,tmp2,quotient)
	srl	divisor,1,divisor
	cmovne	compare,tmp1,modulus
	bne	mask,2b

	/* restore the saved registers, pop the frame and return */
9:	ldq	$1, 0($30)
	ldq	$2, 8($30)
	ldq	$0,16($30)
	ldq	tmp1,24($30)
	DIV_ONLY(ldq tmp2,32($30))
	addq	$30,STACK,$30
	ret	$31,($23),1
	.end	ufunction
EXPORT_SYMBOL(ufunction)

/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 * 	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address
 * Out:	$27 = quotient (DIV) or remainder (otherwise)
 * Clobbers $28; $24, $25, $23 and tmp1 are saved and restored.
 *
 * Frame layout (offsets from $30 after the subq):
 *	 0: $24	 8: $25	16: $23	24: tmp1
 *
 * If neither operand is negative this simply falls into the unsigned
 * code at label 7, reusing the frame allocated here.  Otherwise it
 * calls ufunction on the absolute values (ufunction allocates its own
 * frame below this one) and negates the result when GETSIGN says so.
 */
.align 3
.globl	sfunction
.ent	sfunction
sfunction:
	subq	$30,STACK,$30
	.frame	$30,STACK,$23
	.prologue 0
	bis	$24,$25,$28		/* sign bit set if either is negative */
	SLONGIFY($28)
	bge	$28,7b
	stq	$24,0($30)
	subq	$31,$24,$28
	stq	$25,8($30)
	cmovlt	$24,$28,$24	/* abs($24) */
	stq	$23,16($30)		/* bsr below overwrites $23 */
	subq	$31,$25,$28
	stq	tmp1,24($30)
	cmovlt	$25,$28,$25	/* abs($25) */
	unop
	bsr	$23,ufunction
	ldq	$24,0($30)		/* original operands, for GETSIGN */
	ldq	$25,8($30)
	GETSIGN($28)
	subq	$31,$27,tmp1		/* tmp1 = -result */
	SLONGIFY($28)
	ldq	$23,16($30)
	cmovlt	$28,tmp1,$27		/* negate if the sign says so */
	ldq	tmp1,24($30)
	addq	$30,STACK,$30
	ret	$31,($23),1
	.end	sfunction
EXPORT_SYMBOL(sfunction)