/*
 *  linux/arch/arm/lib/div64.S
 *
 *  Optimized computation of 64-bit dividend / 32-bit divisor
 *
 *  Author:	Nicolas Pitre
 *  Created:	Oct 5, 2003
 *  Copyright:	Monta Vista Software, Inc.
 *
 *  SPDX-License-Identifier:	GPL-2.0
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#ifdef __UBOOT__
#define UNWIND(x...)
#endif

/*
 * Register aliases for the two 64-bit quantities handled below.
 * xh:xl lives in r0/r1 and yh:yl in r2/r3; the high word comes first
 * on big-endian (__ARMEB__) builds and second otherwise.
 */
#ifdef __ARMEB__
#define xh r0
#define xl r1
#define yh r2
#define yl r3
#else
#define xl r0
#define xh r1
#define yl r2
#define yh r3
#endif

/*
 * __do_div64: perform a division with 64-bit dividend and 32-bit divisor.
 *
 * Note: Calling convention is totally non standard for optimal code.
 *       This is meant to be used by do_div() from include/asm/div64.h only.
 *
 * Input parameters:
 * 	xh-xl	= dividend (clobbered)
 * 	r4	= divisor (preserved)
 *
 * Output values:
 * 	yh-yl	= result
 * 	xh	= remainder
 *
 * Clobbered regs: xl, ip
 *
 * Division by zero: falls through to Ldiv0_64 below, which calls __div0
 * and then returns with yh-yl = 0 and xh = 0.
 *
 * NOTE(review): `ret`, ARM() and THUMB() are not defined in this file;
 * presumably they are macros supplied by the included headers -- confirm.
 */

.pushsection .text.__do_div64, "ax"
ENTRY(__do_div64)
UNWIND(.fnstart)

	@ Test for easy paths first.
	@ ip = divisor - 1.  "ls" is taken when the subtraction borrowed
	@ (divisor == 0) or produced zero (divisor == 1).
	subs	ip, r4, #1
	bls	9f			@ divisor is 0 or 1
	tst	ip, r4			@ (d - 1) & d == 0 -> single bit set
	beq	8f			@ divisor is power of 2

	@ See if we need to handle upper 32-bit result.
	@ If xh < divisor the upper quotient word is 0 and xh already is
	@ the partial remainder for the lower loop.
	cmp	xh, r4
	mov	yh, #0
	blo	3f

	@ Align divisor with upper part of dividend.
	@ The aligned divisor is stored in yl preserving the original.
	@ The bit position is stored in ip.

#if __LINUX_ARM_ARCH__ >= 5

	clz	yl, r4
	clz	ip, xh
	sub	yl, yl, ip		@ yl = shift that lines up the top bits
	mov	ip, #1
	mov	ip, ip, lsl yl
	mov	yl, r4, lsl yl

#else

	@ No clz available: shift divisor and bit mask left one step at a
	@ time while the divisor is below both 0x80000000 and xh.
	mov	yl, r4
	mov	ip, #1
1:	cmp	yl, #0x80000000
	cmpcc	yl, xh
	movcc	yl, yl, lsl #1
	movcc	ip, ip, lsl #1
	bcc	1b

#endif

	@ The division loop for needed upper bit positions.
	@ Break out early if dividend reaches 0.
	@ The Z flag tested by "bne" comes from the subtraction (xh became
	@ 0) or, when xh is still non-zero, from shifting ip (mask became 0);
	@ the plain "mov" in between leaves the flags untouched.
2:	cmp	xh, yl
	orrcs	yh, yh, ip		@ xh >= yl: record this quotient bit
	subscs	xh, xh, yl
	movsne	ip, ip, lsr #1
	mov	yl, yl, lsr #1
	bne	2b

	@ See if we need to handle lower 32-bit result.
	@ "lo" can only hold after the conditional compare, i.e. when
	@ xh == 0 and xl < divisor: then the low quotient word is 0 and
	@ xl is the remainder.
3:	cmp	xh, #0
	mov	yl, #0
	cmpeq	xl, r4
	movlo	xh, xl
	retlo	lr

	@ The division loop for lower bit positions.
	@ Here we shift remainder bits leftwards rather than moving the
	@ divisor for comparisons, considering the carry-out bit as well.
	mov	ip, #0x80000000
4:	movs	xl, xl, lsl #1		@ top bit of xl -> carry
	adcs	xh, xh, xh		@ xh = 2 * xh + carry, carry-out = bit 32
	beq	6f
	cmpcc	xh, r4			@ only compare if nothing was carried out
5:	orrcs	yl, yl, ip
	subcs	xh, xh, r4
	movs	ip, ip, lsr #1
	bne	4b
	ret	lr

	@ The top part of remainder became zero.  If carry is set
	@ (the 33rd bit) this is a false positive so resume the loop.
	@ Otherwise, if lower part is also null then we are done.
6:	bcs	5b
	cmp	xl, #0
	reteq	lr

	@ We still have remainder bits in the low part.  Bring them up.
	@ Both variants shift xl left until its leading one has been
	@ shifted out, moving the bit mask in ip right by the same count.

#if __LINUX_ARM_ARCH__ >= 5

	clz	xh, xl			@ we know xh is zero here so...
	add	xh, xh, #1
	mov	xl, xl, lsl xh
	mov	ip, ip, lsr xh

#else

7:	movs	xl, xl, lsl #1
	mov	ip, ip, lsr #1
	bcc	7b

#endif

	@ Current remainder is now 1.  It is worthless to compare with
	@ divisor at this point since divisor can not be smaller than 3 here.
	@ If possible, branch for another shift in the division loop.
	@ If no bit position left then we are done.
	movs	ip, ip, lsr #1
	mov	xh, #1			@ does not touch the flags used by bne
	bne	4b
	ret	lr

8:	@ Division by a power of 2: determine what that divisor order is
	@ then simply shift values around

#if __LINUX_ARM_ARCH__ >= 5

	clz	ip, r4
	rsb	ip, ip, #31		@ ip = index of the single set bit

#else

	@ Binary search for the set bit: narrow by 16, 8 and 4 bits, then
	@ resolve the remaining value in yl (1, 2, 4 or 8).
	mov	yl, r4
	cmp	r4, #(1 << 16)
	mov	ip, #0
	movhs	yl, yl, lsr #16
	movhs	ip, #16

	cmp	yl, #(1 << 8)
	movhs	yl, yl, lsr #8
	addhs	ip, ip, #8

	cmp	yl, #(1 << 4)
	movhs	yl, yl, lsr #4
	addhs	ip, ip, #4

	cmp	yl, #(1 << 2)
	addhi	ip, ip, #3		@ yl == 8
	addls	ip, ip, yl, lsr #1	@ yl == 1, 2 or 4 -> add 0, 1 or 2

#endif

	@ Quotient = dividend >> ip (as a 64-bit shift); remainder = the
	@ low ip bits of xl, extracted by shifting left then right by
	@ (32 - ip).
	mov	yh, xh, lsr ip
	mov	yl, xl, lsr ip
	rsb	ip, ip, #32
 ARM(	orr	yl, yl, xh, lsl ip	)
 THUMB(	lsl	xh, xh, ip		)
 THUMB(	orr	yl, yl, xh		)
	mov	xh, xl, lsl ip
	mov	xh, xh, lsr ip
	ret	lr

	@ eq -> division by 1: obvious enough...
	@ For a zero divisor "eq" is false, so none of these execute and
	@ control falls through to Ldiv0_64.
9:	moveq	yl, xl
	moveq	yh, xh
	moveq	xh, #0
	reteq	lr
UNWIND(.fnend)

UNWIND(.fnstart)
UNWIND(.pad #4)
UNWIND(.save {lr})
Ldiv0_64:
	@ Division by 0:
	@ Save lr in an 8-byte stack slot around the call to __div0.
	str	lr, [sp, #-8]!
	bl	__div0

	@ as wrong as it could be...
	mov	yl, #0
	mov	yh, #0
	mov	xh, #0
	ldr	pc, [sp], #8		@ return via the saved lr, release the slot

UNWIND(.fnend)
ENDPROC(__do_div64)
.popsection