1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0 */ 2*4882a593Smuzhiyun .file "wm_sqrt.S" 3*4882a593Smuzhiyun/*---------------------------------------------------------------------------+ 4*4882a593Smuzhiyun | wm_sqrt.S | 5*4882a593Smuzhiyun | | 6*4882a593Smuzhiyun | Fixed point arithmetic square root evaluation. | 7*4882a593Smuzhiyun | | 8*4882a593Smuzhiyun | Copyright (C) 1992,1993,1995,1997 | 9*4882a593Smuzhiyun | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | 10*4882a593Smuzhiyun | Australia. E-mail billm@suburbia.net | 11*4882a593Smuzhiyun | | 12*4882a593Smuzhiyun | Call from C as: | 13*4882a593Smuzhiyun | int wm_sqrt(FPU_REG *n, unsigned int control_word) | 14*4882a593Smuzhiyun | | 15*4882a593Smuzhiyun +---------------------------------------------------------------------------*/ 16*4882a593Smuzhiyun 17*4882a593Smuzhiyun/*---------------------------------------------------------------------------+ 18*4882a593Smuzhiyun | wm_sqrt(FPU_REG *n, unsigned int control_word) | 19*4882a593Smuzhiyun | returns the square root of n in n. | 20*4882a593Smuzhiyun | | 21*4882a593Smuzhiyun | Use Newton's method to compute the square root of a number, which must | 22*4882a593Smuzhiyun | be in the range [1.0 .. 4.0), to 64 bits accuracy. | 23*4882a593Smuzhiyun | Does not check the sign or tag of the argument. | 24*4882a593Smuzhiyun | Sets the exponent, but not the sign or tag of the result. | 25*4882a593Smuzhiyun | | 26*4882a593Smuzhiyun | The guess is kept in %esi:%edi | 27*4882a593Smuzhiyun +---------------------------------------------------------------------------*/ 28*4882a593Smuzhiyun 29*4882a593Smuzhiyun#include "exception.h" 30*4882a593Smuzhiyun#include "fpu_emu.h" 31*4882a593Smuzhiyun 32*4882a593Smuzhiyun 33*4882a593Smuzhiyun#ifndef NON_REENTRANT_FPU 34*4882a593Smuzhiyun/* Local storage on the stack: */ 35*4882a593Smuzhiyun#define FPU_accum_3 -4(%ebp) /* ms word */ 36*4882a593Smuzhiyun#define FPU_accum_2 -8(%ebp) 37*4882a593Smuzhiyun#define FPU_accum_1 -12(%ebp) 38*4882a593Smuzhiyun#define FPU_accum_0 -16(%ebp) 39*4882a593Smuzhiyun 40*4882a593Smuzhiyun/* 41*4882a593Smuzhiyun * The de-normalised argument: 42*4882a593Smuzhiyun * sq_2 sq_1 sq_0 43*4882a593Smuzhiyun * b b b b b b b ... b b b b b b .... b b b b 0 0 0 ... 0 44*4882a593Smuzhiyun * ^ binary point here 45*4882a593Smuzhiyun */ 46*4882a593Smuzhiyun#define FPU_fsqrt_arg_2 -20(%ebp) /* ms word */ 47*4882a593Smuzhiyun#define FPU_fsqrt_arg_1 -24(%ebp) 48*4882a593Smuzhiyun#define FPU_fsqrt_arg_0 -28(%ebp) /* ls word, at most the ms bit is set */ 49*4882a593Smuzhiyun 50*4882a593Smuzhiyun#else 51*4882a593Smuzhiyun/* Local storage in a static area: */ 52*4882a593Smuzhiyun.data 53*4882a593Smuzhiyun .align 4,0 54*4882a593SmuzhiyunFPU_accum_3: 55*4882a593Smuzhiyun .long 0 /* ms word */ 56*4882a593SmuzhiyunFPU_accum_2: 57*4882a593Smuzhiyun .long 0 58*4882a593SmuzhiyunFPU_accum_1: 59*4882a593Smuzhiyun .long 0 60*4882a593SmuzhiyunFPU_accum_0: 61*4882a593Smuzhiyun .long 0 62*4882a593Smuzhiyun 63*4882a593Smuzhiyun/* The de-normalised argument: 64*4882a593Smuzhiyun sq_2 sq_1 sq_0 65*4882a593Smuzhiyun b b b b b b b ... b b b b b b .... b b b b 0 0 0 ... 0 66*4882a593Smuzhiyun ^ binary point here 67*4882a593Smuzhiyun */ 68*4882a593SmuzhiyunFPU_fsqrt_arg_2: 69*4882a593Smuzhiyun .long 0 /* ms word */ 70*4882a593SmuzhiyunFPU_fsqrt_arg_1: 71*4882a593Smuzhiyun .long 0 72*4882a593SmuzhiyunFPU_fsqrt_arg_0: 73*4882a593Smuzhiyun .long 0 /* ls word, at most the ms bit is set */ 74*4882a593Smuzhiyun#endif /* NON_REENTRANT_FPU */ 75*4882a593Smuzhiyun 76*4882a593Smuzhiyun 77*4882a593Smuzhiyun.text 78*4882a593SmuzhiyunSYM_FUNC_START(wm_sqrt) 79*4882a593Smuzhiyun pushl %ebp 80*4882a593Smuzhiyun movl %esp,%ebp 81*4882a593Smuzhiyun#ifndef NON_REENTRANT_FPU 82*4882a593Smuzhiyun subl $28,%esp 83*4882a593Smuzhiyun#endif /* NON_REENTRANT_FPU */ 84*4882a593Smuzhiyun pushl %esi 85*4882a593Smuzhiyun pushl %edi 86*4882a593Smuzhiyun pushl %ebx 87*4882a593Smuzhiyun 88*4882a593Smuzhiyun movl PARAM1,%esi 89*4882a593Smuzhiyun 90*4882a593Smuzhiyun movl SIGH(%esi),%eax 91*4882a593Smuzhiyun movl SIGL(%esi),%ecx 92*4882a593Smuzhiyun xorl %edx,%edx 93*4882a593Smuzhiyun 94*4882a593Smuzhiyun/* We use a rough linear estimate for the first guess.. */ 95*4882a593Smuzhiyun 96*4882a593Smuzhiyun cmpw EXP_BIAS,EXP(%esi) 97*4882a593Smuzhiyun jnz sqrt_arg_ge_2 98*4882a593Smuzhiyun 99*4882a593Smuzhiyun shrl $1,%eax /* arg is in the range [1.0 .. 2.0) */ 100*4882a593Smuzhiyun rcrl $1,%ecx 101*4882a593Smuzhiyun rcrl $1,%edx 102*4882a593Smuzhiyun 103*4882a593Smuzhiyunsqrt_arg_ge_2: 104*4882a593Smuzhiyun/* From here on, n is never accessed directly again until it is 105*4882a593Smuzhiyun replaced by the answer. */ 106*4882a593Smuzhiyun 107*4882a593Smuzhiyun movl %eax,FPU_fsqrt_arg_2 /* ms word of n */ 108*4882a593Smuzhiyun movl %ecx,FPU_fsqrt_arg_1 109*4882a593Smuzhiyun movl %edx,FPU_fsqrt_arg_0 110*4882a593Smuzhiyun 111*4882a593Smuzhiyun/* Make a linear first estimate */ 112*4882a593Smuzhiyun shrl $1,%eax 113*4882a593Smuzhiyun addl $0x40000000,%eax 114*4882a593Smuzhiyun movl $0xaaaaaaaa,%ecx 115*4882a593Smuzhiyun mull %ecx 116*4882a593Smuzhiyun shll %edx /* max result was 7fff... */ 117*4882a593Smuzhiyun testl $0x80000000,%edx /* but min was 3fff... */ 118*4882a593Smuzhiyun jnz sqrt_prelim_no_adjust 119*4882a593Smuzhiyun 120*4882a593Smuzhiyun movl $0x80000000,%edx /* round up */ 121*4882a593Smuzhiyun 122*4882a593Smuzhiyunsqrt_prelim_no_adjust: 123*4882a593Smuzhiyun movl %edx,%esi /* Our first guess */ 124*4882a593Smuzhiyun 125*4882a593Smuzhiyun/* We have now computed (approx) (2 + x) / 3, which forms the basis 126*4882a593Smuzhiyun for a few iterations of Newton's method */ 127*4882a593Smuzhiyun 128*4882a593Smuzhiyun movl FPU_fsqrt_arg_2,%ecx /* ms word */ 129*4882a593Smuzhiyun 130*4882a593Smuzhiyun/* 131*4882a593Smuzhiyun * From our initial estimate, three iterations are enough to get us 132*4882a593Smuzhiyun * to 30 bits or so. This will then allow two iterations at better 133*4882a593Smuzhiyun * precision to complete the process. 134*4882a593Smuzhiyun */ 135*4882a593Smuzhiyun 136*4882a593Smuzhiyun/* Compute (g + n/g)/2 at each iteration (g is the guess). */ 137*4882a593Smuzhiyun shrl %ecx /* Doing this first will prevent a divide */ 138*4882a593Smuzhiyun /* overflow later. */ 139*4882a593Smuzhiyun 140*4882a593Smuzhiyun movl %ecx,%edx /* msw of the arg / 2 */ 141*4882a593Smuzhiyun divl %esi /* current estimate */ 142*4882a593Smuzhiyun shrl %esi /* divide by 2 */ 143*4882a593Smuzhiyun addl %eax,%esi /* the new estimate */ 144*4882a593Smuzhiyun 145*4882a593Smuzhiyun movl %ecx,%edx 146*4882a593Smuzhiyun divl %esi 147*4882a593Smuzhiyun shrl %esi 148*4882a593Smuzhiyun addl %eax,%esi 149*4882a593Smuzhiyun 150*4882a593Smuzhiyun movl %ecx,%edx 151*4882a593Smuzhiyun divl %esi 152*4882a593Smuzhiyun shrl %esi 153*4882a593Smuzhiyun addl %eax,%esi 154*4882a593Smuzhiyun 155*4882a593Smuzhiyun/* 156*4882a593Smuzhiyun * Now that an estimate accurate to about 30 bits has been obtained (in %esi), 157*4882a593Smuzhiyun * we improve it to 60 bits or so. 158*4882a593Smuzhiyun * 159*4882a593Smuzhiyun * The strategy from now on is to compute new estimates from 160*4882a593Smuzhiyun * guess := guess + (n - guess^2) / (2 * guess) 161*4882a593Smuzhiyun */ 162*4882a593Smuzhiyun 163*4882a593Smuzhiyun/* First, find the square of the guess */ 164*4882a593Smuzhiyun movl %esi,%eax 165*4882a593Smuzhiyun mull %esi 166*4882a593Smuzhiyun/* guess^2 now in %edx:%eax */ 167*4882a593Smuzhiyun 168*4882a593Smuzhiyun movl FPU_fsqrt_arg_1,%ecx 169*4882a593Smuzhiyun subl %ecx,%eax 170*4882a593Smuzhiyun movl FPU_fsqrt_arg_2,%ecx /* ms word of normalized n */ 171*4882a593Smuzhiyun sbbl %ecx,%edx 172*4882a593Smuzhiyun jnc sqrt_stage_2_positive 173*4882a593Smuzhiyun 174*4882a593Smuzhiyun/* Subtraction gives a negative result, 175*4882a593Smuzhiyun negate the result before division. */ 176*4882a593Smuzhiyun notl %edx 177*4882a593Smuzhiyun notl %eax 178*4882a593Smuzhiyun addl $1,%eax 179*4882a593Smuzhiyun adcl $0,%edx 180*4882a593Smuzhiyun 181*4882a593Smuzhiyun divl %esi 182*4882a593Smuzhiyun movl %eax,%ecx 183*4882a593Smuzhiyun 184*4882a593Smuzhiyun movl %edx,%eax 185*4882a593Smuzhiyun divl %esi 186*4882a593Smuzhiyun jmp sqrt_stage_2_finish 187*4882a593Smuzhiyun 188*4882a593Smuzhiyunsqrt_stage_2_positive: 189*4882a593Smuzhiyun divl %esi 190*4882a593Smuzhiyun movl %eax,%ecx 191*4882a593Smuzhiyun 192*4882a593Smuzhiyun movl %edx,%eax 193*4882a593Smuzhiyun divl %esi 194*4882a593Smuzhiyun 195*4882a593Smuzhiyun notl %ecx 196*4882a593Smuzhiyun notl %eax 197*4882a593Smuzhiyun addl $1,%eax 198*4882a593Smuzhiyun adcl $0,%ecx 199*4882a593Smuzhiyun 200*4882a593Smuzhiyunsqrt_stage_2_finish: 201*4882a593Smuzhiyun sarl $1,%ecx /* divide by 2 */ 202*4882a593Smuzhiyun rcrl $1,%eax 203*4882a593Smuzhiyun 204*4882a593Smuzhiyun /* Form the new estimate in %esi:%edi */ 205*4882a593Smuzhiyun movl %eax,%edi 206*4882a593Smuzhiyun addl %ecx,%esi 207*4882a593Smuzhiyun 208*4882a593Smuzhiyun jnz sqrt_stage_2_done /* result should be [1..2) */ 209*4882a593Smuzhiyun 210*4882a593Smuzhiyun#ifdef PARANOID 211*4882a593Smuzhiyun/* It should be possible to get here only if the arg is ffff....ffff */ 212*4882a593Smuzhiyun cmpl $0xffffffff,FPU_fsqrt_arg_1 213*4882a593Smuzhiyun jnz sqrt_stage_2_error 214*4882a593Smuzhiyun#endif /* PARANOID */ 215*4882a593Smuzhiyun 216*4882a593Smuzhiyun/* The best rounded result. */ 217*4882a593Smuzhiyun xorl %eax,%eax 218*4882a593Smuzhiyun decl %eax 219*4882a593Smuzhiyun movl %eax,%edi 220*4882a593Smuzhiyun movl %eax,%esi 221*4882a593Smuzhiyun movl $0x7fffffff,%eax 222*4882a593Smuzhiyun jmp sqrt_round_result 223*4882a593Smuzhiyun 224*4882a593Smuzhiyun#ifdef PARANOID 225*4882a593Smuzhiyunsqrt_stage_2_error: 226*4882a593Smuzhiyun pushl EX_INTERNAL|0x213 227*4882a593Smuzhiyun call EXCEPTION 228*4882a593Smuzhiyun#endif /* PARANOID */ 229*4882a593Smuzhiyun 230*4882a593Smuzhiyunsqrt_stage_2_done: 231*4882a593Smuzhiyun 232*4882a593Smuzhiyun/* Now the square root has been computed to better than 60 bits. */ 233*4882a593Smuzhiyun 234*4882a593Smuzhiyun/* Find the square of the guess. */ 235*4882a593Smuzhiyun movl %edi,%eax /* ls word of guess */ 236*4882a593Smuzhiyun mull %edi 237*4882a593Smuzhiyun movl %edx,FPU_accum_1 238*4882a593Smuzhiyun 239*4882a593Smuzhiyun movl %esi,%eax 240*4882a593Smuzhiyun mull %esi 241*4882a593Smuzhiyun movl %edx,FPU_accum_3 242*4882a593Smuzhiyun movl %eax,FPU_accum_2 243*4882a593Smuzhiyun 244*4882a593Smuzhiyun movl %edi,%eax 245*4882a593Smuzhiyun mull %esi 246*4882a593Smuzhiyun addl %eax,FPU_accum_1 247*4882a593Smuzhiyun adcl %edx,FPU_accum_2 248*4882a593Smuzhiyun adcl $0,FPU_accum_3 249*4882a593Smuzhiyun 250*4882a593Smuzhiyun/* movl %esi,%eax */ 251*4882a593Smuzhiyun/* mull %edi */ 252*4882a593Smuzhiyun addl %eax,FPU_accum_1 253*4882a593Smuzhiyun adcl %edx,FPU_accum_2 254*4882a593Smuzhiyun adcl $0,FPU_accum_3 255*4882a593Smuzhiyun 256*4882a593Smuzhiyun/* guess^2 now in FPU_accum_3:FPU_accum_2:FPU_accum_1 */ 257*4882a593Smuzhiyun 258*4882a593Smuzhiyun movl FPU_fsqrt_arg_0,%eax /* get normalized n */ 259*4882a593Smuzhiyun subl %eax,FPU_accum_1 260*4882a593Smuzhiyun movl FPU_fsqrt_arg_1,%eax 261*4882a593Smuzhiyun sbbl %eax,FPU_accum_2 262*4882a593Smuzhiyun movl FPU_fsqrt_arg_2,%eax /* ms word of normalized n */ 263*4882a593Smuzhiyun sbbl %eax,FPU_accum_3 264*4882a593Smuzhiyun jnc sqrt_stage_3_positive 265*4882a593Smuzhiyun 266*4882a593Smuzhiyun/* Subtraction gives a negative result, 267*4882a593Smuzhiyun negate the result before division */ 268*4882a593Smuzhiyun notl FPU_accum_1 269*4882a593Smuzhiyun notl FPU_accum_2 270*4882a593Smuzhiyun notl FPU_accum_3 271*4882a593Smuzhiyun addl $1,FPU_accum_1 272*4882a593Smuzhiyun adcl $0,FPU_accum_2 273*4882a593Smuzhiyun 274*4882a593Smuzhiyun#ifdef PARANOID 275*4882a593Smuzhiyun adcl $0,FPU_accum_3 /* This must be zero */ 276*4882a593Smuzhiyun jz sqrt_stage_3_no_error 277*4882a593Smuzhiyun 278*4882a593Smuzhiyunsqrt_stage_3_error: 279*4882a593Smuzhiyun pushl EX_INTERNAL|0x207 280*4882a593Smuzhiyun call EXCEPTION 281*4882a593Smuzhiyun 282*4882a593Smuzhiyunsqrt_stage_3_no_error: 283*4882a593Smuzhiyun#endif /* PARANOID */ 284*4882a593Smuzhiyun 285*4882a593Smuzhiyun movl FPU_accum_2,%edx 286*4882a593Smuzhiyun movl FPU_accum_1,%eax 287*4882a593Smuzhiyun divl %esi 288*4882a593Smuzhiyun movl %eax,%ecx 289*4882a593Smuzhiyun 290*4882a593Smuzhiyun movl %edx,%eax 291*4882a593Smuzhiyun divl %esi 292*4882a593Smuzhiyun 293*4882a593Smuzhiyun sarl $1,%ecx /* divide by 2 */ 294*4882a593Smuzhiyun rcrl $1,%eax 295*4882a593Smuzhiyun 296*4882a593Smuzhiyun /* prepare to round the result */ 297*4882a593Smuzhiyun 298*4882a593Smuzhiyun addl %ecx,%edi 299*4882a593Smuzhiyun adcl $0,%esi 300*4882a593Smuzhiyun 301*4882a593Smuzhiyun jmp sqrt_stage_3_finished 302*4882a593Smuzhiyun 303*4882a593Smuzhiyunsqrt_stage_3_positive: 304*4882a593Smuzhiyun movl FPU_accum_2,%edx 305*4882a593Smuzhiyun movl FPU_accum_1,%eax 306*4882a593Smuzhiyun divl %esi 307*4882a593Smuzhiyun movl %eax,%ecx 308*4882a593Smuzhiyun 309*4882a593Smuzhiyun movl %edx,%eax 310*4882a593Smuzhiyun divl %esi 311*4882a593Smuzhiyun 312*4882a593Smuzhiyun sarl $1,%ecx /* divide by 2 */ 313*4882a593Smuzhiyun rcrl $1,%eax 314*4882a593Smuzhiyun 315*4882a593Smuzhiyun /* prepare to round the result */ 316*4882a593Smuzhiyun 317*4882a593Smuzhiyun notl %eax /* Negate the correction term */ 318*4882a593Smuzhiyun notl %ecx 319*4882a593Smuzhiyun addl $1,%eax 320*4882a593Smuzhiyun adcl $0,%ecx /* carry here ==> correction == 0 */ 321*4882a593Smuzhiyun adcl $0xffffffff,%esi 322*4882a593Smuzhiyun 323*4882a593Smuzhiyun addl %ecx,%edi 324*4882a593Smuzhiyun adcl $0,%esi 325*4882a593Smuzhiyun 326*4882a593Smuzhiyunsqrt_stage_3_finished: 327*4882a593Smuzhiyun 328*4882a593Smuzhiyun/* 329*4882a593Smuzhiyun * The result in %esi:%edi:%esi should be good to about 90 bits here, 330*4882a593Smuzhiyun * and the rounding information here does not have sufficient accuracy 331*4882a593Smuzhiyun * in a few rare cases. 332*4882a593Smuzhiyun */ 333*4882a593Smuzhiyun cmpl $0xffffffe0,%eax 334*4882a593Smuzhiyun ja sqrt_near_exact_x 335*4882a593Smuzhiyun 336*4882a593Smuzhiyun cmpl $0x00000020,%eax 337*4882a593Smuzhiyun jb sqrt_near_exact 338*4882a593Smuzhiyun 339*4882a593Smuzhiyun cmpl $0x7fffffe0,%eax 340*4882a593Smuzhiyun jb sqrt_round_result 341*4882a593Smuzhiyun 342*4882a593Smuzhiyun cmpl $0x80000020,%eax 343*4882a593Smuzhiyun jb sqrt_get_more_precision 344*4882a593Smuzhiyun 345*4882a593Smuzhiyunsqrt_round_result: 346*4882a593Smuzhiyun/* Set up for rounding operations */ 347*4882a593Smuzhiyun movl %eax,%edx 348*4882a593Smuzhiyun movl %esi,%eax 349*4882a593Smuzhiyun movl %edi,%ebx 350*4882a593Smuzhiyun movl PARAM1,%edi 351*4882a593Smuzhiyun movw EXP_BIAS,EXP(%edi) /* Result is in [1.0 .. 2.0) */ 352*4882a593Smuzhiyun jmp fpu_reg_round 353*4882a593Smuzhiyun 354*4882a593Smuzhiyun 355*4882a593Smuzhiyunsqrt_near_exact_x: 356*4882a593Smuzhiyun/* First, the estimate must be rounded up. */ 357*4882a593Smuzhiyun addl $1,%edi 358*4882a593Smuzhiyun adcl $0,%esi 359*4882a593Smuzhiyun 360*4882a593Smuzhiyunsqrt_near_exact: 361*4882a593Smuzhiyun/* 362*4882a593Smuzhiyun * This is an easy case because x^1/2 is monotonic. 363*4882a593Smuzhiyun * We need just find the square of our estimate, compare it 364*4882a593Smuzhiyun * with the argument, and deduce whether our estimate is 365*4882a593Smuzhiyun * above, below, or exact. We use the fact that the estimate 366*4882a593Smuzhiyun * is known to be accurate to about 90 bits. 367*4882a593Smuzhiyun */ 368*4882a593Smuzhiyun movl %edi,%eax /* ls word of guess */ 369*4882a593Smuzhiyun mull %edi 370*4882a593Smuzhiyun movl %edx,%ebx /* 2nd ls word of square */ 371*4882a593Smuzhiyun movl %eax,%ecx /* ls word of square */ 372*4882a593Smuzhiyun 373*4882a593Smuzhiyun movl %edi,%eax 374*4882a593Smuzhiyun mull %esi 375*4882a593Smuzhiyun addl %eax,%ebx 376*4882a593Smuzhiyun addl %eax,%ebx 377*4882a593Smuzhiyun 378*4882a593Smuzhiyun#ifdef PARANOID 379*4882a593Smuzhiyun cmp $0xffffffb0,%ebx 380*4882a593Smuzhiyun jb sqrt_near_exact_ok 381*4882a593Smuzhiyun 382*4882a593Smuzhiyun cmp $0x00000050,%ebx 383*4882a593Smuzhiyun ja sqrt_near_exact_ok 384*4882a593Smuzhiyun 385*4882a593Smuzhiyun pushl EX_INTERNAL|0x214 386*4882a593Smuzhiyun call EXCEPTION 387*4882a593Smuzhiyun 388*4882a593Smuzhiyunsqrt_near_exact_ok: 389*4882a593Smuzhiyun#endif /* PARANOID */ 390*4882a593Smuzhiyun 391*4882a593Smuzhiyun or %ebx,%ebx 392*4882a593Smuzhiyun js sqrt_near_exact_small 393*4882a593Smuzhiyun 394*4882a593Smuzhiyun jnz sqrt_near_exact_large 395*4882a593Smuzhiyun 396*4882a593Smuzhiyun or %ebx,%edx 397*4882a593Smuzhiyun jnz sqrt_near_exact_large 398*4882a593Smuzhiyun 399*4882a593Smuzhiyun/* Our estimate is exactly the right answer */ 400*4882a593Smuzhiyun xorl %eax,%eax 401*4882a593Smuzhiyun jmp sqrt_round_result 402*4882a593Smuzhiyun 403*4882a593Smuzhiyunsqrt_near_exact_small: 404*4882a593Smuzhiyun/* Our estimate is too small */ 405*4882a593Smuzhiyun movl $0x000000ff,%eax 406*4882a593Smuzhiyun jmp sqrt_round_result 407*4882a593Smuzhiyun 408*4882a593Smuzhiyunsqrt_near_exact_large: 409*4882a593Smuzhiyun/* Our estimate is too large, we need to decrement it */ 410*4882a593Smuzhiyun subl $1,%edi 411*4882a593Smuzhiyun sbbl $0,%esi 412*4882a593Smuzhiyun movl $0xffffff00,%eax 413*4882a593Smuzhiyun jmp sqrt_round_result 414*4882a593Smuzhiyun 415*4882a593Smuzhiyun 416*4882a593Smuzhiyunsqrt_get_more_precision: 417*4882a593Smuzhiyun/* This case is almost the same as the above, except we start 418*4882a593Smuzhiyun with an extra bit of precision in the estimate. */ 419*4882a593Smuzhiyun stc /* The extra bit. */ 420*4882a593Smuzhiyun rcll $1,%edi /* Shift the estimate left one bit */ 421*4882a593Smuzhiyun rcll $1,%esi 422*4882a593Smuzhiyun 423*4882a593Smuzhiyun movl %edi,%eax /* ls word of guess */ 424*4882a593Smuzhiyun mull %edi 425*4882a593Smuzhiyun movl %edx,%ebx /* 2nd ls word of square */ 426*4882a593Smuzhiyun movl %eax,%ecx /* ls word of square */ 427*4882a593Smuzhiyun 428*4882a593Smuzhiyun movl %edi,%eax 429*4882a593Smuzhiyun mull %esi 430*4882a593Smuzhiyun addl %eax,%ebx 431*4882a593Smuzhiyun addl %eax,%ebx 432*4882a593Smuzhiyun 433*4882a593Smuzhiyun/* Put our estimate back to its original value */ 434*4882a593Smuzhiyun stc /* The ms bit. */ 435*4882a593Smuzhiyun rcrl $1,%esi /* Shift the estimate left one bit */ 436*4882a593Smuzhiyun rcrl $1,%edi 437*4882a593Smuzhiyun 438*4882a593Smuzhiyun#ifdef PARANOID 439*4882a593Smuzhiyun cmp $0xffffff60,%ebx 440*4882a593Smuzhiyun jb sqrt_more_prec_ok 441*4882a593Smuzhiyun 442*4882a593Smuzhiyun cmp $0x000000a0,%ebx 443*4882a593Smuzhiyun ja sqrt_more_prec_ok 444*4882a593Smuzhiyun 445*4882a593Smuzhiyun pushl EX_INTERNAL|0x215 446*4882a593Smuzhiyun call EXCEPTION 447*4882a593Smuzhiyun 448*4882a593Smuzhiyunsqrt_more_prec_ok: 449*4882a593Smuzhiyun#endif /* PARANOID */ 450*4882a593Smuzhiyun 451*4882a593Smuzhiyun or %ebx,%ebx 452*4882a593Smuzhiyun js sqrt_more_prec_small 453*4882a593Smuzhiyun 454*4882a593Smuzhiyun jnz sqrt_more_prec_large 455*4882a593Smuzhiyun 456*4882a593Smuzhiyun or %ebx,%ecx 457*4882a593Smuzhiyun jnz sqrt_more_prec_large 458*4882a593Smuzhiyun 459*4882a593Smuzhiyun/* Our estimate is exactly the right answer */ 460*4882a593Smuzhiyun movl $0x80000000,%eax 461*4882a593Smuzhiyun jmp sqrt_round_result 462*4882a593Smuzhiyun 463*4882a593Smuzhiyunsqrt_more_prec_small: 464*4882a593Smuzhiyun/* Our estimate is too small */ 465*4882a593Smuzhiyun movl $0x800000ff,%eax 466*4882a593Smuzhiyun jmp sqrt_round_result 467*4882a593Smuzhiyun 468*4882a593Smuzhiyunsqrt_more_prec_large: 469*4882a593Smuzhiyun/* Our estimate is too large */ 470*4882a593Smuzhiyun movl $0x7fffff00,%eax 471*4882a593Smuzhiyun jmp sqrt_round_result 472*4882a593SmuzhiyunSYM_FUNC_END(wm_sqrt) 473