xref: /OK3568_Linux_fs/kernel/arch/x86/math-emu/wm_sqrt.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0 */
2*4882a593Smuzhiyun	.file	"wm_sqrt.S"
3*4882a593Smuzhiyun/*---------------------------------------------------------------------------+
4*4882a593Smuzhiyun |  wm_sqrt.S                                                                |
5*4882a593Smuzhiyun |                                                                           |
6*4882a593Smuzhiyun | Fixed point arithmetic square root evaluation.                            |
7*4882a593Smuzhiyun |                                                                           |
8*4882a593Smuzhiyun | Copyright (C) 1992,1993,1995,1997                                         |
9*4882a593Smuzhiyun |                       W. Metzenthen, 22 Parker St, Ormond, Vic 3163,      |
10*4882a593Smuzhiyun |                       Australia.  E-mail billm@suburbia.net               |
11*4882a593Smuzhiyun |                                                                           |
12*4882a593Smuzhiyun | Call from C as:                                                           |
13*4882a593Smuzhiyun |    int wm_sqrt(FPU_REG *n, unsigned int control_word)                     |
14*4882a593Smuzhiyun |                                                                           |
15*4882a593Smuzhiyun +---------------------------------------------------------------------------*/
16*4882a593Smuzhiyun
17*4882a593Smuzhiyun/*---------------------------------------------------------------------------+
18*4882a593Smuzhiyun |  wm_sqrt(FPU_REG *n, unsigned int control_word)                           |
19*4882a593Smuzhiyun |    returns the square root of n in n.                                     |
20*4882a593Smuzhiyun |                                                                           |
21*4882a593Smuzhiyun |  Use Newton's method to compute the square root of a number, which must   |
22*4882a593Smuzhiyun |  be in the range  [1.0 .. 4.0),  to 64 bits accuracy.                     |
23*4882a593Smuzhiyun |  Does not check the sign or tag of the argument.                          |
24*4882a593Smuzhiyun |  Sets the exponent, but not the sign or tag of the result.                |
25*4882a593Smuzhiyun |                                                                           |
26*4882a593Smuzhiyun |  The guess is kept in %esi:%edi                                           |
27*4882a593Smuzhiyun +---------------------------------------------------------------------------*/
28*4882a593Smuzhiyun
29*4882a593Smuzhiyun#include "exception.h"
30*4882a593Smuzhiyun#include "fpu_emu.h"
31*4882a593Smuzhiyun
32*4882a593Smuzhiyun
33*4882a593Smuzhiyun#ifndef NON_REENTRANT_FPU
34*4882a593Smuzhiyun/*	Local storage on the stack: */
35*4882a593Smuzhiyun#define FPU_accum_3	-4(%ebp)	/* ms word */
36*4882a593Smuzhiyun#define FPU_accum_2	-8(%ebp)
37*4882a593Smuzhiyun#define FPU_accum_1	-12(%ebp)
38*4882a593Smuzhiyun#define FPU_accum_0	-16(%ebp)
39*4882a593Smuzhiyun
40*4882a593Smuzhiyun/*
41*4882a593Smuzhiyun * The de-normalised argument:
42*4882a593Smuzhiyun *                  sq_2                  sq_1              sq_0
43*4882a593Smuzhiyun *        b b b b b b b ... b b b   b b b .... b b b   b 0 0 0 ... 0
44*4882a593Smuzhiyun *           ^ binary point here
45*4882a593Smuzhiyun */
46*4882a593Smuzhiyun#define FPU_fsqrt_arg_2	-20(%ebp)	/* ms word */
47*4882a593Smuzhiyun#define FPU_fsqrt_arg_1	-24(%ebp)
48*4882a593Smuzhiyun#define FPU_fsqrt_arg_0	-28(%ebp)	/* ls word, at most the ms bit is set */
49*4882a593Smuzhiyun
50*4882a593Smuzhiyun#else
51*4882a593Smuzhiyun/*	Local storage in a static area: */
52*4882a593Smuzhiyun.data
53*4882a593Smuzhiyun	.align 4,0
54*4882a593SmuzhiyunFPU_accum_3:
55*4882a593Smuzhiyun	.long	0		/* ms word */
56*4882a593SmuzhiyunFPU_accum_2:
57*4882a593Smuzhiyun	.long	0
58*4882a593SmuzhiyunFPU_accum_1:
59*4882a593Smuzhiyun	.long	0
60*4882a593SmuzhiyunFPU_accum_0:
61*4882a593Smuzhiyun	.long	0
62*4882a593Smuzhiyun
63*4882a593Smuzhiyun/* The de-normalised argument:
64*4882a593Smuzhiyun                    sq_2                  sq_1              sq_0
65*4882a593Smuzhiyun          b b b b b b b ... b b b   b b b .... b b b   b 0 0 0 ... 0
66*4882a593Smuzhiyun             ^ binary point here
67*4882a593Smuzhiyun */
68*4882a593SmuzhiyunFPU_fsqrt_arg_2:
69*4882a593Smuzhiyun	.long	0		/* ms word */
70*4882a593SmuzhiyunFPU_fsqrt_arg_1:
71*4882a593Smuzhiyun	.long	0
72*4882a593SmuzhiyunFPU_fsqrt_arg_0:
73*4882a593Smuzhiyun	.long	0		/* ls word, at most the ms bit is set */
74*4882a593Smuzhiyun#endif /* NON_REENTRANT_FPU */
75*4882a593Smuzhiyun
76*4882a593Smuzhiyun
77*4882a593Smuzhiyun.text
78*4882a593SmuzhiyunSYM_FUNC_START(wm_sqrt)
79*4882a593Smuzhiyun	pushl	%ebp
80*4882a593Smuzhiyun	movl	%esp,%ebp
81*4882a593Smuzhiyun#ifndef NON_REENTRANT_FPU
82*4882a593Smuzhiyun	subl	$28,%esp
83*4882a593Smuzhiyun#endif /* NON_REENTRANT_FPU */
84*4882a593Smuzhiyun	pushl	%esi
85*4882a593Smuzhiyun	pushl	%edi
86*4882a593Smuzhiyun	pushl	%ebx
87*4882a593Smuzhiyun
88*4882a593Smuzhiyun	movl	PARAM1,%esi
89*4882a593Smuzhiyun
90*4882a593Smuzhiyun	movl	SIGH(%esi),%eax
91*4882a593Smuzhiyun	movl	SIGL(%esi),%ecx
92*4882a593Smuzhiyun	xorl	%edx,%edx
93*4882a593Smuzhiyun
94*4882a593Smuzhiyun/* We use a rough linear estimate for the first guess.. */
95*4882a593Smuzhiyun
96*4882a593Smuzhiyun	cmpw	EXP_BIAS,EXP(%esi)
97*4882a593Smuzhiyun	jnz	sqrt_arg_ge_2
98*4882a593Smuzhiyun
99*4882a593Smuzhiyun	shrl	$1,%eax			/* arg is in the range  [1.0 .. 2.0) */
100*4882a593Smuzhiyun	rcrl	$1,%ecx
101*4882a593Smuzhiyun	rcrl	$1,%edx
102*4882a593Smuzhiyun
103*4882a593Smuzhiyunsqrt_arg_ge_2:
104*4882a593Smuzhiyun/* From here on, n is never accessed directly again until it is
105*4882a593Smuzhiyun   replaced by the answer. */
106*4882a593Smuzhiyun
107*4882a593Smuzhiyun	movl	%eax,FPU_fsqrt_arg_2		/* ms word of n */
108*4882a593Smuzhiyun	movl	%ecx,FPU_fsqrt_arg_1
109*4882a593Smuzhiyun	movl	%edx,FPU_fsqrt_arg_0
110*4882a593Smuzhiyun
111*4882a593Smuzhiyun/* Make a linear first estimate */
112*4882a593Smuzhiyun	shrl	$1,%eax
113*4882a593Smuzhiyun	addl	$0x40000000,%eax
114*4882a593Smuzhiyun	movl	$0xaaaaaaaa,%ecx
115*4882a593Smuzhiyun	mull	%ecx
116*4882a593Smuzhiyun	shll	%edx			/* max result was 7fff... */
117*4882a593Smuzhiyun	testl	$0x80000000,%edx	/* but min was 3fff... */
118*4882a593Smuzhiyun	jnz	sqrt_prelim_no_adjust
119*4882a593Smuzhiyun
120*4882a593Smuzhiyun	movl	$0x80000000,%edx	/* round up */
121*4882a593Smuzhiyun
122*4882a593Smuzhiyunsqrt_prelim_no_adjust:
123*4882a593Smuzhiyun	movl	%edx,%esi	/* Our first guess */
124*4882a593Smuzhiyun
125*4882a593Smuzhiyun/* We have now computed (approx)   (2 + x) / 3, which forms the basis
126*4882a593Smuzhiyun   for a few iterations of Newton's method */
127*4882a593Smuzhiyun
128*4882a593Smuzhiyun	movl	FPU_fsqrt_arg_2,%ecx	/* ms word */
129*4882a593Smuzhiyun
130*4882a593Smuzhiyun/*
131*4882a593Smuzhiyun * From our initial estimate, three iterations are enough to get us
132*4882a593Smuzhiyun * to 30 bits or so. This will then allow two iterations at better
133*4882a593Smuzhiyun * precision to complete the process.
134*4882a593Smuzhiyun */
135*4882a593Smuzhiyun
136*4882a593Smuzhiyun/* Compute  (g + n/g)/2  at each iteration (g is the guess). */
137*4882a593Smuzhiyun	shrl	%ecx		/* Doing this first will prevent a divide */
138*4882a593Smuzhiyun				/* overflow later. */
139*4882a593Smuzhiyun
140*4882a593Smuzhiyun	movl	%ecx,%edx	/* msw of the arg / 2 */
141*4882a593Smuzhiyun	divl	%esi		/* current estimate */
142*4882a593Smuzhiyun	shrl	%esi		/* divide by 2 */
143*4882a593Smuzhiyun	addl	%eax,%esi	/* the new estimate */
144*4882a593Smuzhiyun
145*4882a593Smuzhiyun	movl	%ecx,%edx
146*4882a593Smuzhiyun	divl	%esi
147*4882a593Smuzhiyun	shrl	%esi
148*4882a593Smuzhiyun	addl	%eax,%esi
149*4882a593Smuzhiyun
150*4882a593Smuzhiyun	movl	%ecx,%edx
151*4882a593Smuzhiyun	divl	%esi
152*4882a593Smuzhiyun	shrl	%esi
153*4882a593Smuzhiyun	addl	%eax,%esi
154*4882a593Smuzhiyun
155*4882a593Smuzhiyun/*
156*4882a593Smuzhiyun * Now that an estimate accurate to about 30 bits has been obtained (in %esi),
157*4882a593Smuzhiyun * we improve it to 60 bits or so.
158*4882a593Smuzhiyun *
159*4882a593Smuzhiyun * The strategy from now on is to compute new estimates from
160*4882a593Smuzhiyun *      guess := guess + (n - guess^2) / (2 * guess)
161*4882a593Smuzhiyun */
162*4882a593Smuzhiyun
163*4882a593Smuzhiyun/* First, find the square of the guess */
164*4882a593Smuzhiyun	movl	%esi,%eax
165*4882a593Smuzhiyun	mull	%esi
166*4882a593Smuzhiyun/* guess^2 now in %edx:%eax */
167*4882a593Smuzhiyun
168*4882a593Smuzhiyun	movl	FPU_fsqrt_arg_1,%ecx
169*4882a593Smuzhiyun	subl	%ecx,%eax
170*4882a593Smuzhiyun	movl	FPU_fsqrt_arg_2,%ecx	/* ms word of normalized n */
171*4882a593Smuzhiyun	sbbl	%ecx,%edx
172*4882a593Smuzhiyun	jnc	sqrt_stage_2_positive
173*4882a593Smuzhiyun
174*4882a593Smuzhiyun/* Subtraction gives a negative result,
175*4882a593Smuzhiyun   negate the result before division. */
176*4882a593Smuzhiyun	notl	%edx
177*4882a593Smuzhiyun	notl	%eax
178*4882a593Smuzhiyun	addl	$1,%eax
179*4882a593Smuzhiyun	adcl	$0,%edx
180*4882a593Smuzhiyun
181*4882a593Smuzhiyun	divl	%esi
182*4882a593Smuzhiyun	movl	%eax,%ecx
183*4882a593Smuzhiyun
184*4882a593Smuzhiyun	movl	%edx,%eax
185*4882a593Smuzhiyun	divl	%esi
186*4882a593Smuzhiyun	jmp	sqrt_stage_2_finish
187*4882a593Smuzhiyun
188*4882a593Smuzhiyunsqrt_stage_2_positive:
189*4882a593Smuzhiyun	divl	%esi
190*4882a593Smuzhiyun	movl	%eax,%ecx
191*4882a593Smuzhiyun
192*4882a593Smuzhiyun	movl	%edx,%eax
193*4882a593Smuzhiyun	divl	%esi
194*4882a593Smuzhiyun
195*4882a593Smuzhiyun	notl	%ecx
196*4882a593Smuzhiyun	notl	%eax
197*4882a593Smuzhiyun	addl	$1,%eax
198*4882a593Smuzhiyun	adcl	$0,%ecx
199*4882a593Smuzhiyun
200*4882a593Smuzhiyunsqrt_stage_2_finish:
201*4882a593Smuzhiyun	sarl	$1,%ecx		/* divide by 2 */
202*4882a593Smuzhiyun	rcrl	$1,%eax
203*4882a593Smuzhiyun
204*4882a593Smuzhiyun	/* Form the new estimate in %esi:%edi */
205*4882a593Smuzhiyun	movl	%eax,%edi
206*4882a593Smuzhiyun	addl	%ecx,%esi
207*4882a593Smuzhiyun
208*4882a593Smuzhiyun	jnz	sqrt_stage_2_done	/* result should be [1..2) */
209*4882a593Smuzhiyun
210*4882a593Smuzhiyun#ifdef PARANOID
211*4882a593Smuzhiyun/* It should be possible to get here only if the arg is ffff....ffff */
212*4882a593Smuzhiyun	cmpl	$0xffffffff,FPU_fsqrt_arg_1
213*4882a593Smuzhiyun	jnz	sqrt_stage_2_error
214*4882a593Smuzhiyun#endif /* PARANOID */
215*4882a593Smuzhiyun
216*4882a593Smuzhiyun/* The best rounded result. */
217*4882a593Smuzhiyun	xorl	%eax,%eax
218*4882a593Smuzhiyun	decl	%eax
219*4882a593Smuzhiyun	movl	%eax,%edi
220*4882a593Smuzhiyun	movl	%eax,%esi
221*4882a593Smuzhiyun	movl	$0x7fffffff,%eax
222*4882a593Smuzhiyun	jmp	sqrt_round_result
223*4882a593Smuzhiyun
224*4882a593Smuzhiyun#ifdef PARANOID
225*4882a593Smuzhiyunsqrt_stage_2_error:
226*4882a593Smuzhiyun	pushl	EX_INTERNAL|0x213
227*4882a593Smuzhiyun	call	EXCEPTION
228*4882a593Smuzhiyun#endif /* PARANOID */
229*4882a593Smuzhiyun
230*4882a593Smuzhiyunsqrt_stage_2_done:
231*4882a593Smuzhiyun
232*4882a593Smuzhiyun/* Now the square root has been computed to better than 60 bits. */
233*4882a593Smuzhiyun
234*4882a593Smuzhiyun/* Find the square of the guess. */
235*4882a593Smuzhiyun	movl	%edi,%eax		/* ls word of guess */
236*4882a593Smuzhiyun	mull	%edi
237*4882a593Smuzhiyun	movl	%edx,FPU_accum_1
238*4882a593Smuzhiyun
239*4882a593Smuzhiyun	movl	%esi,%eax
240*4882a593Smuzhiyun	mull	%esi
241*4882a593Smuzhiyun	movl	%edx,FPU_accum_3
242*4882a593Smuzhiyun	movl	%eax,FPU_accum_2
243*4882a593Smuzhiyun
244*4882a593Smuzhiyun	movl	%edi,%eax
245*4882a593Smuzhiyun	mull	%esi
246*4882a593Smuzhiyun	addl	%eax,FPU_accum_1
247*4882a593Smuzhiyun	adcl	%edx,FPU_accum_2
248*4882a593Smuzhiyun	adcl	$0,FPU_accum_3
249*4882a593Smuzhiyun
250*4882a593Smuzhiyun/*	movl	%esi,%eax */
251*4882a593Smuzhiyun/*	mull	%edi */
252*4882a593Smuzhiyun	addl	%eax,FPU_accum_1
253*4882a593Smuzhiyun	adcl	%edx,FPU_accum_2
254*4882a593Smuzhiyun	adcl	$0,FPU_accum_3
255*4882a593Smuzhiyun
256*4882a593Smuzhiyun/* guess^2 now in FPU_accum_3:FPU_accum_2:FPU_accum_1 */
257*4882a593Smuzhiyun
258*4882a593Smuzhiyun	movl	FPU_fsqrt_arg_0,%eax		/* get normalized n */
259*4882a593Smuzhiyun	subl	%eax,FPU_accum_1
260*4882a593Smuzhiyun	movl	FPU_fsqrt_arg_1,%eax
261*4882a593Smuzhiyun	sbbl	%eax,FPU_accum_2
262*4882a593Smuzhiyun	movl	FPU_fsqrt_arg_2,%eax		/* ms word of normalized n */
263*4882a593Smuzhiyun	sbbl	%eax,FPU_accum_3
264*4882a593Smuzhiyun	jnc	sqrt_stage_3_positive
265*4882a593Smuzhiyun
266*4882a593Smuzhiyun/* Subtraction gives a negative result,
267*4882a593Smuzhiyun   negate the result before division */
268*4882a593Smuzhiyun	notl	FPU_accum_1
269*4882a593Smuzhiyun	notl	FPU_accum_2
270*4882a593Smuzhiyun	notl	FPU_accum_3
271*4882a593Smuzhiyun	addl	$1,FPU_accum_1
272*4882a593Smuzhiyun	adcl	$0,FPU_accum_2
273*4882a593Smuzhiyun
274*4882a593Smuzhiyun#ifdef PARANOID
275*4882a593Smuzhiyun	adcl	$0,FPU_accum_3	/* This must be zero */
276*4882a593Smuzhiyun	jz	sqrt_stage_3_no_error
277*4882a593Smuzhiyun
278*4882a593Smuzhiyunsqrt_stage_3_error:
279*4882a593Smuzhiyun	pushl	EX_INTERNAL|0x207
280*4882a593Smuzhiyun	call	EXCEPTION
281*4882a593Smuzhiyun
282*4882a593Smuzhiyunsqrt_stage_3_no_error:
283*4882a593Smuzhiyun#endif /* PARANOID */
284*4882a593Smuzhiyun
285*4882a593Smuzhiyun	movl	FPU_accum_2,%edx
286*4882a593Smuzhiyun	movl	FPU_accum_1,%eax
287*4882a593Smuzhiyun	divl	%esi
288*4882a593Smuzhiyun	movl	%eax,%ecx
289*4882a593Smuzhiyun
290*4882a593Smuzhiyun	movl	%edx,%eax
291*4882a593Smuzhiyun	divl	%esi
292*4882a593Smuzhiyun
293*4882a593Smuzhiyun	sarl	$1,%ecx		/* divide by 2 */
294*4882a593Smuzhiyun	rcrl	$1,%eax
295*4882a593Smuzhiyun
296*4882a593Smuzhiyun	/* prepare to round the result */
297*4882a593Smuzhiyun
298*4882a593Smuzhiyun	addl	%ecx,%edi
299*4882a593Smuzhiyun	adcl	$0,%esi
300*4882a593Smuzhiyun
301*4882a593Smuzhiyun	jmp	sqrt_stage_3_finished
302*4882a593Smuzhiyun
303*4882a593Smuzhiyunsqrt_stage_3_positive:
304*4882a593Smuzhiyun	movl	FPU_accum_2,%edx
305*4882a593Smuzhiyun	movl	FPU_accum_1,%eax
306*4882a593Smuzhiyun	divl	%esi
307*4882a593Smuzhiyun	movl	%eax,%ecx
308*4882a593Smuzhiyun
309*4882a593Smuzhiyun	movl	%edx,%eax
310*4882a593Smuzhiyun	divl	%esi
311*4882a593Smuzhiyun
312*4882a593Smuzhiyun	sarl	$1,%ecx		/* divide by 2 */
313*4882a593Smuzhiyun	rcrl	$1,%eax
314*4882a593Smuzhiyun
315*4882a593Smuzhiyun	/* prepare to round the result */
316*4882a593Smuzhiyun
317*4882a593Smuzhiyun	notl	%eax		/* Negate the correction term */
318*4882a593Smuzhiyun	notl	%ecx
319*4882a593Smuzhiyun	addl	$1,%eax
320*4882a593Smuzhiyun	adcl	$0,%ecx		/* carry here ==> correction == 0 */
321*4882a593Smuzhiyun	adcl	$0xffffffff,%esi
322*4882a593Smuzhiyun
323*4882a593Smuzhiyun	addl	%ecx,%edi
324*4882a593Smuzhiyun	adcl	$0,%esi
325*4882a593Smuzhiyun
326*4882a593Smuzhiyunsqrt_stage_3_finished:
327*4882a593Smuzhiyun
328*4882a593Smuzhiyun/*
329*4882a593Smuzhiyun * The result in %esi:%edi:%esi should be good to about 90 bits here,
330*4882a593Smuzhiyun * and the rounding information here does not have sufficient accuracy
331*4882a593Smuzhiyun * in a few rare cases.
332*4882a593Smuzhiyun */
333*4882a593Smuzhiyun	cmpl	$0xffffffe0,%eax
334*4882a593Smuzhiyun	ja	sqrt_near_exact_x
335*4882a593Smuzhiyun
336*4882a593Smuzhiyun	cmpl	$0x00000020,%eax
337*4882a593Smuzhiyun	jb	sqrt_near_exact
338*4882a593Smuzhiyun
339*4882a593Smuzhiyun	cmpl	$0x7fffffe0,%eax
340*4882a593Smuzhiyun	jb	sqrt_round_result
341*4882a593Smuzhiyun
342*4882a593Smuzhiyun	cmpl	$0x80000020,%eax
343*4882a593Smuzhiyun	jb	sqrt_get_more_precision
344*4882a593Smuzhiyun
345*4882a593Smuzhiyunsqrt_round_result:
346*4882a593Smuzhiyun/* Set up for rounding operations */
347*4882a593Smuzhiyun	movl	%eax,%edx
348*4882a593Smuzhiyun	movl	%esi,%eax
349*4882a593Smuzhiyun	movl	%edi,%ebx
350*4882a593Smuzhiyun	movl	PARAM1,%edi
351*4882a593Smuzhiyun	movw	EXP_BIAS,EXP(%edi)	/* Result is in  [1.0 .. 2.0) */
352*4882a593Smuzhiyun	jmp	fpu_reg_round
353*4882a593Smuzhiyun
354*4882a593Smuzhiyun
355*4882a593Smuzhiyunsqrt_near_exact_x:
356*4882a593Smuzhiyun/* First, the estimate must be rounded up. */
357*4882a593Smuzhiyun	addl	$1,%edi
358*4882a593Smuzhiyun	adcl	$0,%esi
359*4882a593Smuzhiyun
360*4882a593Smuzhiyunsqrt_near_exact:
361*4882a593Smuzhiyun/*
362*4882a593Smuzhiyun * This is an easy case because x^1/2 is monotonic.
363*4882a593Smuzhiyun * We need just find the square of our estimate, compare it
364*4882a593Smuzhiyun * with the argument, and deduce whether our estimate is
365*4882a593Smuzhiyun * above, below, or exact. We use the fact that the estimate
366*4882a593Smuzhiyun * is known to be accurate to about 90 bits.
367*4882a593Smuzhiyun */
368*4882a593Smuzhiyun	movl	%edi,%eax		/* ls word of guess */
369*4882a593Smuzhiyun	mull	%edi
370*4882a593Smuzhiyun	movl	%edx,%ebx		/* 2nd ls word of square */
371*4882a593Smuzhiyun	movl	%eax,%ecx		/* ls word of square */
372*4882a593Smuzhiyun
373*4882a593Smuzhiyun	movl	%edi,%eax
374*4882a593Smuzhiyun	mull	%esi
375*4882a593Smuzhiyun	addl	%eax,%ebx
376*4882a593Smuzhiyun	addl	%eax,%ebx
377*4882a593Smuzhiyun
378*4882a593Smuzhiyun#ifdef PARANOID
379*4882a593Smuzhiyun	cmp	$0xffffffb0,%ebx
380*4882a593Smuzhiyun	jb	sqrt_near_exact_ok
381*4882a593Smuzhiyun
382*4882a593Smuzhiyun	cmp	$0x00000050,%ebx
383*4882a593Smuzhiyun	ja	sqrt_near_exact_ok
384*4882a593Smuzhiyun
385*4882a593Smuzhiyun	pushl	EX_INTERNAL|0x214
386*4882a593Smuzhiyun	call	EXCEPTION
387*4882a593Smuzhiyun
388*4882a593Smuzhiyunsqrt_near_exact_ok:
389*4882a593Smuzhiyun#endif /* PARANOID */
390*4882a593Smuzhiyun
391*4882a593Smuzhiyun	or	%ebx,%ebx
392*4882a593Smuzhiyun	js	sqrt_near_exact_small
393*4882a593Smuzhiyun
394*4882a593Smuzhiyun	jnz	sqrt_near_exact_large
395*4882a593Smuzhiyun
396*4882a593Smuzhiyun	or	%ebx,%edx
397*4882a593Smuzhiyun	jnz	sqrt_near_exact_large
398*4882a593Smuzhiyun
399*4882a593Smuzhiyun/* Our estimate is exactly the right answer */
400*4882a593Smuzhiyun	xorl	%eax,%eax
401*4882a593Smuzhiyun	jmp	sqrt_round_result
402*4882a593Smuzhiyun
403*4882a593Smuzhiyunsqrt_near_exact_small:
404*4882a593Smuzhiyun/* Our estimate is too small */
405*4882a593Smuzhiyun	movl	$0x000000ff,%eax
406*4882a593Smuzhiyun	jmp	sqrt_round_result
407*4882a593Smuzhiyun
408*4882a593Smuzhiyunsqrt_near_exact_large:
409*4882a593Smuzhiyun/* Our estimate is too large, we need to decrement it */
410*4882a593Smuzhiyun	subl	$1,%edi
411*4882a593Smuzhiyun	sbbl	$0,%esi
412*4882a593Smuzhiyun	movl	$0xffffff00,%eax
413*4882a593Smuzhiyun	jmp	sqrt_round_result
414*4882a593Smuzhiyun
415*4882a593Smuzhiyun
416*4882a593Smuzhiyunsqrt_get_more_precision:
417*4882a593Smuzhiyun/* This case is almost the same as the above, except we start
418*4882a593Smuzhiyun   with an extra bit of precision in the estimate. */
419*4882a593Smuzhiyun	stc			/* The extra bit. */
420*4882a593Smuzhiyun	rcll	$1,%edi		/* Shift the estimate left one bit */
421*4882a593Smuzhiyun	rcll	$1,%esi
422*4882a593Smuzhiyun
423*4882a593Smuzhiyun	movl	%edi,%eax		/* ls word of guess */
424*4882a593Smuzhiyun	mull	%edi
425*4882a593Smuzhiyun	movl	%edx,%ebx		/* 2nd ls word of square */
426*4882a593Smuzhiyun	movl	%eax,%ecx		/* ls word of square */
427*4882a593Smuzhiyun
428*4882a593Smuzhiyun	movl	%edi,%eax
429*4882a593Smuzhiyun	mull	%esi
430*4882a593Smuzhiyun	addl	%eax,%ebx
431*4882a593Smuzhiyun	addl	%eax,%ebx
432*4882a593Smuzhiyun
433*4882a593Smuzhiyun/* Put our estimate back to its original value */
434*4882a593Smuzhiyun	stc			/* The ms bit. */
435*4882a593Smuzhiyun	rcrl	$1,%esi		/* Shift the estimate left one bit */
436*4882a593Smuzhiyun	rcrl	$1,%edi
437*4882a593Smuzhiyun
438*4882a593Smuzhiyun#ifdef PARANOID
439*4882a593Smuzhiyun	cmp	$0xffffff60,%ebx
440*4882a593Smuzhiyun	jb	sqrt_more_prec_ok
441*4882a593Smuzhiyun
442*4882a593Smuzhiyun	cmp	$0x000000a0,%ebx
443*4882a593Smuzhiyun	ja	sqrt_more_prec_ok
444*4882a593Smuzhiyun
445*4882a593Smuzhiyun	pushl	EX_INTERNAL|0x215
446*4882a593Smuzhiyun	call	EXCEPTION
447*4882a593Smuzhiyun
448*4882a593Smuzhiyunsqrt_more_prec_ok:
449*4882a593Smuzhiyun#endif /* PARANOID */
450*4882a593Smuzhiyun
451*4882a593Smuzhiyun	or	%ebx,%ebx
452*4882a593Smuzhiyun	js	sqrt_more_prec_small
453*4882a593Smuzhiyun
454*4882a593Smuzhiyun	jnz	sqrt_more_prec_large
455*4882a593Smuzhiyun
456*4882a593Smuzhiyun	or	%ebx,%ecx
457*4882a593Smuzhiyun	jnz	sqrt_more_prec_large
458*4882a593Smuzhiyun
459*4882a593Smuzhiyun/* Our estimate is exactly the right answer */
460*4882a593Smuzhiyun	movl	$0x80000000,%eax
461*4882a593Smuzhiyun	jmp	sqrt_round_result
462*4882a593Smuzhiyun
463*4882a593Smuzhiyunsqrt_more_prec_small:
464*4882a593Smuzhiyun/* Our estimate is too small */
465*4882a593Smuzhiyun	movl	$0x800000ff,%eax
466*4882a593Smuzhiyun	jmp	sqrt_round_result
467*4882a593Smuzhiyun
468*4882a593Smuzhiyunsqrt_more_prec_large:
469*4882a593Smuzhiyun/* Our estimate is too large */
470*4882a593Smuzhiyun	movl	$0x7fffff00,%eax
471*4882a593Smuzhiyun	jmp	sqrt_round_result
472*4882a593SmuzhiyunSYM_FUNC_END(wm_sqrt)
473