xref: /OK3568_Linux_fs/kernel/arch/x86/math-emu/polynom_Xsig.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/* SPDX-License-Identifier: GPL-2.0 */
/*---------------------------------------------------------------------------+
 |  polynomial_Xsig.S                                                        |
 |                                                                           |
 | Fixed point arithmetic polynomial evaluation.                             |
 |                                                                           |
 | Copyright (C) 1992,1993,1994,1995                                         |
 |                       W. Metzenthen, 22 Parker St, Ormond, Vic 3163,      |
 |                       Australia.  E-mail billm@jacobi.maths.monash.edu.au |
 |                                                                           |
 | Call from C as:                                                           |
 |   void polynomial_Xsig(Xsig *accum, unsigned long long *x,                |
 |                        unsigned long long terms[], int n)                 |
 |                                                                           |
 | Computes:                                                                 |
 | terms[0] + x*(terms[1] + x*(terms[2] + ... + x*terms[n]))                 |
 | and adds the result to the 12 byte Xsig.                                  |
 | Here n is the index of the highest-order term, so terms[] holds n+1       |
 | entries, and x is passed by address (the code dereferences it).           |
 | The terms[] are each 8 bytes, but all computation is performed to 12 byte |
 | precision.                                                                |
 |                                                                           |
 | This function must be used carefully: most overflow of intermediate       |
 | results is controlled, but overflow of the result is not.                 |
 |                                                                           |
 +---------------------------------------------------------------------------*/
25*4882a593Smuzhiyun	.file	"polynomial_Xsig.S"
26*4882a593Smuzhiyun
27*4882a593Smuzhiyun#include "fpu_emu.h"
28*4882a593Smuzhiyun
29*4882a593Smuzhiyun
30*4882a593Smuzhiyun#define	TERM_SIZE	$8
31*4882a593Smuzhiyun#define	SUM_MS		-20(%ebp)	/* sum ms long */
32*4882a593Smuzhiyun#define SUM_MIDDLE	-24(%ebp)	/* sum middle long */
33*4882a593Smuzhiyun#define	SUM_LS		-28(%ebp)	/* sum ls long */
34*4882a593Smuzhiyun#define	ACCUM_MS	-4(%ebp)	/* accum ms long */
35*4882a593Smuzhiyun#define	ACCUM_MIDDLE	-8(%ebp)	/* accum middle long */
36*4882a593Smuzhiyun#define	ACCUM_LS	-12(%ebp)	/* accum ls long */
37*4882a593Smuzhiyun#define OVERFLOWED      -16(%ebp)	/* addition overflow flag */
38*4882a593Smuzhiyun
39*4882a593Smuzhiyun.text
/*
 * polynomial_Xsig - evaluate a fixed point polynomial and add it to *accum.
 *
 * Arguments are read through the PARAM1..PARAM4 macros, which are not
 * defined in this file (presumably the C stack argument slots supplied by
 * fpu_emu.h -- confirm there):
 *   PARAM1  accum  pointer to the 12 byte value that is updated in place
 *   PARAM2  x      pointer to the 8 byte multiplier
 *   PARAM3  terms  array of 8 byte coefficients
 *   PARAM4  n      index of the highest-order coefficient.  terms[n] is
 *                  read first, and is read before n is tested, so the
 *                  array must hold n+1 entries.  The argument slot itself
 *                  is used as the loop counter and is modified.
 *
 * The sum is built by Horner's rule starting from terms[n].  Each pass
 * multiplies the upper 64 bits of the running sum by x, keeps the upper
 * 96 bits of the 128 bit product, and then adds the next lower coefficient.
 *
 * %esi, %edi and %ebx are saved and restored.  %eax, %edx and the flags
 * are clobbered.
 */
SYM_FUNC_START(polynomial_Xsig)
	pushl	%ebp
	movl	%esp,%ebp
	subl	$32,%esp		/* space for the %ebp-relative locals */
	pushl	%esi
	pushl	%edi
	pushl	%ebx

	movl	PARAM2,%esi		/* %esi = address of x */
	movl	PARAM3,%edi		/* %edi = terms */

	movl	TERM_SIZE,%eax
	mull	PARAM4			/* %eax = 8 * n */
	addl	%eax,%edi		/* %edi = &terms[n] */

	/* Seed the sum with terms[n] in its upper 64 bits, ls long = 0. */
	movl	4(%edi),%edx		/* terms[n] ms long */
	movl	%edx,SUM_MS
	movl	(%edi),%edx		/* terms[n] ls long */
	movl	%edx,SUM_MIDDLE
	xor	%eax,%eax
	movl	%eax,SUM_LS
	movb	%al,OVERFLOWED		/* no carry pending yet */

	subl	TERM_SIZE,%edi		/* step down to terms[n-1] */
	decl	PARAM4
	js	L_accum_done		/* n was 0: the sum is just terms[0] */

L_accum_loop:
	/*
	 * ACCUM = upper 96 bits of (SUM_MS:SUM_MIDDLE) * x, assembled from
	 * the four 32x32 partial products.  SUM_LS takes no part in the
	 * multiplication.
	 */
	xor	%eax,%eax
	movl	%eax,ACCUM_MS
	movl	%eax,ACCUM_MIDDLE

	movl	SUM_MIDDLE,%eax
	mull	(%esi)			/* x ls long */
	movl	%edx,ACCUM_LS		/* low half of this product is dropped */

	movl	SUM_MIDDLE,%eax
	mull	4(%esi)			/* x ms long */
	addl	%eax,ACCUM_LS
	adcl	%edx,ACCUM_MIDDLE
	adcl	$0,ACCUM_MS

	movl	SUM_MS,%eax
	mull	(%esi)			/* x ls long */
	addl	%eax,ACCUM_LS
	adcl	%edx,ACCUM_MIDDLE
	adcl	$0,ACCUM_MS

	movl	SUM_MS,%eax
	mull	4(%esi)			/* x ms long */
	addl	%eax,ACCUM_MIDDLE
	adcl	%edx,ACCUM_MS

	/*
	 * A carry out of the previous addition stands for one extra unit
	 * above SUM_MS; its contribution to the product is x itself, added
	 * into the upper 64 bits of ACCUM.
	 */
	testb	$0xff,OVERFLOWED
	jz	L_no_overflow

	movl	(%esi),%eax
	addl	%eax,ACCUM_MIDDLE
	movl	4(%esi),%eax
	adcl	%eax,ACCUM_MS		/* This could overflow too */

L_no_overflow:

/*
 * Now put the sum of next term and the accumulator
 * into the sum register
 *
 * NOTE(review): the ls long of the term is added into the ls long of the
 * sum as well as into its middle long.  Confirm that this is intended
 * before changing it; results depend on it.
 */
	movl	ACCUM_LS,%eax
	addl	(%edi),%eax		/* term ls long */
	movl	%eax,SUM_LS
	movl	ACCUM_MIDDLE,%eax
	adcl	(%edi),%eax		/* term ls long */
	movl	%eax,SUM_MIDDLE
	movl	ACCUM_MS,%eax
	adcl	4(%edi),%eax		/* term ms long */
	movl	%eax,SUM_MS
	sbbb	%al,%al			/* %al = 0xff if the add carried, else 0 */
	movb	%al,OVERFLOWED		/* Used in the next iteration */

	subl	TERM_SIZE,%edi		/* step down to the next lower term */
	decl	PARAM4
	jns	L_accum_loop		/* loop until terms[0] has been added */

L_accum_done:
	/*
	 * Add the 12 byte sum into *accum.  A carry out of the ms long is
	 * not recorded anywhere.
	 */
	movl	PARAM1,%edi		/* accum */
	movl	SUM_LS,%eax
	addl	%eax,(%edi)
	movl	SUM_MIDDLE,%eax
	adcl	%eax,4(%edi)
	movl	SUM_MS,%eax
	adcl	%eax,8(%edi)

	popl	%ebx
	popl	%edi
	popl	%esi
	leave
	RET
SYM_FUNC_END(polynomial_Xsig)
138