~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
MOTOROLA MICROPROCESSOR & MEMORY TECHNOLOGY GROUP
M68000 Hi-Performance Microprocessor Division
M68060 Software Package
Production Release P1.00 -- October 10, 1994

M68060 Software Package Copyright © 1993, 1994 Motorola Inc.  All rights reserved.

THE SOFTWARE is provided on an "AS IS" basis and without warranty.
To the maximum extent permitted by applicable law,
MOTOROLA DISCLAIMS ALL WARRANTIES WHETHER EXPRESS OR IMPLIED,
INCLUDING IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE
and any warranty against infringement with regard to the SOFTWARE
(INCLUDING ANY MODIFIED VERSIONS THEREOF) and any accompanying written materials.

To the maximum extent permitted by applicable law,
IN NO EVENT SHALL MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS,
BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY LOSS)
ARISING OF THE USE OR INABILITY TO USE THE SOFTWARE.
Motorola assumes no responsibility for the maintenance and support of the SOFTWARE.

You are hereby granted a copyright license to use, modify, and distribute the SOFTWARE
so long as this entire notice is retained without alteration in any modified and/or
redistributed versions, and that such modified versions are clearly identified as such.
No licenses are granted by implication, estoppel or otherwise under any patents
or trademarks of Motorola, Inc.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# litop.s:
#	This file is appended to the top of the 060FPLSP package
# and contains the entry points into the package. The user, in
# effect, branches to one of the branch table entries located here.
#

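# NOTE: a minimal calling sketch (not part of the original package notes).
# each table slot below should be 8 bytes long -- a 6-byte "bra.l" followed
# by the 2-byte pad word -- so, assuming the package is based at a label
# such as _060LSP_TABLE (a hypothetical name used here only for
# illustration), a caller could reach the unsigned 64-bit divide entry with:
#
#	bsr.l	_060LSP_TABLE+0x08	# 2nd slot = _060LSP__idivu64_
#
# the other slots follow in steps of 8: 0x00 = idivs64, 0x08 = idivu64,
# 0x10 = imuls64, 0x18 = imulu64, 0x20-0x48 = the cmp2 entries in the
# order listed below.
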
	bra.l	_060LSP__idivs64_
	short	0x0000
	bra.l	_060LSP__idivu64_
	short	0x0000

	bra.l	_060LSP__imuls64_
	short	0x0000
	bra.l	_060LSP__imulu64_
	short	0x0000

	bra.l	_060LSP__cmp2_Ab_
	short	0x0000
	bra.l	_060LSP__cmp2_Aw_
	short	0x0000
	bra.l	_060LSP__cmp2_Al_
	short	0x0000
	bra.l	_060LSP__cmp2_Db_
	short	0x0000
	bra.l	_060LSP__cmp2_Dw_
	short	0x0000
	bra.l	_060LSP__cmp2_Dl_
	short	0x0000

# leave room for future possible additions.
	align	0x200

#########################################################################
# XDEF ****************************************************************	#
#	_060LSP__idivu64_(): Emulate 64-bit unsigned div instruction.	#
#	_060LSP__idivs64_(): Emulate 64-bit signed div instruction.	#
#									#
#	This is the library version which is accessed as a subroutine	#
#	and therefore does not work exactly like the 680X0 div{s,u}.l	#
#	64-bit divide instruction.					#
#									#
# XREF ****************************************************************	#
#	None.								#
#									#
# INPUT ***************************************************************	#
#	0x4(sp)  = divisor						#
#	0x8(sp)  = hi(dividend)						#
#	0xc(sp)  = lo(dividend)						#
#	0x10(sp) = pointer to location to place quotient/remainder	#
#									#
# OUTPUT **************************************************************	#
#	0x10(sp) = points to location of remainder/quotient.		#
#		   remainder is in first longword, quotient is in 2nd.	#
#									#
# ALGORITHM ***********************************************************	#
#	If the operands are signed, make them unsigned and save the	#
# sign info for later. Separate out special cases like divide-by-zero	#
# or 32-bit divides if possible. Else, use a special math algorithm	#
# to calculate the result.						#
#	Restore sign info if signed instruction. Set the condition	#
# codes before performing the final "rts". If the divisor was equal to	#
# zero, then perform a divide-by-zero using a 16-bit implemented	#
# divide instruction. This way, the operating system can record that	#
# the event occurred even though it may not point to the correct place.	#
#									#
#########################################################################
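
# NOTE: a minimal usage sketch (an assumption added here, not taken from
# the original notes), derived from the INPUT/OUTPUT description above.
# the caller pushes the result pointer first and the divisor last, so the
# divisor ends up at 0x4(sp) after the subroutine call:
#
#	pea	result			# ptr to 2 longwords: rem, then quot
#	mov.l	lo_dividend,-(%sp)
#	mov.l	hi_dividend,-(%sp)
#	mov.l	divisor,-(%sp)
#	bsr.l	_060LSP__idivu64_	# or _060LSP__idivs64_
#	lea	0x10(%sp),%sp		# pop args; lea keeps returned ccodes
#
# "result", "lo_dividend", "hi_dividend" and "divisor" are placeholder
# operands for illustration only.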

set	POSNEG,		-1
set	NDIVISOR,	-2
set	NDIVIDEND,	-3
set	DDSECOND,	-4
set	DDNORMAL,	-8
set	DDQUOTIENT,	-12
set	DIV64_CC,	-16
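# NOTE (added): these negative offsets live in the 16-byte frame created
# by "link.w %a6,&-16" below: DIV64_CC at -16, DDQUOTIENT at -12,
# DDNORMAL at -8, then the four byte flags DDSECOND/NDIVIDEND/NDIVISOR/
# POSNEG at -4 through -1.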

##########
# divs.l #
##########
	global		_060LSP__idivs64_
_060LSP__idivs64_:
# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-16
	movm.l		&0x3f00,-(%sp)		# save d2-d7
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,DIV64_CC(%a6)
	st		POSNEG(%a6)		# signed operation
	bra.b		ldiv64_cont

##########
# divu.l #
##########
	global		_060LSP__idivu64_
_060LSP__idivu64_:
# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-16
	movm.l		&0x3f00,-(%sp)		# save d2-d7
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,DIV64_CC(%a6)
	sf		POSNEG(%a6)		# unsigned operation

ldiv64_cont:
	mov.l		0x8(%a6),%d7		# fetch divisor

	beq.w		ldiv64eq0		# divisor is = 0!!!

	mov.l		0xc(%a6), %d5		# get dividend hi
	mov.l		0x10(%a6), %d6		# get dividend lo

# separate signed and unsigned divide
	tst.b		POSNEG(%a6)		# signed or unsigned?
	beq.b		ldspecialcases		# use positive divide

# save the sign of the divisor
# make divisor unsigned if it's negative
	tst.l		%d7			# chk sign of divisor
	slt		NDIVISOR(%a6)		# save sign of divisor
	bpl.b		ldsgndividend
	neg.l		%d7			# complement negative divisor

# save the sign of the dividend
# make dividend unsigned if it's negative
ldsgndividend:
	tst.l		%d5			# chk sign of hi(dividend)
	slt		NDIVIDEND(%a6)		# save sign of dividend
	bpl.b		ldspecialcases

	mov.w		&0x0, %cc		# clear 'X' cc bit
	negx.l		%d6			# complement signed dividend
	negx.l		%d5

# extract some special cases:
#	- is (dividend == 0) ?
#	- is (hi(dividend) == 0 && (divisor <= lo(dividend))) ? (32-bit div)
ldspecialcases:
	tst.l		%d5			# is (hi(dividend) == 0)
	bne.b		ldnormaldivide		# no, so try it the long way

	tst.l		%d6			# is (lo(dividend) == 0), too
	beq.w		lddone			# yes, so (dividend == 0)

	cmp.l		%d7,%d6			# is (divisor <= lo(dividend))
	bls.b		ld32bitdivide		# yes, so use 32 bit divide

	exg		%d5,%d6			# q = 0, r = dividend
	bra.w		ldivfinish		# can't divide, we're done.

ld32bitdivide:
	tdivu.l		%d7, %d5:%d6		# it's only a 32/32 bit div!

	bra.b		ldivfinish

ldnormaldivide:
# last special case:
#	- is hi(dividend) >= divisor ? if yes, then overflow
	cmp.l		%d7,%d5
	bls.b		lddovf			# answer won't fit in 32 bits

# perform the divide algorithm:
	bsr.l		ldclassical		# do int divide

# separate into signed and unsigned finishes.
ldivfinish:
	tst.b		POSNEG(%a6)		# do divs, divu separately
	beq.b		lddone			# divu has no processing!!!

# it was a divs.l, so ccode setting is a little more complicated...
	tst.b		NDIVIDEND(%a6)		# remainder has same sign
	beq.b		ldcc			# as dividend.
	neg.l		%d5			# sgn(rem) = sgn(dividend)
ldcc:
	mov.b		NDIVISOR(%a6), %d0
	eor.b		%d0, NDIVIDEND(%a6)	# chk if quotient is negative
	beq.b		ldqpos			# branch to quot positive

# 0x80000000 is the largest number representable as a 32-bit negative
# number. the negative of 0x80000000 is 0x80000000.
	cmpi.l		%d6, &0x80000000	# will (-quot) fit in 32 bits?
	bhi.b		lddovf

	neg.l		%d6			# make (-quot) 2's comp

	bra.b		lddone

ldqpos:
	btst		&0x1f, %d6		# will (+quot) fit in 32 bits?
	bne.b		lddovf

lddone:
# if the register numbers are the same, only the quotient gets saved.
# so, if we always save the quotient second, we save ourselves a cmp&beq
	andi.w		&0x10,DIV64_CC(%a6)
	mov.w		DIV64_CC(%a6),%cc
	tst.l		%d6			# may set 'N' ccode bit

# here, the remainder is in %d5 and the quotient in %d6. save them at
# the location addressed by the result pointer passed at 0x14(%a6).
# use movm here to not disturb the condition codes.
ldexit:
	movm.l		&0x0060,([0x14,%a6])	# save result

# EPILOGUE BEGIN ########################################################
#	fmovm.l		(%sp)+,&0x0		# restore no fpregs
	movm.l		(%sp)+,&0x00fc		# restore d2-d7
	unlk		%a6
# EPILOGUE END ##########################################################

	rts

# the result should be the unchanged dividend
lddovf:
	mov.l		0xc(%a6), %d5		# get dividend hi
	mov.l		0x10(%a6), %d6		# get dividend lo

	andi.w		&0x1c,DIV64_CC(%a6)
	ori.w		&0x02,DIV64_CC(%a6)	# set 'V' ccode bit
	mov.w		DIV64_CC(%a6),%cc

	bra.b		ldexit

ldiv64eq0:
	mov.l		0xc(%a6),([0x14,%a6])
	mov.l		0x10(%a6),([0x14,%a6],0x4)

	mov.w		DIV64_CC(%a6),%cc

# EPILOGUE BEGIN ########################################################
#	fmovm.l		(%sp)+,&0x0		# restore no fpregs
	movm.l		(%sp)+,&0x00fc		# restore d2-d7
	unlk		%a6
# EPILOGUE END ##########################################################

	divu.w		&0x0,%d0		# force a divbyzero exception
	rts

###########################################################################
#########################################################################
# This routine uses the 'classical' Algorithm D from Donald Knuth's	#
# Art of Computer Programming, vol II, Seminumerical Algorithms.	#
# For this implementation b=2**16, and the target is U1U2U3U4/V1V2,	#
# where U,V are words of the quadword dividend and longword divisor,	#
# and U1, V1 are the most significant words.				#
#									#
# The most sig. longword of the 64 bit dividend must be in %d5, least	#
# in %d6. The divisor must be in %d7, and the signed/unsigned flag	#
# POSNEG(%a6) must be set (0x00=unsigned, 0xff=signed).			#
# The quotient is returned in %d6, remainder in %d5, unless the		#
# v (overflow) bit is set in the saved %ccr. If overflow, the dividend	#
# is unchanged.								#
#########################################################################
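
# NOTE (added): in Knuth's notation (AoCP vol. II, sec. 4.3.1) the trial
# quotient digit computed below is q^ = min(b-1, (U1*b + U2) div V1) with
# b = 2**16. normalizing the divisor so that V1 >= b/2 is what makes this
# estimate usable: after the additional V2 check in lddadj1, q^ is at most
# 1 too large, so the post-multiply correction needs at most one add-back.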
ldclassical:
# if the divisor msw is 0, use a simpler algorithm than the full-blown
# one at lddknuth:

	cmpi.l		%d7, &0xffff
	bhi.b		lddknuth		# go use D. Knuth algorithm

# Since the divisor is only a word (and larger than the mslw of the dividend),
# a simpler algorithm may be used:
# In the general case, four quotient words would be created by
# dividing the divisor word into each dividend word. In this case,
# the first two quotient words must be zero, or overflow would occur.
# Since we already checked this case above, we can treat the most significant
# longword of the dividend as (0) remainder (see Knuth) and merely complete
# the last two divisions to get a quotient longword and word remainder:
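# NOTE (added): a worked example of this two-step scheme: dividing the
# 64-bit value 0x00000001_00000005 by 3 first divides the upper digits
# 0x00010000 by 3 (quotient 0x5555, remainder 1), then divides the
# remainder:next-word value 0x00010005 by 3 (quotient 0x5557, remainder 0),
# giving the final quotient 0x55555557 and remainder 0.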

	clr.l		%d1
	swap		%d5			# same as r*b if previous step rqd
	swap		%d6			# get u3 to lsw position
	mov.w		%d6, %d5		# rb + u3

	divu.w		%d7, %d5

	mov.w		%d5, %d1		# first quotient word
	swap		%d6			# get u4
	mov.w		%d6, %d5		# rb + u4

	divu.w		%d7, %d5

	swap		%d1
	mov.w		%d5, %d1		# 2nd quotient 'digit'
	clr.w		%d5
	swap		%d5			# now remainder
	mov.l		%d1, %d6		# and quotient

	rts

lddknuth:
# In this algorithm, the divisor is treated as a 2 digit (word) number
# which is divided into a 3 digit (word) dividend to get one quotient
# digit (word). After subtraction, the dividend is shifted and the
# process repeated. Before beginning, the divisor and dividend are
# 'normalized' so that the process of estimating the quotient digit
# will yield verifiably correct results.
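# NOTE (added): for example, a divisor of 0x00012345 has bit 31 clear, so
# the loop below shifts divisor and dividend left 15 times (until the
# divisor becomes 0x91a28000) and records 15 in DDNORMAL; the remainder is
# shifted back right by the same count at lddnlp before being returned.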

	clr.l		DDNORMAL(%a6)		# count of shifts for normalization
	clr.b		DDSECOND(%a6)		# clear flag for quotient digits
	clr.l		%d1			# %d1 will hold trial quotient
lddnchk:
	btst		&31, %d7		# must we normalize? first word of
	bne.b		lddnormalized		# divisor (V1) must be >= 65536/2
	addq.l		&0x1, DDNORMAL(%a6)	# count normalization shifts
	lsl.l		&0x1, %d7		# shift the divisor
	lsl.l		&0x1, %d6		# shift u4,u3 with overflow to u2
	roxl.l		&0x1, %d5		# shift u1,u2
	bra.w		lddnchk
lddnormalized:

# Now calculate an estimate of the quotient words (msw first, then lsw).
# The comments use subscripts for the first quotient digit determination.
	mov.l		%d7, %d3		# divisor
	mov.l		%d5, %d2		# dividend mslw
	swap		%d2
	swap		%d3
	cmp.w		%d2, %d3		# V1 = U1 ?
	bne.b		lddqcalc1
	mov.w		&0xffff, %d1		# use max trial quotient word
	bra.b		lddadj0
lddqcalc1:
	mov.l		%d5, %d1

	divu.w		%d3, %d1		# use quotient of mslw/msw

	andi.l		&0x0000ffff, %d1	# zero any remainder
lddadj0:

# now test the trial quotient and adjust. This step plus the
# normalization assures (according to Knuth) that the trial
# quotient will be at worst 1 too large.
	mov.l		%d6, -(%sp)
	clr.w		%d6			# word u3 left
	swap		%d6			# in lsw position
lddadj1: mov.l		%d7, %d3
	mov.l		%d1, %d2
	mulu.w		%d7, %d2		# V2q
	swap		%d3
	mulu.w		%d1, %d3		# V1q
	mov.l		%d5, %d4		# U1U2
	sub.l		%d3, %d4		# U1U2 - V1q

	swap		%d4

	mov.w		%d4,%d0
	mov.w		%d6,%d4			# insert lower word (U3)

	tst.w		%d0			# is upper word set?
	bne.w		lddadjd1

#	add.l		%d6, %d4		# (U1U2 - V1q) + U3

	cmp.l		%d2, %d4
	bls.b		lddadjd1		# is V2q > (U1U2-V1q) + U3 ?
	subq.l		&0x1, %d1		# yes, decrement and recheck
	bra.b		lddadj1
lddadjd1:
# now test the word by multiplying it by the divisor (V1V2) and comparing
# the 3 digit (word) result with the current dividend words
	mov.l		%d5, -(%sp)		# save %d5 (%d6 already saved)
	mov.l		%d1, %d6
	swap		%d6			# shift answer to ms 3 words
	mov.l		%d7, %d5
	bsr.l		ldmm2
	mov.l		%d5, %d2		# now %d2,%d3 are trial*divisor
	mov.l		%d6, %d3
	mov.l		(%sp)+, %d5		# restore dividend
	mov.l		(%sp)+, %d6
	sub.l		%d3, %d6
	subx.l		%d2, %d5		# subtract double precision
	bcc		ldd2nd			# no carry, do next quotient digit
	subq.l		&0x1, %d1		# q is one too large
# need to add back divisor longword to current ms 3 digits of dividend
# - according to Knuth, this is done only 2 out of 65536 times for random
# divisor, dividend selection.
	clr.l		%d2
	mov.l		%d7, %d3
	swap		%d3
	clr.w		%d3			# %d3 now ls word of divisor
	add.l		%d3, %d6		# aligned with 3rd word of dividend
	addx.l		%d2, %d5
	mov.l		%d7, %d3
	clr.w		%d3			# %d3 now ms word of divisor
	swap		%d3			# aligned with 2nd word of dividend
	add.l		%d3, %d5
ldd2nd:
	tst.b		DDSECOND(%a6)	# both q words done?
	bne.b		lddremain
# first quotient digit now correct. store digit and shift the
# (subtracted) dividend
	mov.w		%d1, DDQUOTIENT(%a6)
	clr.l		%d1
	swap		%d5
	swap		%d6
	mov.w		%d6, %d5
	clr.w		%d6
	st		DDSECOND(%a6)		# second digit
	bra.w		lddnormalized
lddremain:
# add 2nd word to quotient, get the remainder.
	mov.w		%d1, DDQUOTIENT+2(%a6)
# shift down one word/digit to renormalize remainder.
	mov.w		%d5, %d6
	swap		%d6
	swap		%d5
	mov.l		DDNORMAL(%a6), %d7	# get norm shift count
	beq.b		lddrn
	subq.l		&0x1, %d7		# set for loop count
lddnlp:
	lsr.l		&0x1, %d5		# shift into %d6
	roxr.l		&0x1, %d6
	dbf		%d7, lddnlp
lddrn:
	mov.l		%d6, %d5		# remainder
	mov.l		DDQUOTIENT(%a6), %d6	# quotient

	rts
ldmm2:
# factors for the 32X32->64 multiplication are in %d5 and %d6.
# returns 64 bit result in %d5 (hi) %d6(lo).
# destroys %d2,%d3,%d4.
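# NOTE (added): the identity being used below is
#   (a*2^16 + b) * (c*2^16 + d) = a*c*2^32 + (a*d + b*c)*2^16 + b*d
# with a,c the msws and b,d the lsws of the two factors; the four mulu.w
# instructions form b*d, b*c, a*d and a*c, and the swap/addx sequence folds
# the two middle products into the 64-bit result with carry propagation.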

# multiply hi,lo words of each factor to get 4 intermediate products
	mov.l		%d6, %d2
	mov.l		%d6, %d3
	mov.l		%d5, %d4
	swap		%d3
	swap		%d4
	mulu.w		%d5, %d6		# %d6 <- lsw*lsw
	mulu.w		%d3, %d5		# %d5 <- msw-dest*lsw-source
	mulu.w		%d4, %d2		# %d2 <- msw-source*lsw-dest
	mulu.w		%d4, %d3		# %d3 <- msw*msw
# now use swap and addx to consolidate to two longwords
	clr.l		%d4
	swap		%d6
	add.w		%d5, %d6		# add msw of l*l to lsw of m*l product
	addx.w		%d4, %d3		# add any carry to m*m product
	add.w		%d2, %d6		# add in lsw of other m*l product
	addx.w		%d4, %d3		# add any carry to m*m product
	swap		%d6			# %d6 is low 32 bits of final product
	clr.w		%d5
	clr.w		%d2			# lsw of two mixed products used,
	swap		%d5			# now use msws of longwords
	swap		%d2
	add.l		%d2, %d5
	add.l		%d3, %d5	# %d5 now ms 32 bits of final product
	rts

#########################################################################
# XDEF ****************************************************************	#
#	_060LSP__imulu64_(): Emulate 64-bit unsigned mul instruction	#
#	_060LSP__imuls64_(): Emulate 64-bit signed mul instruction.	#
#									#
#	This is the library version which is accessed as a subroutine	#
#	and therefore does not work exactly like the 680X0 mul{s,u}.l	#
#	64-bit multiply instruction.					#
#									#
# XREF ****************************************************************	#
#	None								#
#									#
# INPUT ***************************************************************	#
#	0x4(sp) = multiplier						#
#	0x8(sp) = multiplicand						#
#	0xc(sp) = pointer to location to place 64-bit result		#
#									#
# OUTPUT **************************************************************	#
#	0xc(sp) = points to location of 64-bit result			#
#									#
# ALGORITHM ***********************************************************	#
#	Perform the multiply in pieces using 16x16->32 unsigned		#
# multiplies and "add" instructions.					#
#	Set the condition codes as appropriate before performing an	#
# "rts".								#
#									#
#########################################################################
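
# NOTE: a minimal usage sketch (an assumption added here, not taken from
# the original notes), derived from the INPUT/OUTPUT description above:
#
#	pea	result			# ptr to 2 longwords: hi, then lo
#	mov.l	multiplicand,-(%sp)
#	mov.l	multiplier,-(%sp)
#	bsr.l	_060LSP__imulu64_	# or _060LSP__imuls64_
#	lea	0xc(%sp),%sp		# pop args; lea keeps returned ccodes
#
# "result", "multiplicand" and "multiplier" are placeholder operands.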

set MUL64_CC, -4

	global		_060LSP__imulu64_
_060LSP__imulu64_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3800,-(%sp)		# save d2-d4
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,MUL64_CC(%a6)	# save incoming ccodes

	mov.l		0x8(%a6),%d0		# store multiplier in d0
	beq.w		mulu64_zero		# handle zero separately

	mov.l		0xc(%a6),%d1		# get multiplicand in d1
	beq.w		mulu64_zero		# handle zero separately

#########################################################################
#	63			   32				0	#
#	----------------------------					#
#	| hi(mplier) * hi(mplicand)|					#
#	----------------------------					#
#		     -----------------------------			#
#		     | hi(mplier) * lo(mplicand) |			#
#		     -----------------------------			#
#		     -----------------------------			#
#		     | lo(mplier) * hi(mplicand) |			#
#		     -----------------------------			#
#	  |			   -----------------------------	#
#	--|--			   | lo(mplier) * lo(mplicand) |	#
#	  |			   -----------------------------	#
#	========================================================	#
#	--------------------------------------------------------	#
#	|	hi(result)	   |	    lo(result)         |	#
#	--------------------------------------------------------	#
#########################################################################
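# NOTE (added): a small worked example of the diagram above:
# 0x00010002 * 0x00030004 -- the four 16x16 products are 2*4 = 0x8,
# 1*4 = 0x4, 2*3 = 0x6 and 1*3 = 0x3; summing them with the proper 16-bit
# shifts gives hi(result) = 0x00000003 and lo(result) = 0x000a0008,
# i.e. 65538 * 196612 = 12885557256.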
mulu64_alg:
# load temp registers with operands
	mov.l		%d0,%d2			# mr in d2
	mov.l		%d0,%d3			# mr in d3
	mov.l		%d1,%d4			# md in d4
	swap		%d3			# hi(mr) in lo d3
	swap		%d4			# hi(md) in lo d4

# complete necessary multiplies:
	mulu.w		%d1,%d0			# [1] lo(mr) * lo(md)
	mulu.w		%d3,%d1			# [2] hi(mr) * lo(md)
	mulu.w		%d4,%d2			# [3] lo(mr) * hi(md)
	mulu.w		%d4,%d3			# [4] hi(mr) * hi(md)

# add lo portions of [2],[3] to hi portion of [1].
# add carries produced from these adds to [4].
# lo([1]) is the final lo 16 bits of the result.
	clr.l		%d4			# load d4 w/ zero value
	swap		%d0			# hi([1]) <==> lo([1])
	add.w		%d1,%d0			# hi([1]) + lo([2])
	addx.l		%d4,%d3			#    [4]  + carry
	add.w		%d2,%d0			# hi([1]) + lo([3])
	addx.l		%d4,%d3			#    [4]  + carry
	swap		%d0			# lo([1]) <==> hi([1])

# lo portions of [2],[3] have been added in to final result.
# now, clear lo, put hi in lo reg, and add to [4]
	clr.w		%d1			# clear lo([2])
	clr.w		%d2			# clear lo([3])
	swap		%d1			# hi([2]) in lo d1
	swap		%d2			# hi([3]) in lo d2
	add.l		%d2,%d1			#    [4]  + hi([2])
	add.l		%d3,%d1			#    [4]  + hi([3])

# now, grab the condition codes. only one that can be set is 'N'.
# 'N' CAN be set even though the operation is unsigned, if bit 63 of
# the result is set.
	mov.w		MUL64_CC(%a6),%d4
	andi.b		&0x10,%d4		# keep old 'X' bit
	tst.l		%d1			# may set 'N' bit
	bpl.b		mulu64_ddone
	ori.b		&0x8,%d4		# set 'N' bit
mulu64_ddone:
	mov.w		%d4,%cc

# here, the result is in d1 and d0. the current strategy is to save
# the values at the location addressed by the result pointer passed
# at 0x10(%a6). use movm here to not disturb the condition codes.
mulu64_end:
	exg		%d1,%d0
	movm.l		&0x0003,([0x10,%a6])		# save result

# EPILOGUE BEGIN ########################################################
#	fmovm.l		(%sp)+,&0x0		# restore no fpregs
	movm.l		(%sp)+,&0x001c		# restore d2-d4
	unlk		%a6
# EPILOGUE END ##########################################################

	rts

# one or both of the operands is zero so the result is also zero.
# save the zero result to the register file and set the 'Z' ccode bit.
mulu64_zero:
	clr.l		%d0
	clr.l		%d1

	mov.w		MUL64_CC(%a6),%d4
	andi.b		&0x10,%d4
	ori.b		&0x4,%d4
	mov.w		%d4,%cc			# set 'Z' ccode bit

	bra.b		mulu64_end

##########
# muls.l #
##########
	global		_060LSP__imuls64_
_060LSP__imuls64_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3c00,-(%sp)		# save d2-d5
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,MUL64_CC(%a6)	# save incoming ccodes

	mov.l		0x8(%a6),%d0		# store multiplier in d0
	beq.b		mulu64_zero		# handle zero separately

	mov.l		0xc(%a6),%d1		# get multiplicand in d1
	beq.b		mulu64_zero		# handle zero separately

	clr.b		%d5			# clear sign tag
	tst.l		%d0			# is multiplier negative?
	bge.b		muls64_chk_md_sgn	# no
	neg.l		%d0			# make multiplier positive

	ori.b		&0x1,%d5		# save multiplier sgn

# the result sign is the exclusive or of the operand sign bits.
muls64_chk_md_sgn:
	tst.l		%d1			# is multiplicand negative?
	bge.b		muls64_alg		# no
	neg.l		%d1			# make multiplicand positive

	eori.b		&0x1,%d5		# calculate correct sign

#########################################################################
#	63			   32				0	#
#	----------------------------					#
#	| hi(mplier) * hi(mplicand)|					#
#	----------------------------					#
#		     -----------------------------			#
#		     | hi(mplier) * lo(mplicand) |			#
#		     -----------------------------			#
#		     -----------------------------			#
#		     | lo(mplier) * hi(mplicand) |			#
#		     -----------------------------			#
#	  |			   -----------------------------	#
#	--|--			   | lo(mplier) * lo(mplicand) |	#
#	  |			   -----------------------------	#
#	========================================================	#
#	--------------------------------------------------------	#
#	|	hi(result)	   |	    lo(result)         |	#
#	--------------------------------------------------------	#
#########################################################################
muls64_alg:
# load temp registers with operands
	mov.l		%d0,%d2			# mr in d2
	mov.l		%d0,%d3			# mr in d3
	mov.l		%d1,%d4			# md in d4
	swap		%d3			# hi(mr) in lo d3
	swap		%d4			# hi(md) in lo d4

# complete necessary multiplies:
	mulu.w		%d1,%d0			# [1] lo(mr) * lo(md)
	mulu.w		%d3,%d1			# [2] hi(mr) * lo(md)
	mulu.w		%d4,%d2			# [3] lo(mr) * hi(md)
	mulu.w		%d4,%d3			# [4] hi(mr) * hi(md)

# add lo portions of [2],[3] to hi portion of [1].
# add carries produced from these adds to [4].
# lo([1]) is the final lo 16 bits of the result.
	clr.l		%d4			# load d4 w/ zero value
	swap		%d0			# hi([1]) <==> lo([1])
	add.w		%d1,%d0			# hi([1]) + lo([2])
	addx.l		%d4,%d3			#    [4]  + carry
	add.w		%d2,%d0			# hi([1]) + lo([3])
	addx.l		%d4,%d3			#    [4]  + carry
	swap		%d0			# lo([1]) <==> hi([1])

# lo portions of [2],[3] have been added in to final result.
# now, clear lo, put hi in lo reg, and add to [4]
	clr.w		%d1			# clear lo([2])
	clr.w		%d2			# clear lo([3])
	swap		%d1			# hi([2]) in lo d1
	swap		%d2			# hi([3]) in lo d2
	add.l		%d2,%d1			#    [4]  + hi([2])
	add.l		%d3,%d1			#    [4]  + hi([3])

	tst.b		%d5			# should result be signed?
	beq.b		muls64_done		# no

# result should be a signed negative number.
# compute 2's complement of the unsigned number:
#   -negate all bits and add 1
muls64_neg:
	not.l		%d0			# negate lo(result) bits
	not.l		%d1			# negate hi(result) bits
	addq.l		&1,%d0			# add 1 to lo(result)
	addx.l		%d4,%d1			# add carry to hi(result)

muls64_done:
	mov.w		MUL64_CC(%a6),%d4
	andi.b		&0x10,%d4		# keep old 'X' bit
	tst.l		%d1			# may set 'N' bit
	bpl.b		muls64_ddone
	ori.b		&0x8,%d4		# set 'N' bit
muls64_ddone:
	mov.w		%d4,%cc

# here, the result is in d1 and d0. the current strategy is to save
# the values at the location addressed by the result pointer passed
# at 0x10(%a6). use movm here to not disturb the condition codes.
muls64_end:
	exg		%d1,%d0
	movm.l		&0x0003,([0x10,%a6])	# save result

# EPILOGUE BEGIN ########################################################
#	fmovm.l		(%sp)+,&0x0		# restore no fpregs
	movm.l		(%sp)+,&0x003c		# restore d2-d5
	unlk		%a6
# EPILOGUE END ##########################################################

	rts

# one or both of the operands is zero so the result is also zero.
# save the zero result to the register file and set the 'Z' ccode bit.
muls64_zero:
	clr.l		%d0
	clr.l		%d1

	mov.w		MUL64_CC(%a6),%d4
	andi.b		&0x10,%d4
	ori.b		&0x4,%d4
	mov.w		%d4,%cc			# set 'Z' ccode bit

	bra.b		muls64_end

#########################################################################
# XDEF ****************************************************************	#
#	_060LSP__cmp2_Ab_(): Emulate "cmp2.b An,<ea>".			#
#	_060LSP__cmp2_Aw_(): Emulate "cmp2.w An,<ea>".			#
#	_060LSP__cmp2_Al_(): Emulate "cmp2.l An,<ea>".			#
#	_060LSP__cmp2_Db_(): Emulate "cmp2.b Dn,<ea>".			#
#	_060LSP__cmp2_Dw_(): Emulate "cmp2.w Dn,<ea>".			#
#	_060LSP__cmp2_Dl_(): Emulate "cmp2.l Dn,<ea>".			#
#									#
#	This is the library version which is accessed as a subroutine	#
#	and therefore does not work exactly like the 680X0 "cmp2"	#
#	instruction.							#
#									#
# XREF ****************************************************************	#
#	None								#
#									#
# INPUT ***************************************************************	#
#	0x4(sp) = Rn							#
#	0x8(sp) = pointer to boundary pair				#
#									#
# OUTPUT **************************************************************	#
#	cc = condition codes are set correctly				#
#									#
# ALGORITHM ***********************************************************	#
#	In the interest of simplicity, all operands are converted to	#
# longword size whether the operation is byte, word, or long. The	#
# bounds are sign extended accordingly. If Rn is a data register, Rn is #
# also sign extended. If Rn is an address register, it need not be sign #
# extended since the full register is always used.			#
#	The condition codes are set correctly before the final "rts".	#
#									#
#########################################################################
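
# NOTE: a minimal usage sketch (an assumption added here, not taken from
# the original notes), derived from the INPUT description above. as with
# the 680x0 "cmp2" definition, the emulation leaves 'Z' set if Rn equals
# either bound and 'C' set if Rn lies outside the bounds:
#
#	pea	bounds			# ptr to lo bound, then hi bound
#	mov.l	%d3,-(%sp)		# the value to range-check
#	bsr.l	_060LSP__cmp2_Dl_
#	lea	0x8(%sp),%sp		# pop args without touching the ccodes
#	bcs.b	out_of_range		# 'C' set -> value outside [lo,hi]
#
# "bounds" and "out_of_range" are placeholder names for illustration only.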

set	CMP2_CC,	-4

	global		_060LSP__cmp2_Ab_
_060LSP__cmp2_Ab_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3800,-(%sp)		# save d2-d4
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,CMP2_CC(%a6)
	mov.l		0x8(%a6), %d2		# get regval

	mov.b		([0xc,%a6],0x0),%d0
	mov.b		([0xc,%a6],0x1),%d1

	extb.l		%d0			# sign extend lo bnd
	extb.l		%d1			# sign extend hi bnd
	bra.w		l_cmp2_cmp		# go do the compare emulation

	global		_060LSP__cmp2_Aw_
_060LSP__cmp2_Aw_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3800,-(%sp)		# save d2-d4
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,CMP2_CC(%a6)
	mov.l		0x8(%a6), %d2		# get regval

	mov.w		([0xc,%a6],0x0),%d0
	mov.w		([0xc,%a6],0x2),%d1

	ext.l		%d0			# sign extend lo bnd
	ext.l		%d1			# sign extend hi bnd
	bra.w		l_cmp2_cmp		# go do the compare emulation

	global		_060LSP__cmp2_Al_
_060LSP__cmp2_Al_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3800,-(%sp)		# save d2-d4
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,CMP2_CC(%a6)
	mov.l		0x8(%a6), %d2		# get regval

	mov.l		([0xc,%a6],0x0),%d0
	mov.l		([0xc,%a6],0x4),%d1
	bra.w		l_cmp2_cmp		# go do the compare emulation

	global		_060LSP__cmp2_Db_
_060LSP__cmp2_Db_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3800,-(%sp)		# save d2-d4
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,CMP2_CC(%a6)
	mov.l		0x8(%a6), %d2		# get regval

	mov.b		([0xc,%a6],0x0),%d0
	mov.b		([0xc,%a6],0x1),%d1

	extb.l		%d0			# sign extend lo bnd
	extb.l		%d1			# sign extend hi bnd

# operation is a data register compare.
# sign extend byte to long so we can do simple longword compares.
	extb.l		%d2			# sign extend data byte
	bra.w		l_cmp2_cmp		# go do the compare emulation

	global		_060LSP__cmp2_Dw_
_060LSP__cmp2_Dw_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3800,-(%sp)		# save d2-d4
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,CMP2_CC(%a6)
	mov.l		0x8(%a6), %d2		# get regval

	mov.w		([0xc,%a6],0x0),%d0
	mov.w		([0xc,%a6],0x2),%d1

	ext.l		%d0			# sign extend lo bnd
	ext.l		%d1			# sign extend hi bnd

# operation is a data register compare.
# sign extend word to long so we can do simple longword compares.
	ext.l		%d2			# sign extend data word
	bra.w		l_cmp2_cmp		# go emulate compare

	global		_060LSP__cmp2_Dl_
_060LSP__cmp2_Dl_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3800,-(%sp)		# save d2-d4
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,CMP2_CC(%a6)
	mov.l		0x8(%a6), %d2		# get regval

	mov.l		([0xc,%a6],0x0),%d0
	mov.l		([0xc,%a6],0x4),%d1

#
# To set the ccodes correctly:
#	(1) save 'Z' bit from (Rn - lo)
#	(2) save 'Z' and 'C' bits from ((hi - lo) - (Rn - lo))
#	(3) keep 'X', 'N', and 'V' from before instruction
#	(4) combine ccodes
#
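# NOTE (added): the single unsigned comparison works for both signed and
# unsigned bounds because subtracting 'lo' maps the range [lo,hi] onto
# [0,hi-lo]; any Rn outside the original range wraps around to an unsigned
# value greater than (hi - lo), so the borrow ('C') from the compare below
# is exactly the cmp2 "out of bounds" indication.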
l_cmp2_cmp:
	sub.l		%d0, %d2		# (Rn - lo)
	mov.w		%cc, %d3		# fetch resulting ccodes
	andi.b		&0x4, %d3		# keep 'Z' bit
	sub.l		%d0, %d1		# (hi - lo)
	cmp.l		%d1,%d2			# ((hi - lo) - (Rn - lo))

	mov.w		%cc, %d4		# fetch resulting ccodes
	or.b		%d4, %d3		# combine w/ earlier ccodes
	andi.b		&0x5, %d3		# keep 'Z' and 'C'

	mov.w		CMP2_CC(%a6), %d4	# fetch old ccodes
	andi.b		&0x1a, %d4		# keep 'X','N','V' bits
	or.b		%d3, %d4		# insert new ccodes
	mov.w		%d4,%cc			# save new ccodes

# EPILOGUE BEGIN ########################################################
#	fmovm.l		(%sp)+,&0x0		# restore no fpregs
	movm.l		(%sp)+,&0x001c		# restore d2-d4
	unlk		%a6
# EPILOGUE END ##########################################################

	rts