1*4882a593Smuzhiyun~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 2*4882a593SmuzhiyunMOTOROLA MICROPROCESSOR & MEMORY TECHNOLOGY GROUP 3*4882a593SmuzhiyunM68000 Hi-Performance Microprocessor Division 4*4882a593SmuzhiyunM68060 Software Package 5*4882a593SmuzhiyunProduction Release P1.00 -- October 10, 1994 6*4882a593Smuzhiyun 7*4882a593SmuzhiyunM68060 Software Package Copyright © 1993, 1994 Motorola Inc. All rights reserved. 8*4882a593Smuzhiyun 9*4882a593SmuzhiyunTHE SOFTWARE is provided on an "AS IS" basis and without warranty. 10*4882a593SmuzhiyunTo the maximum extent permitted by applicable law, 11*4882a593SmuzhiyunMOTOROLA DISCLAIMS ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, 12*4882a593SmuzhiyunINCLUDING IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE 13*4882a593Smuzhiyunand any warranty against infringement with regard to the SOFTWARE 14*4882a593Smuzhiyun(INCLUDING ANY MODIFIED VERSIONS THEREOF) and any accompanying written materials. 15*4882a593Smuzhiyun 16*4882a593SmuzhiyunTo the maximum extent permitted by applicable law, 17*4882a593SmuzhiyunIN NO EVENT SHALL MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER 18*4882a593Smuzhiyun(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, 19*4882a593SmuzhiyunBUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY LOSS) 20*4882a593SmuzhiyunARISING OF THE USE OR INABILITY TO USE THE SOFTWARE. 21*4882a593SmuzhiyunMotorola assumes no responsibility for the maintenance and support of the SOFTWARE. 22*4882a593Smuzhiyun 23*4882a593SmuzhiyunYou are hereby granted a copyright license to use, modify, and distribute the SOFTWARE 24*4882a593Smuzhiyunso long as this entire notice is retained without alteration in any modified and/or 25*4882a593Smuzhiyunredistributed versions, and that such modified versions are clearly identified as such. 
No licenses are granted by implication, estoppel or otherwise under any patents
or trademarks of Motorola, Inc.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# litop.s:
#	This file is appended to the top of the 060FPLSP package
# and contains the entry points into the package. The user, in
# effect, branches to one of the branch table entries located here.
#

# Branch table: each entry is 8 bytes -- a 6-byte "bra.l" to the target
# routine followed by a 2-byte pad word ("short 0x0000") so that every
# entry sits at a fixed 8-byte offset from the start of the package.
# Callers reach a routine by jumping to its table slot, so the order
# and spacing of these entries is part of the package's ABI and must
# not change.
	bra.l		_060LSP__idivs64_	# 64-bit signed divide
	short		0x0000
	bra.l		_060LSP__idivu64_	# 64-bit unsigned divide
	short		0x0000

	bra.l		_060LSP__imuls64_	# 64-bit signed multiply
	short		0x0000
	bra.l		_060LSP__imulu64_	# 64-bit unsigned multiply
	short		0x0000

	bra.l		_060LSP__cmp2_Ab_	# cmp2 byte, bounds in An
	short		0x0000
	bra.l		_060LSP__cmp2_Aw_	# cmp2 word, bounds in An
	short		0x0000
	bra.l		_060LSP__cmp2_Al_	# cmp2 long, bounds in An
	short		0x0000
	bra.l		_060LSP__cmp2_Db_	# cmp2 byte, bounds in Dn
	short		0x0000
	bra.l		_060LSP__cmp2_Dw_	# cmp2 word, bounds in Dn
	short		0x0000
	bra.l		_060LSP__cmp2_Dl_	# cmp2 long, bounds in Dn
	short		0x0000

# leave room for future possible additions.
	align		0x200

#########################################################################
# XDEF **************************************************************** #
#	_060LSP__idivu64_(): Emulate 64-bit unsigned div instruction.	#
#	_060LSP__idivs64_(): Emulate 64-bit signed div instruction.	#
#									#
#	This is the library version which is accessed as a subroutine	#
#	and therefore does not work exactly like the 680X0 div{s,u}.l	#
#	64-bit divide instruction.					#
#									#
# XREF **************************************************************** #
#	None.								#
#									#
# INPUT *************************************************************** #
#	0x4(sp)  = divisor						#
#	0x8(sp)  = hi(dividend)						#
#	0xc(sp)  = lo(dividend)						#
#	0x10(sp) = pointer to location to place quotient/remainder	#
#									#
# OUTPUT ************************************************************** #
#	0x10(sp) = points to location of remainder/quotient.		#
#		   remainder is in first longword, quotient is in 2nd.	#
#									#
# ALGORITHM *********************************************************** #
#	If the operands are signed, make them unsigned and save the	#
# sign info for later. Separate out special cases like divide-by-zero	#
# or 32-bit divides if possible. Else, use a special math algorithm	#
# to calculate the result.						#
#	Restore sign info if signed instruction. Set the condition	#
# codes before performing the final "rts". If the divisor was equal to	#
# zero, then perform a divide-by-zero using a 16-bit implemented	#
# divide instruction. This way, the operating system can record that	#
# the event occurred even though it may not point to the correct place.	#
#									#
#########################################################################

# Frame-local variables, addressed off %a6 after the "link.w %a6,&-16":
set	POSNEG,		-1		# byte: 0xff = signed op, 0x00 = unsigned op
set	NDIVISOR,	-2		# byte: sign flag of the divisor
set	NDIVIDEND,	-3		# byte: sign flag of the dividend
set	DDSECOND,	-4		# byte: set once first quotient digit is done
set	DDNORMAL,	-8		# long: normalization shift count (Knuth)
set	DDQUOTIENT,	-12		# long: assembled quotient (two word digits)
set	DIV64_CC,	-16		# word: caller's CCR, saved on entry

##########
# divs.l #
##########
	global		_060LSP__idivs64_
_060LSP__idivs64_:
# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-16
	movm.l		&0x3f00,-(%sp)		# save d2-d7
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,DIV64_CC(%a6)	# save incoming ccodes ('X' is preserved)
	st		POSNEG(%a6)		# signed operation
	bra.b		ldiv64_cont

##########
# divu.l #
##########
	global		_060LSP__idivu64_
_060LSP__idivu64_:
# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-16
	movm.l		&0x3f00,-(%sp)		# save d2-d7
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,DIV64_CC(%a6)	# save incoming ccodes ('X' is preserved)
	sf		POSNEG(%a6)		# unsigned operation

# Common driver. Register roles from here on:
#	%d7 = divisor, %d5 = hi(dividend), %d6 = lo(dividend).
# Note the stack args are now at +8 from their 0xN(sp) documentation
# offsets because "link.w" pushed the old %a6 (and the call pushed the
# return address): 0x8(%a6) = divisor, 0xc/0x10(%a6) = dividend hi/lo,
# 0x14(%a6) = result pointer.
ldiv64_cont:
	mov.l		0x8(%a6),%d7		# fetch divisor

	beq.w		ldiv64eq0		# divisor is = 0!!!

	mov.l		0xc(%a6), %d5		# get dividend hi
	mov.l		0x10(%a6), %d6		# get dividend lo

# separate signed and unsigned divide
	tst.b		POSNEG(%a6)		# signed or unsigned?
	beq.b		ldspecialcases		# use positive divide

# save the sign of the divisor
# make divisor unsigned if it's negative
	tst.l		%d7			# chk sign of divisor
	slt		NDIVISOR(%a6)		# save sign of divisor
	bpl.b		ldsgndividend
	neg.l		%d7			# complement negative divisor

# save the sign of the dividend
# make dividend unsigned if it's negative
ldsgndividend:
	tst.l		%d5			# chk sign of hi(dividend)
	slt		NDIVIDEND(%a6)		# save sign of dividend
	bpl.b		ldspecialcases

	mov.w		&0x0, %cc		# clear 'X' cc bit
	negx.l		%d6			# complement signed dividend
	negx.l		%d5			# 64-bit negate: lo first, then hi w/ borrow

# extract some special cases:
#	- is (dividend == 0) ?
#	- is (hi(dividend) == 0 && (divisor <= lo(dividend))) ? (32-bit div)
ldspecialcases:
	tst.l		%d5			# is (hi(dividend) == 0)
	bne.b		ldnormaldivide		# no, so try it the long way

	tst.l		%d6			# is (lo(dividend) == 0), too
	beq.w		lddone			# yes, so (dividend == 0)

	cmp.l		%d7,%d6			# is (divisor <= lo(dividend))
	bls.b		ld32bitdivide		# yes, so use 32 bit divide

	exg		%d5,%d6			# q = 0, r = dividend
	bra.w		ldivfinish		# can't divide, we're done.

ld32bitdivide:
	tdivu.l		%d7, %d5:%d6		# it's only a 32/32 bit div!

	bra.b		ldivfinish

ldnormaldivide:
# last special case:
#	- is hi(dividend) >= divisor ? if yes, then overflow
	cmp.l		%d7,%d5
	bls.b		lddovf			# answer won't fit in 32 bits

# perform the divide algorithm:
	bsr.l		ldclassical		# do int divide

# separate into signed and unsigned finishes.
ldivfinish:
	tst.b		POSNEG(%a6)		# do divs, divu separately
	beq.b		lddone			# divu has no processing!!!

# it was a divs.l, so ccode setting is a little more complicated...
	tst.b		NDIVIDEND(%a6)		# remainder has same sign
	beq.b		ldcc			# as dividend.
	neg.l		%d5			# sgn(rem) = sgn(dividend)
ldcc:
	mov.b		NDIVISOR(%a6), %d0
	eor.b		%d0, NDIVIDEND(%a6)	# chk if quotient is negative
	beq.b		ldqpos			# branch to quot positive

# 0x80000000 is the largest number representable as a 32-bit negative
# number. the negative of 0x80000000 is 0x80000000.
	cmpi.l		%d6, &0x80000000	# will (-quot) fit in 32 bits?
	bhi.b		lddovf

	neg.l		%d6			# make (-quot) 2's comp

	bra.b		lddone

ldqpos:
	btst		&0x1f, %d6		# will (+quot) fit in 32 bits?
	bne.b		lddovf

lddone:
# if the register numbers are the same, only the quotient gets saved.
# so, if we always save the quotient second, we save ourselves a cmp&beq
	andi.w		&0x10,DIV64_CC(%a6)	# keep only caller's 'X' bit
	mov.w		DIV64_CC(%a6),%cc
	tst.l		%d6			# may set 'N' ccode bit

# here, the result is in %d5 (remainder) and %d6 (quotient). store them
# through the caller-supplied pointer at 0x14(%a6): remainder in the
# first longword, quotient in the second (movm mask 0x0060 = d5,d6).
# use movm here to not disturb the condition codes.
ldexit:
	movm.l		&0x0060,([0x14,%a6])	# save result

# EPILOGUE BEGIN ########################################################
#	fmovm.l		(%sp)+,&0x0		# restore no fpregs
	movm.l		(%sp)+,&0x00fc		# restore d2-d7
	unlk		%a6
# EPILOGUE END ##########################################################

	rts

# the result should be the unchanged dividend
lddovf:
	mov.l		0xc(%a6), %d5		# get dividend hi
	mov.l		0x10(%a6), %d6		# get dividend lo

	andi.w		&0x1c,DIV64_CC(%a6)	# keep 'X','N','Z', clear 'V','C'
	ori.w		&0x02,DIV64_CC(%a6)	# set 'V' ccode bit
	mov.w		DIV64_CC(%a6),%cc

	bra.b		ldexit

# divide-by-zero: store the unchanged dividend through the result
# pointer, restore the caller's ccodes, then execute a real 16-bit
# divide-by-zero so the OS sees a genuine Divide-by-Zero exception.
ldiv64eq0:
	mov.l		0xc(%a6),([0x14,%a6])
	mov.l		0x10(%a6),([0x14,%a6],0x4)

	mov.w		DIV64_CC(%a6),%cc

# EPILOGUE BEGIN ########################################################
#	fmovm.l		(%sp)+,&0x0		# restore no fpregs
	movm.l		(%sp)+,&0x00fc		# restore d2-d7
	unlk		%a6
# EPILOGUE END ##########################################################

	divu.w		&0x0,%d0		# force a divbyzero exception
	rts

###########################################################################
#########################################################################
# This routine uses the 'classical' Algorithm D from Donald Knuth's	#
#	Art of Computer Programming, vol II, Seminumerical Algorithms.	#
#	For this implementation b=2**16, and the target is U1U2U3U4/V1V2, #
#	where U,V are words of the quadword dividend and longword	#
#	divisor, and U1, V1 are the most significant words.		#
#									#
#	The most sig. longword of the 64 bit dividend must be in %d5,	#
#	least in %d6. The divisor must be in the variable ddivisor, and	#
#	the signed/unsigned flag ddusign must be set (0=unsigned,	#
#	1=signed).							#
#	NOTE(review): the two sentences above look stale -- in this	#
#	version the divisor is passed in %d7 (loaded at ldiv64_cont)	#
#	and the signed flag lives in POSNEG(%a6); confirm against the	#
#	original package sources.					#
#	The quotient is returned in %d6, remainder in %d5, unless the	#
#	v (overflow) bit is set in the saved %ccr. If overflow, the	#
#	dividend is unchanged.						#
#########################################################################
ldclassical:
# if the divisor msw is 0, use simpler algorithm than the full blown
# one at lddknuth:

	cmpi.l		%d7, &0xffff
	bhi.b		lddknuth		# go use D. Knuth algorithm

# Since the divisor is only a word (and larger than the mslw of the dividend),
# a simpler algorithm may be used :
# In the general case, four quotient words would be created by
# dividing the divisor word into each dividend word. In this case,
# the first two quotient words must be zero, or overflow would occur.
# Since we already checked this case above, we can treat the most significant
# longword of the dividend as (0) remainder (see Knuth) and merely complete
# the last two divisions to get a quotient longword and word remainder:

	clr.l		%d1
	swap		%d5			# same as r*b if previous step rqd
	swap		%d6			# get u3 to lsw position
	mov.w		%d6, %d5		# rb + u3

	divu.w		%d7, %d5

	mov.w		%d5, %d1		# first quotient word
	swap		%d6			# get u4
	mov.w		%d6, %d5		# rb + u4

	divu.w		%d7, %d5

	swap		%d1
	mov.w		%d5, %d1		# 2nd quotient 'digit'
	clr.w		%d5
	swap		%d5			# now remainder
	mov.l		%d1, %d6		# and quotient

	rts

lddknuth:
# In this algorithm, the divisor is treated as a 2 digit (word) number
# which is divided into a 3 digit (word) dividend to get one quotient
# digit (word). After subtraction, the dividend is shifted and the
# process repeated. Before beginning, the divisor and quotient are
# 'normalized' so that the process of estimating the quotient digit
# will yield verifiably correct results.

	clr.l		DDNORMAL(%a6)		# count of shifts for normalization
	clr.b		DDSECOND(%a6)		# clear flag for quotient digits
	clr.l		%d1			# %d1 will hold trial quotient
lddnchk:
	btst		&31, %d7		# must we normalize? first word of
	bne.b		lddnormalized		# divisor (V1) must be >= 65536/2
	addq.l		&0x1, DDNORMAL(%a6)	# count normalization shifts
	lsl.l		&0x1, %d7		# shift the divisor
	lsl.l		&0x1, %d6		# shift u4,u3 with overflow to u2
	roxl.l		&0x1, %d5		# shift u1,u2
	bra.w		lddnchk
lddnormalized:

# Now calculate an estimate of the quotient words (msw first, then lsw).
# The comments use subscripts for the first quotient digit determination.
	mov.l		%d7, %d3		# divisor
	mov.l		%d5, %d2		# dividend mslw
	swap		%d2
	swap		%d3
	cmp.w		%d2, %d3		# V1 = U1 ?
	bne.b		lddqcalc1
	mov.w		&0xffff, %d1		# use max trial quotient word
	bra.b		lddadj0
lddqcalc1:
	mov.l		%d5, %d1

	divu.w		%d3, %d1		# use quotient of mslw/msw

	andi.l		&0x0000ffff, %d1	# zero any remainder
lddadj0:

# now test the trial quotient and adjust. This step plus the
# normalization assures (according to Knuth) that the trial
# quotient will be at worst 1 too large.
	mov.l		%d6, -(%sp)		# preserve u4; %d6 reused for U3 below
	clr.w		%d6			# word u3 left
	swap		%d6			# in lsw position
lddadj1:	mov.l		%d7, %d3
	mov.l		%d1, %d2
	mulu.w		%d7, %d2		# V2q
	swap		%d3
	mulu.w		%d1, %d3		# V1q
	mov.l		%d5, %d4		# U1U2
	sub.l		%d3, %d4		# U1U2 - V1q

	swap		%d4

	mov.w		%d4,%d0
	mov.w		%d6,%d4			# insert lower word (U3)

	tst.w		%d0			# is upper word set?
	bne.w		lddadjd1

#	add.l		%d6, %d4		# (U1U2 - V1q) + U3

	cmp.l		%d2, %d4
	bls.b		lddadjd1		# is V2q > (U1U2-V1q) + U3 ?
	subq.l		&0x1, %d1		# yes, decrement and recheck
	bra.b		lddadj1
lddadjd1:
# now test the word by multiplying it by the divisor (V1V2) and comparing
# the 3 digit (word) result with the current dividend words
	mov.l		%d5, -(%sp)		# save %d5 (%d6 already saved)
	mov.l		%d1, %d6
	swap		%d6			# shift answer to ms 3 words
	mov.l		%d7, %d5
	bsr.l		ldmm2
	mov.l		%d5, %d2		# now %d2,%d3 are trial*divisor
	mov.l		%d6, %d3
	mov.l		(%sp)+, %d5		# restore dividend
	mov.l		(%sp)+, %d6
	sub.l		%d3, %d6
	subx.l		%d2, %d5		# subtract double precision
	bcc		ldd2nd			# no carry, do next quotient digit
	subq.l		&0x1, %d1		# q is one too large
# need to add back divisor longword to current ms 3 digits of dividend
# - according to Knuth, this is done only 2 out of 65536 times for random
# divisor, dividend selection.
	clr.l		%d2
	mov.l		%d7, %d3
	swap		%d3
	clr.w		%d3			# %d3 now ls word of divisor
	add.l		%d3, %d6		# aligned with 3rd word of dividend
	addx.l		%d2, %d5
	mov.l		%d7, %d3
	clr.w		%d3			# %d3 now ms word of divisor
	swap		%d3			# aligned with 2nd word of dividend
	add.l		%d3, %d5
ldd2nd:
	tst.b		DDSECOND(%a6)		# both q words done?
	bne.b		lddremain
# first quotient digit now correct. store digit and shift the
# (subtracted) dividend
	mov.w		%d1, DDQUOTIENT(%a6)
	clr.l		%d1
	swap		%d5
	swap		%d6
	mov.w		%d6, %d5
	clr.w		%d6
	st		DDSECOND(%a6)		# second digit
	bra.w		lddnormalized
lddremain:
# add 2nd word to quotient, get the remainder.
	mov.w		%d1, DDQUOTIENT+2(%a6)
# shift down one word/digit to renormalize remainder.
	mov.w		%d5, %d6
	swap		%d6
	swap		%d5
	mov.l		DDNORMAL(%a6), %d7	# get norm shift count
	beq.b		lddrn			# no normalization -> no denormalize
	subq.l		&0x1, %d7		# set for loop count
lddnlp:
	lsr.l		&0x1, %d5		# shift into %d6
	roxr.l		&0x1, %d6
	dbf		%d7, lddnlp
lddrn:
	mov.l		%d6, %d5		# remainder
	mov.l		DDQUOTIENT(%a6), %d6	# quotient

	rts

ldmm2:
# factors for the 32X32->64 multiplication are in %d5 and %d6.
# returns 64 bit result in %d5 (hi) %d6(lo).
# destroys %d2,%d3,%d4.

# multiply hi,lo words of each factor to get 4 intermediate products
	mov.l		%d6, %d2
	mov.l		%d6, %d3
	mov.l		%d5, %d4
	swap		%d3
	swap		%d4
	mulu.w		%d5, %d6		# %d6 <- lsw*lsw
	mulu.w		%d3, %d5		# %d5 <- msw-dest*lsw-source
	mulu.w		%d4, %d2		# %d2 <- msw-source*lsw-dest
	mulu.w		%d4, %d3		# %d3 <- msw*msw
# now use swap and addx to consolidate to two longwords
	clr.l		%d4			# zero source for carry propagation
	swap		%d6
	add.w		%d5, %d6		# add msw of l*l to lsw of m*l product
	addx.w		%d4, %d3		# add any carry to m*m product
	add.w		%d2, %d6		# add in lsw of other m*l product
	addx.w		%d4, %d3		# add any carry to m*m product
	swap		%d6			# %d6 is low 32 bits of final product
	clr.w		%d5
	clr.w		%d2			# lsw of two mixed products used,
	swap		%d5			# now use msws of longwords
	swap		%d2
	add.l		%d2, %d5
	add.l		%d3, %d5		# %d5 now ms 32 bits of final product
	rts

#########################################################################
# XDEF **************************************************************** #
#	_060LSP__imulu64_(): Emulate 64-bit unsigned mul instruction	#
#	_060LSP__imuls64_(): Emulate 64-bit signed mul instruction.	#
#									#
#	This is the library version which is accessed as a subroutine	#
#	and therefore does not work exactly like the 680X0 mul{s,u}.l	#
#	64-bit multiply instruction.					#
#									#
# XREF **************************************************************** #
#	None								#
#									#
# INPUT *************************************************************** #
#	0x4(sp)  = multiplier						#
#	0x8(sp)  = multiplicand						#
#	0xc(sp)  = pointer to location to place 64-bit result		#
#									#
# OUTPUT ************************************************************** #
#	0xc(sp)  = points to location of 64-bit result			#
#									#
# ALGORITHM *********************************************************** #
#	Perform the multiply in pieces using 16x16->32 unsigned		#
#	multiplies and "add" instructions.				#
#	Set the condition codes as appropriate before performing an	#
#	"rts".								#
#									#
#########################################################################

# Frame-local variable, addressed off %a6 after "link.w %a6,&-4":
set	MUL64_CC,	-4		# word: caller's CCR, saved on entry

	global		_060LSP__imulu64_
_060LSP__imulu64_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3800,-(%sp)		# save d2-d4
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,MUL64_CC(%a6)	# save incoming ccodes

	mov.l		0x8(%a6),%d0		# store multiplier in d0
	beq.w		mulu64_zero		# handle zero separately

	mov.l		0xc(%a6),%d1		# get multiplicand in d1
	beq.w		mulu64_zero		# handle zero separately

#########################################################################
#	63			32			0		#
#	----------------------------					#
#	| hi(mplier) * hi(mplicand)|					#
#	----------------------------					#
#		-----------------------------				#
#		| hi(mplier) * lo(mplicand) |				#
#		-----------------------------				#
#		-----------------------------				#
#		| lo(mplier) * hi(mplicand) |				#
#		-----------------------------				#
#	  |	  -----------------------------				#
#	--|--	  | lo(mplier) * lo(mplicand) |				#
#	  |	  -----------------------------				#
#	========================================================	#
#	--------------------------------------------------------	#
#	|	hi(result)	|	lo(result)		|	#
#	--------------------------------------------------------	#
#########################################################################
mulu64_alg:
# load temp registers with operands
	mov.l		%d0,%d2			# mr in d2
	mov.l		%d0,%d3			# mr in d3
	mov.l		%d1,%d4			# md in d4
	swap		%d3			# hi(mr) in lo d3
	swap		%d4			# hi(md) in lo d4

# complete necessary multiplies:
	mulu.w		%d1,%d0			# [1] lo(mr) * lo(md)
	mulu.w		%d3,%d1			# [2] hi(mr) * lo(md)
	mulu.w		%d4,%d2			# [3] lo(mr) * hi(md)
	mulu.w		%d4,%d3			# [4] hi(mr) * hi(md)

# add lo portions of [2],[3] to hi portion of [1].
# add carries produced from these adds to [4].
# lo([1]) is the final lo 16 bits of the result.
	clr.l		%d4			# load d4 w/ zero value
	swap		%d0			# hi([1]) <==> lo([1])
	add.w		%d1,%d0			# hi([1]) + lo([2])
	addx.l		%d4,%d3			# [4] + carry
	add.w		%d2,%d0			# hi([1]) + lo([3])
	addx.l		%d4,%d3			# [4] + carry
	swap		%d0			# lo([1]) <==> hi([1])

# lo portions of [2],[3] have been added in to final result.
# now, clear lo, put hi in lo reg, and add to [4]
	clr.w		%d1			# clear lo([2])
	clr.w		%d2			# clear hi([3])
	swap		%d1			# hi([2]) in lo d1
	swap		%d2			# hi([3]) in lo d2
	add.l		%d2,%d1			# [4] + hi([2])
	add.l		%d3,%d1			# [4] + hi([3])

# now, grab the condition codes. only one that can be set is 'N'.
# 'N' CAN be set if the operation is unsigned if bit 63 is set.
	mov.w		MUL64_CC(%a6),%d4
	andi.b		&0x10,%d4		# keep old 'X' bit
	tst.l		%d1			# may set 'N' bit
	bpl.b		mulu64_ddone
	ori.b		&0x8,%d4		# set 'N' bit
mulu64_ddone:
	mov.w		%d4,%cc

# here, the result is in d1 (hi) and d0 (lo). store it through the
# caller-supplied pointer at 0x10(%a6): the exg puts hi(result) in d0
# so the movm (mask 0x0003 = d0,d1) writes hi first, then lo.
# use movm here to not disturb the condition codes.
mulu64_end:
	exg		%d1,%d0
	movm.l		&0x0003,([0x10,%a6])	# save result

# EPILOGUE BEGIN ########################################################
#	fmovm.l		(%sp)+,&0x0		# restore no fpregs
	movm.l		(%sp)+,&0x001c		# restore d2-d4
	unlk		%a6
# EPILOGUE END ##########################################################

	rts

# one or both of the operands is zero so the result is also zero.
# save the zero result to the register file and set the 'Z' ccode bit.
mulu64_zero:
	clr.l		%d0
	clr.l		%d1

	mov.w		MUL64_CC(%a6),%d4
	andi.b		&0x10,%d4		# keep old 'X' bit only
	ori.b		&0x4,%d4
	mov.w		%d4,%cc			# set 'Z' ccode bit

	bra.b		mulu64_end

##########
# muls.l #
##########
	global		_060LSP__imuls64_
_060LSP__imuls64_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3c00,-(%sp)		# save d2-d5
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,MUL64_CC(%a6)	# save incoming ccodes

	mov.l		0x8(%a6),%d0		# store multiplier in d0
	beq.b		mulu64_zero		# handle zero separately

	mov.l		0xc(%a6),%d1		# get multiplicand in d1
	beq.b		mulu64_zero		# handle zero separately

# strip the operand signs, remembering the result sign in %d5.
	clr.b		%d5			# clear sign tag
	tst.l		%d0			# is multiplier negative?
	bge.b		muls64_chk_md_sgn	# no
	neg.l		%d0			# make multiplier positive

	ori.b		&0x1,%d5		# save multiplier sgn

# the result sign is the exclusive or of the operand sign bits.
muls64_chk_md_sgn:
	tst.l		%d1			# is multiplicand negative?
646*4882a593Smuzhiyun bge.b muls64_alg # no 647*4882a593Smuzhiyun neg.l %d1 # make multiplicand positive 648*4882a593Smuzhiyun 649*4882a593Smuzhiyun eori.b &0x1,%d5 # calculate correct sign 650*4882a593Smuzhiyun 651*4882a593Smuzhiyun######################################################################### 652*4882a593Smuzhiyun# 63 32 0 # 653*4882a593Smuzhiyun# ---------------------------- # 654*4882a593Smuzhiyun# | hi(mplier) * hi(mplicand)| # 655*4882a593Smuzhiyun# ---------------------------- # 656*4882a593Smuzhiyun# ----------------------------- # 657*4882a593Smuzhiyun# | hi(mplier) * lo(mplicand) | # 658*4882a593Smuzhiyun# ----------------------------- # 659*4882a593Smuzhiyun# ----------------------------- # 660*4882a593Smuzhiyun# | lo(mplier) * hi(mplicand) | # 661*4882a593Smuzhiyun# ----------------------------- # 662*4882a593Smuzhiyun# | ----------------------------- # 663*4882a593Smuzhiyun# --|-- | lo(mplier) * lo(mplicand) | # 664*4882a593Smuzhiyun# | ----------------------------- # 665*4882a593Smuzhiyun# ======================================================== # 666*4882a593Smuzhiyun# -------------------------------------------------------- # 667*4882a593Smuzhiyun# | hi(result) | lo(result) | # 668*4882a593Smuzhiyun# -------------------------------------------------------- # 669*4882a593Smuzhiyun######################################################################### 670*4882a593Smuzhiyunmuls64_alg: 671*4882a593Smuzhiyun# load temp registers with operands 672*4882a593Smuzhiyun mov.l %d0,%d2 # mr in d2 673*4882a593Smuzhiyun mov.l %d0,%d3 # mr in d3 674*4882a593Smuzhiyun mov.l %d1,%d4 # md in d4 675*4882a593Smuzhiyun swap %d3 # hi(mr) in lo d3 676*4882a593Smuzhiyun swap %d4 # hi(md) in lo d4 677*4882a593Smuzhiyun 678*4882a593Smuzhiyun# complete necessary multiplies: 679*4882a593Smuzhiyun mulu.w %d1,%d0 # [1] lo(mr) * lo(md) 680*4882a593Smuzhiyun mulu.w %d3,%d1 # [2] hi(mr) * lo(md) 681*4882a593Smuzhiyun mulu.w %d4,%d2 # [3] lo(mr) * hi(md) 
682*4882a593Smuzhiyun mulu.w %d4,%d3 # [4] hi(mr) * hi(md) 683*4882a593Smuzhiyun 684*4882a593Smuzhiyun# add lo portions of [2],[3] to hi portion of [1]. 685*4882a593Smuzhiyun# add carries produced from these adds to [4]. 686*4882a593Smuzhiyun# lo([1]) is the final lo 16 bits of the result. 687*4882a593Smuzhiyun clr.l %d4 # load d4 w/ zero value 688*4882a593Smuzhiyun swap %d0 # hi([1]) <==> lo([1]) 689*4882a593Smuzhiyun add.w %d1,%d0 # hi([1]) + lo([2]) 690*4882a593Smuzhiyun addx.l %d4,%d3 # [4] + carry 691*4882a593Smuzhiyun add.w %d2,%d0 # hi([1]) + lo([3]) 692*4882a593Smuzhiyun addx.l %d4,%d3 # [4] + carry 693*4882a593Smuzhiyun swap %d0 # lo([1]) <==> hi([1]) 694*4882a593Smuzhiyun 695*4882a593Smuzhiyun# lo portions of [2],[3] have been added in to final result. 696*4882a593Smuzhiyun# now, clear lo, put hi in lo reg, and add to [4] 697*4882a593Smuzhiyun clr.w %d1 # clear lo([2]) 698*4882a593Smuzhiyun clr.w %d2 # clear hi([3]) 699*4882a593Smuzhiyun swap %d1 # hi([2]) in lo d1 700*4882a593Smuzhiyun swap %d2 # hi([3]) in lo d2 701*4882a593Smuzhiyun add.l %d2,%d1 # [4] + hi([2]) 702*4882a593Smuzhiyun add.l %d3,%d1 # [4] + hi([3]) 703*4882a593Smuzhiyun 704*4882a593Smuzhiyun tst.b %d5 # should result be signed? 705*4882a593Smuzhiyun beq.b muls64_done # no 706*4882a593Smuzhiyun 707*4882a593Smuzhiyun# result should be a signed negative number. 
708*4882a593Smuzhiyun# compute 2's complement of the unsigned number: 709*4882a593Smuzhiyun# -negate all bits and add 1 710*4882a593Smuzhiyunmuls64_neg: 711*4882a593Smuzhiyun not.l %d0 # negate lo(result) bits 712*4882a593Smuzhiyun not.l %d1 # negate hi(result) bits 713*4882a593Smuzhiyun addq.l &1,%d0 # add 1 to lo(result) 714*4882a593Smuzhiyun addx.l %d4,%d1 # add carry to hi(result) 715*4882a593Smuzhiyun 716*4882a593Smuzhiyunmuls64_done: 717*4882a593Smuzhiyun mov.w MUL64_CC(%a6),%d4 718*4882a593Smuzhiyun andi.b &0x10,%d4 # keep old 'X' bit 719*4882a593Smuzhiyun tst.l %d1 # may set 'N' bit 720*4882a593Smuzhiyun bpl.b muls64_ddone 721*4882a593Smuzhiyun ori.b &0x8,%d4 # set 'N' bit 722*4882a593Smuzhiyunmuls64_ddone: 723*4882a593Smuzhiyun mov.w %d4,%cc 724*4882a593Smuzhiyun 725*4882a593Smuzhiyun# here, the result is in d1 and d0. the current strategy is to save 726*4882a593Smuzhiyun# the values at the location pointed to by a0. 727*4882a593Smuzhiyun# use movm here to not disturb the condition codes. 728*4882a593Smuzhiyunmuls64_end: 729*4882a593Smuzhiyun exg %d1,%d0 730*4882a593Smuzhiyun movm.l &0x0003,([0x10,%a6]) # save result at (a0) 731*4882a593Smuzhiyun 732*4882a593Smuzhiyun# EPILOGUE BEGIN ######################################################## 733*4882a593Smuzhiyun# fmovm.l (%sp)+,&0x0 # restore no fpregs 734*4882a593Smuzhiyun movm.l (%sp)+,&0x003c # restore d2-d5 735*4882a593Smuzhiyun unlk %a6 736*4882a593Smuzhiyun# EPILOGUE END ########################################################## 737*4882a593Smuzhiyun 738*4882a593Smuzhiyun rts 739*4882a593Smuzhiyun 740*4882a593Smuzhiyun# one or both of the operands is zero so the result is also zero. 741*4882a593Smuzhiyun# save the zero result to the register file and set the 'Z' ccode bit. 
742*4882a593Smuzhiyunmuls64_zero: 743*4882a593Smuzhiyun clr.l %d0 744*4882a593Smuzhiyun clr.l %d1 745*4882a593Smuzhiyun 746*4882a593Smuzhiyun mov.w MUL64_CC(%a6),%d4 747*4882a593Smuzhiyun andi.b &0x10,%d4 748*4882a593Smuzhiyun ori.b &0x4,%d4 749*4882a593Smuzhiyun mov.w %d4,%cc # set 'Z' ccode bit 750*4882a593Smuzhiyun 751*4882a593Smuzhiyun bra.b muls64_end 752*4882a593Smuzhiyun 753*4882a593Smuzhiyun######################################################################### 754*4882a593Smuzhiyun# XDEF **************************************************************** # 755*4882a593Smuzhiyun# _060LSP__cmp2_Ab_(): Emulate "cmp2.b An,<ea>". # 756*4882a593Smuzhiyun# _060LSP__cmp2_Aw_(): Emulate "cmp2.w An,<ea>". # 757*4882a593Smuzhiyun# _060LSP__cmp2_Al_(): Emulate "cmp2.l An,<ea>". # 758*4882a593Smuzhiyun# _060LSP__cmp2_Db_(): Emulate "cmp2.b Dn,<ea>". # 759*4882a593Smuzhiyun# _060LSP__cmp2_Dw_(): Emulate "cmp2.w Dn,<ea>". # 760*4882a593Smuzhiyun# _060LSP__cmp2_Dl_(): Emulate "cmp2.l Dn,<ea>". # 761*4882a593Smuzhiyun# # 762*4882a593Smuzhiyun# This is the library version which is accessed as a subroutine # 763*4882a593Smuzhiyun# and therefore does not work exactly like the 680X0 "cmp2" # 764*4882a593Smuzhiyun# instruction. 
# 765*4882a593Smuzhiyun# # 766*4882a593Smuzhiyun# XREF **************************************************************** # 767*4882a593Smuzhiyun# None # 768*4882a593Smuzhiyun# # 769*4882a593Smuzhiyun# INPUT *************************************************************** # 770*4882a593Smuzhiyun# 0x4(sp) = Rn # 771*4882a593Smuzhiyun# 0x8(sp) = pointer to boundary pair # 772*4882a593Smuzhiyun# # 773*4882a593Smuzhiyun# OUTPUT ************************************************************** # 774*4882a593Smuzhiyun# cc = condition codes are set correctly # 775*4882a593Smuzhiyun# # 776*4882a593Smuzhiyun# ALGORITHM *********************************************************** # 777*4882a593Smuzhiyun# In the interest of simplicity, all operands are converted to # 778*4882a593Smuzhiyun# longword size whether the operation is byte, word, or long. The # 779*4882a593Smuzhiyun# bounds are sign extended accordingly. If Rn is a data register, Rn is # 780*4882a593Smuzhiyun# also sign extended. If Rn is an address register, it need not be sign # 781*4882a593Smuzhiyun# extended since the full register is always used. # 782*4882a593Smuzhiyun# The condition codes are set correctly before the final "rts". 
# 783*4882a593Smuzhiyun# # 784*4882a593Smuzhiyun######################################################################### 785*4882a593Smuzhiyun 786*4882a593Smuzhiyunset CMP2_CC, -4 787*4882a593Smuzhiyun 788*4882a593Smuzhiyun global _060LSP__cmp2_Ab_ 789*4882a593Smuzhiyun_060LSP__cmp2_Ab_: 790*4882a593Smuzhiyun 791*4882a593Smuzhiyun# PROLOGUE BEGIN ######################################################## 792*4882a593Smuzhiyun link.w %a6,&-4 793*4882a593Smuzhiyun movm.l &0x3800,-(%sp) # save d2-d4 794*4882a593Smuzhiyun# fmovm.l &0x0,-(%sp) # save no fpregs 795*4882a593Smuzhiyun# PROLOGUE END ########################################################## 796*4882a593Smuzhiyun 797*4882a593Smuzhiyun mov.w %cc,CMP2_CC(%a6) 798*4882a593Smuzhiyun mov.l 0x8(%a6), %d2 # get regval 799*4882a593Smuzhiyun 800*4882a593Smuzhiyun mov.b ([0xc,%a6],0x0),%d0 801*4882a593Smuzhiyun mov.b ([0xc,%a6],0x1),%d1 802*4882a593Smuzhiyun 803*4882a593Smuzhiyun extb.l %d0 # sign extend lo bnd 804*4882a593Smuzhiyun extb.l %d1 # sign extend hi bnd 805*4882a593Smuzhiyun bra.w l_cmp2_cmp # go do the compare emulation 806*4882a593Smuzhiyun 807*4882a593Smuzhiyun global _060LSP__cmp2_Aw_ 808*4882a593Smuzhiyun_060LSP__cmp2_Aw_: 809*4882a593Smuzhiyun 810*4882a593Smuzhiyun# PROLOGUE BEGIN ######################################################## 811*4882a593Smuzhiyun link.w %a6,&-4 812*4882a593Smuzhiyun movm.l &0x3800,-(%sp) # save d2-d4 813*4882a593Smuzhiyun# fmovm.l &0x0,-(%sp) # save no fpregs 814*4882a593Smuzhiyun# PROLOGUE END ########################################################## 815*4882a593Smuzhiyun 816*4882a593Smuzhiyun mov.w %cc,CMP2_CC(%a6) 817*4882a593Smuzhiyun mov.l 0x8(%a6), %d2 # get regval 818*4882a593Smuzhiyun 819*4882a593Smuzhiyun mov.w ([0xc,%a6],0x0),%d0 820*4882a593Smuzhiyun mov.w ([0xc,%a6],0x2),%d1 821*4882a593Smuzhiyun 822*4882a593Smuzhiyun ext.l %d0 # sign extend lo bnd 823*4882a593Smuzhiyun ext.l %d1 # sign extend hi bnd 824*4882a593Smuzhiyun bra.w l_cmp2_cmp # go do the compare 
emulation 825*4882a593Smuzhiyun 826*4882a593Smuzhiyun global _060LSP__cmp2_Al_ 827*4882a593Smuzhiyun_060LSP__cmp2_Al_: 828*4882a593Smuzhiyun 829*4882a593Smuzhiyun# PROLOGUE BEGIN ######################################################## 830*4882a593Smuzhiyun link.w %a6,&-4 831*4882a593Smuzhiyun movm.l &0x3800,-(%sp) # save d2-d4 832*4882a593Smuzhiyun# fmovm.l &0x0,-(%sp) # save no fpregs 833*4882a593Smuzhiyun# PROLOGUE END ########################################################## 834*4882a593Smuzhiyun 835*4882a593Smuzhiyun mov.w %cc,CMP2_CC(%a6) 836*4882a593Smuzhiyun mov.l 0x8(%a6), %d2 # get regval 837*4882a593Smuzhiyun 838*4882a593Smuzhiyun mov.l ([0xc,%a6],0x0),%d0 839*4882a593Smuzhiyun mov.l ([0xc,%a6],0x4),%d1 840*4882a593Smuzhiyun bra.w l_cmp2_cmp # go do the compare emulation 841*4882a593Smuzhiyun 842*4882a593Smuzhiyun global _060LSP__cmp2_Db_ 843*4882a593Smuzhiyun_060LSP__cmp2_Db_: 844*4882a593Smuzhiyun 845*4882a593Smuzhiyun# PROLOGUE BEGIN ######################################################## 846*4882a593Smuzhiyun link.w %a6,&-4 847*4882a593Smuzhiyun movm.l &0x3800,-(%sp) # save d2-d4 848*4882a593Smuzhiyun# fmovm.l &0x0,-(%sp) # save no fpregs 849*4882a593Smuzhiyun# PROLOGUE END ########################################################## 850*4882a593Smuzhiyun 851*4882a593Smuzhiyun mov.w %cc,CMP2_CC(%a6) 852*4882a593Smuzhiyun mov.l 0x8(%a6), %d2 # get regval 853*4882a593Smuzhiyun 854*4882a593Smuzhiyun mov.b ([0xc,%a6],0x0),%d0 855*4882a593Smuzhiyun mov.b ([0xc,%a6],0x1),%d1 856*4882a593Smuzhiyun 857*4882a593Smuzhiyun extb.l %d0 # sign extend lo bnd 858*4882a593Smuzhiyun extb.l %d1 # sign extend hi bnd 859*4882a593Smuzhiyun 860*4882a593Smuzhiyun# operation is a data register compare. 861*4882a593Smuzhiyun# sign extend byte to long so we can do simple longword compares. 
862*4882a593Smuzhiyun extb.l %d2 # sign extend data byte 863*4882a593Smuzhiyun bra.w l_cmp2_cmp # go do the compare emulation 864*4882a593Smuzhiyun 865*4882a593Smuzhiyun global _060LSP__cmp2_Dw_ 866*4882a593Smuzhiyun_060LSP__cmp2_Dw_: 867*4882a593Smuzhiyun 868*4882a593Smuzhiyun# PROLOGUE BEGIN ######################################################## 869*4882a593Smuzhiyun link.w %a6,&-4 870*4882a593Smuzhiyun movm.l &0x3800,-(%sp) # save d2-d4 871*4882a593Smuzhiyun# fmovm.l &0x0,-(%sp) # save no fpregs 872*4882a593Smuzhiyun# PROLOGUE END ########################################################## 873*4882a593Smuzhiyun 874*4882a593Smuzhiyun mov.w %cc,CMP2_CC(%a6) 875*4882a593Smuzhiyun mov.l 0x8(%a6), %d2 # get regval 876*4882a593Smuzhiyun 877*4882a593Smuzhiyun mov.w ([0xc,%a6],0x0),%d0 878*4882a593Smuzhiyun mov.w ([0xc,%a6],0x2),%d1 879*4882a593Smuzhiyun 880*4882a593Smuzhiyun ext.l %d0 # sign extend lo bnd 881*4882a593Smuzhiyun ext.l %d1 # sign extend hi bnd 882*4882a593Smuzhiyun 883*4882a593Smuzhiyun# operation is a data register compare. 884*4882a593Smuzhiyun# sign extend word to long so we can do simple longword compares. 
885*4882a593Smuzhiyun ext.l %d2 # sign extend data word 886*4882a593Smuzhiyun bra.w l_cmp2_cmp # go emulate compare 887*4882a593Smuzhiyun 888*4882a593Smuzhiyun global _060LSP__cmp2_Dl_ 889*4882a593Smuzhiyun_060LSP__cmp2_Dl_: 890*4882a593Smuzhiyun 891*4882a593Smuzhiyun# PROLOGUE BEGIN ######################################################## 892*4882a593Smuzhiyun link.w %a6,&-4 893*4882a593Smuzhiyun movm.l &0x3800,-(%sp) # save d2-d4 894*4882a593Smuzhiyun# fmovm.l &0x0,-(%sp) # save no fpregs 895*4882a593Smuzhiyun# PROLOGUE END ########################################################## 896*4882a593Smuzhiyun 897*4882a593Smuzhiyun mov.w %cc,CMP2_CC(%a6) 898*4882a593Smuzhiyun mov.l 0x8(%a6), %d2 # get regval 899*4882a593Smuzhiyun 900*4882a593Smuzhiyun mov.l ([0xc,%a6],0x0),%d0 901*4882a593Smuzhiyun mov.l ([0xc,%a6],0x4),%d1 902*4882a593Smuzhiyun 903*4882a593Smuzhiyun# 904*4882a593Smuzhiyun# To set the ccodes correctly: 905*4882a593Smuzhiyun# (1) save 'Z' bit from (Rn - lo) 906*4882a593Smuzhiyun# (2) save 'Z' and 'N' bits from ((hi - lo) - (Rn - hi)) 907*4882a593Smuzhiyun# (3) keep 'X', 'N', and 'V' from before instruction 908*4882a593Smuzhiyun# (4) combine ccodes 909*4882a593Smuzhiyun# 910*4882a593Smuzhiyunl_cmp2_cmp: 911*4882a593Smuzhiyun sub.l %d0, %d2 # (Rn - lo) 912*4882a593Smuzhiyun mov.w %cc, %d3 # fetch resulting ccodes 913*4882a593Smuzhiyun andi.b &0x4, %d3 # keep 'Z' bit 914*4882a593Smuzhiyun sub.l %d0, %d1 # (hi - lo) 915*4882a593Smuzhiyun cmp.l %d1,%d2 # ((hi - lo) - (Rn - hi)) 916*4882a593Smuzhiyun 917*4882a593Smuzhiyun mov.w %cc, %d4 # fetch resulting ccodes 918*4882a593Smuzhiyun or.b %d4, %d3 # combine w/ earlier ccodes 919*4882a593Smuzhiyun andi.b &0x5, %d3 # keep 'Z' and 'N' 920*4882a593Smuzhiyun 921*4882a593Smuzhiyun mov.w CMP2_CC(%a6), %d4 # fetch old ccodes 922*4882a593Smuzhiyun andi.b &0x1a, %d4 # keep 'X','N','V' bits 923*4882a593Smuzhiyun or.b %d3, %d4 # insert new ccodes 924*4882a593Smuzhiyun mov.w %d4,%cc # save new ccodes 925*4882a593Smuzhiyun 
926*4882a593Smuzhiyun# EPILOGUE BEGIN ######################################################## 927*4882a593Smuzhiyun# fmovm.l (%sp)+,&0x0 # restore no fpregs 928*4882a593Smuzhiyun movm.l (%sp)+,&0x001c # restore d2-d4 929*4882a593Smuzhiyun unlk %a6 930*4882a593Smuzhiyun# EPILOGUE END ########################################################## 931*4882a593Smuzhiyun 932*4882a593Smuzhiyun rts 933