1 /* This is an assembly language implementation of mulsi3, divsi3, and modsi3
2 for the sparc processor.
4 These routines are derived from the SPARC Architecture Manual, version 8,
5 slightly edited to match the desired calling convention, and also to
6 optimize them for our purposes. */
14 or %o0, %o1, %o4 ! logical or of multiplier and multiplicand
15 mov %o0, %y ! multiplier to Y register
16 andncc %o4, 0xfff, %o5 ! mask out lower 12 bits
17 be mul_shortway ! can do it the short way
18 andcc %g0, %g0, %o4 ! zero the partial product and clear NV cc
22 mulscc %o4, %o1, %o4 ! first iteration of 33
53 mulscc %o4, %o1, %o4 ! 32nd iteration
54 mulscc %o4, %g0, %o4 ! last iteration only shifts
55 ! the upper 32 bits of product are wrong, but we do not care
62 mulscc %o4, %o1, %o4 ! first iteration of 13
73 mulscc %o4, %o1, %o4 ! 12th iteration
74 mulscc %o4, %g0, %o4 ! last iteration only shifts
76 sll %o4, 12, %o4 ! left shift partial product by 12 bits
77 srl %o5, 20, %o5 ! right shift partial product by 20 bits
79 or %o5, %o4, %o0 ! merge for true product
84 * Division and remainder, from Appendix E of the SPARC Version 8
85 * Architecture Manual, with fixes from Gordon Irlam.
89 * Input: dividend and divisor in %o0 and %o1 respectively.
92 * .div name of function to generate
93 * div div=div => %o0 / %o1; div=rem => %o0 % %o1
94 * true true=true => signed; true=false => unsigned
96 * Algorithm parameters:
97 * N how many bits per iteration we try to get (4)
98 * WORDSIZE total number of bits (32)
101 * TOPBITS number of bits in the top decade of a number
103 * Important variables:
104 * Q the partial quotient under development (initially 0)
105 * R the remainder so far, initially the dividend
106 * ITER number of main division loop iterations required;
107 * equal to ceil(log2(quotient) / N). Note that this
108 * is the log base (2^N) of the quotient.
109 * V the current comparand, initially divisor*2^(ITER*N-1)
112 * Current estimate for non-large dividend is
113 * ceil(log2(quotient) / N) * (10 + 7N/2) + C
114 * A large dividend is one greater than 2^(31-TOPBITS) and takes a
115 * different path, as the upper bits of the quotient must be developed
124 mov 0, %g3 ! result is always positive
131 ! compute sign of result; if neither is negative, no problem
132 orcc %o1, %o0, %g0 ! either negative?
133 bge ready_to_divide ! no, go do the divide
134 xor %o1, %o0, %g3 ! compute sign in any case
138 ! %o1 is definitely negative; %o0 might also be negative
139 bge ready_to_divide ! if %o0 not negative...
140 sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
141 1: ! %o0 is negative, %o1 is nonnegative
142 sub %g0, %o0, %o0 ! make %o0 nonnegative
147 ! Ready to divide. Compute size of quotient; scale comparand.
152 ! Divide by zero trap. If it returns, return 0 (about as
153 ! wrong as possible, but that is what SunOS does...).
159 cmp %o3, %o5 ! if %o1 exceeds %o0, done
160 blu got_result ! (and algorithm fails otherwise)
162 sethi %hi(1 << (32 - 4 - 1)), %g1
167 ! Here the dividend is >= 2**(31-N) or so. We must be careful here,
168 ! as our usual N-at-a-shot divide step will cause overflow and havoc.
169 ! The number of bits in the result here is N*ITER+SC, where SC <= N.
170 ! Compute ITER in an unorthodox manner: know we need to shift V into
171 ! the top decade: so do not even bother to compare to R.
181 2: addcc %o5, %o5, %o5
185 ! We get here if the %o1 overflowed while shifting.
186 ! This means that %o3 has the high-order bit set.
187 ! Restore %o5 and subtract from %o3.
188 sll %g1, 4, %g1 ! high order bit
189 srl %o5, 1, %o5 ! rest of %o5
200 /* NB: these are commented out in the V8-SPARC manual as well */
201 /* (I do not understand this) */
202 ! %o5 > %o3: went too far: back up 1 step
205 ! do single-bit divide steps
207 ! We have to be careful here. We know that %o3 >= %o5, so we can do the
208 ! first divide step without thinking. BUT, the others are conditional,
209 ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
210 ! order bit set in the first step, just falling into the regular
211 ! division loop will mess up the first time around.
212 ! So we unroll slightly...
215 bl end_regular_divide
237 b,a end_regular_divide
248 tst %o3 ! set up for initial iteration
251 ! depth 1, accumulated bits 0
254 ! remainder is positive
256 ! depth 2, accumulated bits 1
259 ! remainder is positive
261 ! depth 3, accumulated bits 3
264 ! remainder is positive
266 ! depth 4, accumulated bits 7
269 ! remainder is positive
272 add %o2, (7*2+1), %o2
275 ! remainder is negative
278 add %o2, (7*2-1), %o2
282 ! remainder is negative
284 ! depth 4, accumulated bits 5
287 ! remainder is positive
290 add %o2, (5*2+1), %o2
293 ! remainder is negative
296 add %o2, (5*2-1), %o2
299 ! remainder is negative
301 ! depth 3, accumulated bits 1
304 ! remainder is positive
306 ! depth 4, accumulated bits 3
309 ! remainder is positive
312 add %o2, (3*2+1), %o2
315 ! remainder is negative
318 add %o2, (3*2-1), %o2
321 ! remainder is negative
323 ! depth 4, accumulated bits 1
326 ! remainder is positive
329 add %o2, (1*2+1), %o2
332 ! remainder is negative
335 add %o2, (1*2-1), %o2
338 ! remainder is negative
340 ! depth 2, accumulated bits -1
343 ! remainder is positive
345 ! depth 3, accumulated bits -1
348 ! remainder is positive
350 ! depth 4, accumulated bits -1
353 ! remainder is positive
356 add %o2, (-1*2+1), %o2
359 ! remainder is negative
362 add %o2, (-1*2-1), %o2
365 ! remainder is negative
367 ! depth 4, accumulated bits -3
370 ! remainder is positive
373 add %o2, (-3*2+1), %o2
376 ! remainder is negative
379 add %o2, (-3*2-1), %o2
382 ! remainder is negative
384 ! depth 3, accumulated bits -3
387 ! remainder is positive
389 ! depth 4, accumulated bits -5
392 ! remainder is positive
395 add %o2, (-5*2+1), %o2
398 ! remainder is negative
401 add %o2, (-5*2-1), %o2
404 ! remainder is negative
406 ! depth 4, accumulated bits -7
409 ! remainder is positive
412 add %o2, (-7*2+1), %o2
415 ! remainder is negative
418 add %o2, (-7*2-1), %o2
426 ! non-restoring fixup here (one instruction only!)
431 ! check to see if answer should be < 0
441 /* This implementation was taken from glibc:
443 * Input: dividend and divisor in %o0 and %o1 respectively.
445 * Algorithm parameters:
446 * N how many bits per iteration we try to get (4)
447 * WORDSIZE total number of bits (32)
450 * TOPBITS number of bits in the top decade of a number
452 * Important variables:
453 * Q the partial quotient under development (initially 0)
454 * R the remainder so far, initially the dividend
455 * ITER number of main division loop iterations required;
456 * equal to ceil(log2(quotient) / N). Note that this
457 * is the log base (2^N) of the quotient.
458 * V the current comparand, initially divisor*2^(ITER*N-1)
461 * Current estimate for non-large dividend is
462 * ceil(log2(quotient) / N) * (10 + 7N/2) + C
463 * A large dividend is one greater than 2^(31-TOPBITS) and takes a
464 * different path, as the upper bits of the quotient must be developed
473 mov 0, %g3 ! result always positive
479 ! compute sign of result; if neither is negative, no problem
480 orcc %o1, %o0, %g0 ! either negative?
481 bge 2f ! no, go do the divide
482 mov %o0, %g3 ! sign of remainder matches %o0
486 ! %o1 is definitely negative; %o0 might also be negative
487 bge 2f ! if %o0 not negative...
488 sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
489 1: ! %o0 is negative, %o1 is nonnegative
490 sub %g0, %o0, %o0 ! make %o0 nonnegative
493 ! Ready to divide. Compute size of quotient; scale comparand.
499 ! Divide by zero trap. If it returns, return 0 (about as
500 ! wrong as possible, but that is what SunOS does...).
506 cmp %o3, %o5 ! if %o1 exceeds %o0, done
507 blu got_result ! (and algorithm fails otherwise)
509 sethi %hi(1 << (32 - 4 - 1)), %g1
514 ! Here the dividend is >= 2**(31-N) or so. We must be careful here,
515 ! as our usual N-at-a-shot divide step will cause overflow and havoc.
516 ! The number of bits in the result here is N*ITER+SC, where SC <= N.
517 ! Compute ITER in an unorthodox manner: know we need to shift V into
518 ! the top decade: so do not even bother to compare to R.
528 2: addcc %o5, %o5, %o5
532 ! We get here if the %o1 overflowed while shifting.
533 ! This means that %o3 has the high-order bit set.
534 ! Restore %o5 and subtract from %o3.
535 sll %g1, 4, %g1 ! high order bit
536 srl %o5, 1, %o5 ! rest of %o5
547 /* NB: these are commented out in the V8-SPARC manual as well */
548 /* (I do not understand this) */
549 ! %o5 > %o3: went too far: back up 1 step
552 ! do single-bit divide steps
554 ! We have to be careful here. We know that %o3 >= %o5, so we can do the
555 ! first divide step without thinking. BUT, the others are conditional,
556 ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
557 ! order bit set in the first step, just falling into the regular
558 ! division loop will mess up the first time around.
559 ! So we unroll slightly...
562 bl end_regular_divide
584 b,a end_regular_divide
595 tst %o3 ! set up for initial iteration
598 ! depth 1, accumulated bits 0
601 ! remainder is positive
603 ! depth 2, accumulated bits 1
606 ! remainder is positive
608 ! depth 3, accumulated bits 3
611 ! remainder is positive
613 ! depth 4, accumulated bits 7
616 ! remainder is positive
619 add %o2, (7*2+1), %o2
621 ! remainder is negative
624 add %o2, (7*2-1), %o2
627 ! remainder is negative
629 ! depth 4, accumulated bits 5
632 ! remainder is positive
635 add %o2, (5*2+1), %o2
638 ! remainder is negative
641 add %o2, (5*2-1), %o2
644 ! remainder is negative
646 ! depth 3, accumulated bits 1
649 ! remainder is positive
651 ! depth 4, accumulated bits 3
654 ! remainder is positive
657 add %o2, (3*2+1), %o2
660 ! remainder is negative
663 add %o2, (3*2-1), %o2
666 ! remainder is negative
668 ! depth 4, accumulated bits 1
671 ! remainder is positive
674 add %o2, (1*2+1), %o2
677 ! remainder is negative
680 add %o2, (1*2-1), %o2
683 ! remainder is negative
685 ! depth 2, accumulated bits -1
688 ! remainder is positive
690 ! depth 3, accumulated bits -1
693 ! remainder is positive
695 ! depth 4, accumulated bits -1
698 ! remainder is positive
701 add %o2, (-1*2+1), %o2
704 ! remainder is negative
707 add %o2, (-1*2-1), %o2
710 ! remainder is negative
712 ! depth 4, accumulated bits -3
715 ! remainder is positive
718 add %o2, (-3*2+1), %o2
721 ! remainder is negative
724 add %o2, (-3*2-1), %o2
727 ! remainder is negative
729 ! depth 3, accumulated bits -3
732 ! remainder is positive
734 ! depth 4, accumulated bits -5
737 ! remainder is positive
740 add %o2, (-5*2+1), %o2
743 ! remainder is negative
746 add %o2, (-5*2-1), %o2
749 ! remainder is negative
751 ! depth 4, accumulated bits -7
754 ! remainder is positive
757 add %o2, (-7*2+1), %o2
760 ! remainder is negative
763 add %o2, (-7*2-1), %o2
771 ! non-restoring fixup here (one instruction only!)
775 ! check to see if answer should be < 0