1 /* This is an assembly language implementation of mulsi3
, divsi3
, and modsi3
2 for the sparc processor.
4 These routines are derived from the SPARC Architecture Manual
, version
8,
5 slightly edited to match the desired calling convention
, and also to
6 optimize them for our purposes.
*/
14 or %o0
, %o1
, %o4
! logical
or of multiplier
and multiplicand
15 mov %o0
, %y
! multiplier to Y register
16 andncc
%o4
, 0xfff, %o5
! mask out lower
12 bits
17 be mul_shortway
! can do it the
short way
18 andcc
%g0
, %g0
, %o4
! zero the partial product
and clear NV cc
22 mulscc
%o4
, %o1
, %o4
! first iteration of
33
53 mulscc
%o4
, %o1
, %o4
! 32nd iteration
54 mulscc
%o4
, %g0
, %o4
! last iteration only shifts
55 ! the upper
32 bits of product are wrong
, but we do
not care
62 mulscc
%o4
, %o1
, %o4
! first iteration of
13
73 mulscc
%o4
, %o1
, %o4
! 12th iteration
74 mulscc
%o4
, %g0
, %o4
! last iteration only shifts
76 sll
%o4
, 12, %o4
! left shift partial product by
12 bits
77 srl
%o5
, 20, %o5
! right shift partial product by
20 bits
79 or %o5
, %o4
, %o0
! merge for true product
84 * Division
and remainder
, from Appendix E of the SPARC Version
8
85 * Architecture Manual
, with fixes from Gordon Irlam.
89 * Input: dividend
and divisor
in %o0
and %o1 respectively.
92 * .
div name of function to generate
93 * div div=div => %o0
/ %o1
; div=rem => %o0 % %o1
94 * true true
=true
=> signed
; true=false => unsigned
96 * Algorithm
parameters:
97 * N how many bits per iteration we try to get
(4)
98 * WORDSIZE total number of bits
(32)
101 * TOPBITS number of bits
in the top decade of a number
103 * Important
variables:
104 * Q the partial quotient under development
(initially
0)
105 * R the remainder so
far, initially the dividend
106 * ITER number of main division
loop iterations required
;
107 * equal to ceil
(log2
(quotient
) / N
). Note that
this
108 * is the log base
(2^N
) of the quotient.
109 * V the current comparand
, initially divisor
*2^
(ITER
*N
-1)
112 * Current estimate for non
-large dividend is
113 * ceil
(log2
(quotient
) / N
) * (10 + 7N
/2) + C
114 * A
large dividend is one greater than
2^
(31-TOPBITS
) and takes a
115 * different path
, as the upper bits of the quotient must be developed
124 mov 0, %g3
! result is always positive
131 ! compute sign of result
; if neither is negative, no problem
132 orcc
%o1
, %o0
, %g0
! either negative
?
133 bge ready_to_divide
! no
, go do the divide
134 xor %o1
, %o0
, %g3
! compute sign
in any case
138 ! %o1 is definitely negative
; %o0 might also be negative
139 bge ready_to_divide
! if
%o0
not negative...
140 sub %g0
, %o1
, %o1
! in any case
, make
%o1 nonneg
141 1: ! %o0 is negative
, %o1 is nonnegative
142 sub %g0
, %o0
, %o0
! make
%o0 nonnegative
147 ! Ready to divide. Compute
size of quotient
; scale comparand.
152 ! Divide by zero trap. If it returns
, return
0 (about as
153 ! wrong as possible
, but that is what SunOS does...
).
159 cmp %o3
, %o5
! if
%o1 exceeds
%o0
, done
160 blu got_result
! (and algorithm fails otherwise
)
162 sethi
%hi
(1 << (32 - 4 - 1)), %g1
167 ! Here the dividend is
>= 2**(31-N
) or so. We must be careful here
,
168 ! as our usual N
-at
-a
-shot divide step will cause overflow
and havoc.
169 ! The number of bits
in the result here is N
*ITER
+SC
, where SC
<= N.
170 ! Compute ITER
in an unorthodox
manner: know we need to shift V
into
171 ! the top
decade: so do
not even bother to compare to R.
181 2: addcc
%o5
, %o5
, %o5
185 ! We get here if the
%o1 overflowed while shifting.
186 ! This means that
%o3 has the
high-order bit set.
187 ! Restore
%o5
and subtract from
%o3.
188 sll
%g1
, 4, %g1
! high order bit
189 srl
%o5
, 1, %o5
! rest of
%o5
200 /* NB: these are commented
out in the V8
-SPARC manual as well
*/
201 /* (I do
not understand
this) */
202 ! %o5
> %o3: went too
far: back up
1 step
205 ! do single
-bit divide steps
207 ! We have to be careful here. We know that
%o3
>= %o5
, so we can do the
208 ! first divide step without thinking. BUT
, the others are conditional
,
209 ! and are only done if
%o3
>= 0. Because both
%o3
and %o5 may have the
high-
210 ! order bit set
in the first step
, just falling
into the regular
211 ! division
loop will mess up the first time around.
212 ! So we unroll slightly...
215 bl end_regular_divide
237 b
,a end_regular_divide
248 tst
%o3
! set up for initial iteration
251 ! depth
1, accumulated bits
0
254 ! remainder is positive
256 ! depth
2, accumulated bits
1
259 ! remainder is positive
261 ! depth
3, accumulated bits
3
264 ! remainder is positive
266 ! depth
4, accumulated bits
7
269 ! remainder is positive
272 add %o2
, (7*2+1), %o2
275 ! remainder is negative
278 add %o2
, (7*2-1), %o2
282 ! remainder is negative
284 ! depth
4, accumulated bits
5
287 ! remainder is positive
290 add %o2
, (5*2+1), %o2
293 ! remainder is negative
296 add %o2
, (5*2-1), %o2
299 ! remainder is negative
301 ! depth
3, accumulated bits
1
304 ! remainder is positive
306 ! depth
4, accumulated bits
3
309 ! remainder is positive
312 add %o2
, (3*2+1), %o2
315 ! remainder is negative
318 add %o2
, (3*2-1), %o2
321 ! remainder is negative
323 ! depth
4, accumulated bits
1
326 ! remainder is positive
329 add %o2
, (1*2+1), %o2
332 ! remainder is negative
335 add %o2
, (1*2-1), %o2
338 ! remainder is negative
340 ! depth
2, accumulated bits
-1
343 ! remainder is positive
345 ! depth
3, accumulated bits
-1
348 ! remainder is positive
350 ! depth
4, accumulated bits
-1
353 ! remainder is positive
356 add %o2
, (-1*2+1), %o2
359 ! remainder is negative
362 add %o2
, (-1*2-1), %o2
365 ! remainder is negative
367 ! depth
4, accumulated bits
-3
370 ! remainder is positive
373 add %o2
, (-3*2+1), %o2
376 ! remainder is negative
379 add %o2
, (-3*2-1), %o2
382 ! remainder is negative
384 ! depth
3, accumulated bits
-3
387 ! remainder is positive
389 ! depth
4, accumulated bits
-5
392 ! remainder is positive
395 add %o2
, (-5*2+1), %o2
398 ! remainder is negative
401 add %o2
, (-5*2-1), %o2
404 ! remainder is negative
406 ! depth
4, accumulated bits
-7
409 ! remainder is positive
412 add %o2
, (-7*2+1), %o2
415 ! remainder is negative
418 add %o2
, (-7*2-1), %o2
426 ! non
-restoring fixup here
(one instruction only
!)
431 ! check to see if answer should be
< 0
441 /* This implementation was taken from
glibc:
443 * Input: dividend
and divisor
in %o0
and %o1 respectively.
445 * Algorithm
parameters:
446 * N how many bits per iteration we try to get
(4)
447 * WORDSIZE total number of bits
(32)
450 * TOPBITS number of bits
in the top decade of a number
452 * Important
variables:
453 * Q the partial quotient under development
(initially
0)
454 * R the remainder so
far, initially the dividend
455 * ITER number of main division
loop iterations required
;
456 * equal to ceil
(log2
(quotient
) / N
). Note that
this
457 * is the log base
(2^N
) of the quotient.
458 * V the current comparand
, initially divisor
*2^
(ITER
*N
-1)
461 * Current estimate for non
-large dividend is
462 * ceil
(log2
(quotient
) / N
) * (10 + 7N
/2) + C
463 * A
large dividend is one greater than
2^
(31-TOPBITS
) and takes a
464 * different path
, as the upper bits of the quotient must be developed
473 mov 0, %g3
! result always positive
479 ! compute sign of result
; if neither is negative, no problem
480 orcc
%o1
, %o0
, %g0
! either negative
?
481 bge
2f
! no
, go do the divide
482 mov %o0
, %g3
! sign of remainder matches
%o0
486 ! %o1 is definitely negative
; %o0 might also be negative
487 bge
2f
! if
%o0
not negative...
488 sub %g0
, %o1
, %o1
! in any case
, make
%o1 nonneg
489 1: ! %o0 is negative
, %o1 is nonnegative
490 sub %g0
, %o0
, %o0
! make
%o0 nonnegative
493 ! Ready to divide. Compute
size of quotient
; scale comparand.
499 ! Divide by zero trap. If it returns
, return
0 (about as
500 ! wrong as possible
, but that is what SunOS does...
).
506 cmp %o3
, %o5
! if
%o1 exceeds
%o0
, done
507 blu got_result
! (and algorithm fails otherwise
)
509 sethi
%hi
(1 << (32 - 4 - 1)), %g1
514 ! Here the dividend is
>= 2**(31-N
) or so. We must be careful here
,
515 ! as our usual N
-at
-a
-shot divide step will cause overflow
and havoc.
516 ! The number of bits
in the result here is N
*ITER
+SC
, where SC
<= N.
517 ! Compute ITER
in an unorthodox
manner: know we need to shift V
into
518 ! the top
decade: so do
not even bother to compare to R.
528 2: addcc
%o5
, %o5
, %o5
532 ! We get here if the
%o1 overflowed while shifting.
533 ! This means that
%o3 has the
high-order bit set.
534 ! Restore
%o5
and subtract from
%o3.
535 sll
%g1
, 4, %g1
! high order bit
536 srl
%o5
, 1, %o5
! rest of
%o5
547 /* NB: these are commented
out in the V8
-SPARC manual as well
*/
548 /* (I do
not understand
this) */
549 ! %o5
> %o3: went too
far: back up
1 step
552 ! do single
-bit divide steps
554 ! We have to be careful here. We know that
%o3
>= %o5
, so we can do the
555 ! first divide step without thinking. BUT
, the others are conditional
,
556 ! and are only done if
%o3
>= 0. Because both
%o3
and %o5 may have the
high-
557 ! order bit set
in the first step
, just falling
into the regular
558 ! division
loop will mess up the first time around.
559 ! So we unroll slightly...
562 bl end_regular_divide
584 b
,a end_regular_divide
595 tst
%o3
! set up for initial iteration
598 ! depth
1, accumulated bits
0
601 ! remainder is positive
603 ! depth
2, accumulated bits
1
606 ! remainder is positive
608 ! depth
3, accumulated bits
3
611 ! remainder is positive
613 ! depth
4, accumulated bits
7
616 ! remainder is positive
619 add %o2
, (7*2+1), %o2
621 ! remainder is negative
624 add %o2
, (7*2-1), %o2
627 ! remainder is negative
629 ! depth
4, accumulated bits
5
632 ! remainder is positive
635 add %o2
, (5*2+1), %o2
638 ! remainder is negative
641 add %o2
, (5*2-1), %o2
644 ! remainder is negative
646 ! depth
3, accumulated bits
1
649 ! remainder is positive
651 ! depth
4, accumulated bits
3
654 ! remainder is positive
657 add %o2
, (3*2+1), %o2
660 ! remainder is negative
663 add %o2
, (3*2-1), %o2
666 ! remainder is negative
668 ! depth
4, accumulated bits
1
671 ! remainder is positive
674 add %o2
, (1*2+1), %o2
677 ! remainder is negative
680 add %o2
, (1*2-1), %o2
683 ! remainder is negative
685 ! depth
2, accumulated bits
-1
688 ! remainder is positive
690 ! depth
3, accumulated bits
-1
693 ! remainder is positive
695 ! depth
4, accumulated bits
-1
698 ! remainder is positive
701 add %o2
, (-1*2+1), %o2
704 ! remainder is negative
707 add %o2
, (-1*2-1), %o2
710 ! remainder is negative
712 ! depth
4, accumulated bits
-3
715 ! remainder is positive
718 add %o2
, (-3*2+1), %o2
721 ! remainder is negative
724 add %o2
, (-3*2-1), %o2
727 ! remainder is negative
729 ! depth
3, accumulated bits
-3
732 ! remainder is positive
734 ! depth
4, accumulated bits
-5
737 ! remainder is positive
740 add %o2
, (-5*2+1), %o2
743 ! remainder is negative
746 add %o2
, (-5*2-1), %o2
749 ! remainder is negative
751 ! depth
4, accumulated bits
-7
754 ! remainder is positive
757 add %o2
, (-7*2+1), %o2
760 ! remainder is negative
763 add %o2
, (-7*2-1), %o2
771 ! non
-restoring fixup here
(one instruction only
!)
775 ! check to see if answer should be
< 0