/* Repository page header (not part of the assembly source):
   Add AMD SSE5 support; Add iterator over function arguments; Add stdarg_p, prototype_p...
   [official-gcc.git] / gcc / config / sparc / lb1spc.asm
   blob b60bd5740e76bce5e8bfc2187b4ba2523a3500b9  */
/* This is an assembly language implementation of mulsi3, divsi3, and modsi3
   for the sparc processor.

   These routines are derived from the SPARC Architecture Manual, version 8,
   slightly edited to match the desired calling convention, and also to
   optimize them for our purposes.  */

#ifdef L_mulsi3
/*
 * .umul -- 32-bit multiply built from mulscc (multiply step) instructions.
 *
 * In:    %o0 = multiplier, %o1 = multiplicand
 * Out:   %o0 = low 32 bits of the product
 * Uses:  %o4 (partial product), %o5 (scratch), %y, integer condition codes
 *
 * If neither operand has a bit set above bit 11, the short path is taken:
 * 12 multiply steps instead of 32, followed by a merge of the two halves
 * of the result.
 */
	.text
	.align 4
	.global .umul
	.proc 4
.umul:
	or	%o0, %o1, %o4	! logical or of multiplier and multiplicand
	mov	%o0, %y		! multiplier to Y register
	andncc	%o4, 0xfff, %o5	! mask out lower 12 bits
	be	mul_shortway	! can do it the short way
	andcc	%g0, %g0, %o4	! (delay slot) zero the partial product and clear NV cc
	!
	! long multiply
	!
	mulscc	%o4, %o1, %o4	! first iteration of 33
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4	! 8th iteration
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4	! 16th iteration
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4	! 24th iteration
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4	! 32nd iteration
	mulscc	%o4, %g0, %o4	! last iteration only shifts
	! the upper 32 bits of product are wrong, but we do not care
	retl
	rd	%y, %o0		! (delay slot) result is taken from Y
	!
	! short multiply
	!
mul_shortway:
	mulscc	%o4, %o1, %o4	! first iteration of 13
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4	! 12th iteration
	mulscc	%o4, %g0, %o4	! last iteration only shifts
	rd	%y, %o5
	sll	%o4, 12, %o4	! left shift partial product by 12 bits
	srl	%o5, 20, %o5	! right shift partial product by 20 bits
	retl
	or	%o5, %o4, %o0	! (delay slot) merge for true product
#endif
#ifdef L_divsi3
/*
 * Division and remainder, from Appendix E of the SPARC Version 8
 * Architecture Manual, with fixes from Gordon Irlam.
 */

/*
 * Input: dividend and divisor in %o0 and %o1 respectively.
 *
 * m4 parameters:
 *  .div	name of function to generate
 *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1
 *  true	true=true => signed; true=false => unsigned
 *
 * Algorithm parameters:
 *  N		how many bits per iteration we try to get (4)
 *  WORDSIZE	total number of bits (32)
 *
 * Derived constants:
 *  TOPBITS	number of bits in the top decade of a number
 *
 * Important variables:
 *  Q		the partial quotient under development (initially 0)
 *  R		the remainder so far, initially the dividend
 *  ITER	number of main division loop iterations required;
 *		equal to ceil(log2(quotient) / N).  Note that this
 *		is the log base (2^N) of the quotient.
 *  V		the current comparand, initially divisor*2^(ITER*N-1)
 *
 * Cost:
 *  Current estimate for non-large dividend is
 *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
 *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
 *  different path, as the upper bits of the quotient must be developed
 *  one bit at a time.
 */

/*
 * Entry points: .udiv (unsigned) and .div (signed).
 *
 * Register usage, as read from the code below:
 *  %o0	dividend on entry, quotient on return
 *  %o1	divisor on entry (negated in place by .div when negative)
 *  %o2	Q, the quotient under development
 *  %o3	R, the running remainder
 *  %o4	count of 4-bit divloop passes still to run
 *  %o5	V, the scaled comparand
 *  %g1	the constant 1 << (32 - 4 - 1), later shifted to the high-order bit
 *  %g2	count of single-bit steps for the large-dividend path
 *  %g3	sign word: the quotient is negated on return when %g3 is negative
 *
 * A zero divisor raises trap 0x2 (ST_DIV0); if the trap returns, 0 is returned.
 */
	.text
	.align 4
	.global .udiv
	.proc 4
.udiv:
	b	ready_to_divide
	mov	0, %g3		! (delay slot) result is always positive

	.text
	.align 4
	.global .div
	.proc 4
.div:
	! compute sign of result; if neither is negative, no problem
	orcc	%o1, %o0, %g0	! either negative?
	bge	ready_to_divide	! no, go do the divide
	xor	%o1, %o0, %g3	! (delay slot) compute sign in any case
	tst	%o1
	bge	1f
	tst	%o0		! (delay slot) cc now reflect %o0 on both paths
	! %o1 is definitely negative; %o0 might also be negative
	bge	ready_to_divide	! if %o0 not negative...
	sub	%g0, %o1, %o1	! in any case, make %o1 nonneg
1:	! %o0 is negative, %o1 is nonnegative
	sub	%g0, %o0, %o0	! make %o0 nonnegative


ready_to_divide:

	! Ready to divide.  Compute size of quotient; scale comparand.
	orcc	%o1, %g0, %o5
	bne	1f
	mov	%o0, %o3

	! Divide by zero trap.  If it returns, return 0 (about as
	! wrong as possible, but that is what SunOS does...).
	ta	0x2		! ST_DIV0
	retl
	clr	%o0

1:
	cmp	%o3, %o5	! if %o1 exceeds %o0, done
	blu	got_result	! (and algorithm fails otherwise)
	clr	%o2
	sethi	%hi(1 << (32 - 4 - 1)), %g1
	cmp	%o3, %g1
	blu	not_really_big
	clr	%o4

	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
	! as our usual N-at-a-shot divide step will cause overflow and havoc.
	! The number of bits in the result here is N*ITER+SC, where SC <= N.
	! Compute ITER in an unorthodox manner: know we need to shift V into
	! the top decade: so do not even bother to compare to R.
1:
	cmp	%o5, %g1
	bgeu	3f
	mov	1, %g2
	sll	%o5, 4, %o5
	b	1b
	add	%o4, 1, %o4

	! Now compute %g2.
2:	addcc	%o5, %o5, %o5
	bcc	not_too_big
	add	%g2, 1, %g2

	! We get here if the %o1 overflowed while shifting.
	! This means that %o3 has the high-order bit set.
	! Restore %o5 and subtract from %o3.
	sll	%g1, 4, %g1	! high order bit
	srl	%o5, 1, %o5	! rest of %o5
	add	%o5, %g1, %o5
	b	do_single_div
	sub	%g2, 1, %g2

not_too_big:
3:	cmp	%o5, %o3
	blu	2b
	nop			! delay slot: keeps the next branch out of it
	be	do_single_div
	nop			! delay slot
	/* NB: these are commented out in the V8-SPARC manual as well */
	/* (I do not understand this) */
	! %o5 > %o3: went too far: back up 1 step
	!	srl	%o5, 1, %o5
	!	dec	%g2
	! do single-bit divide steps
	!
	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
	! first divide step without thinking.  BUT, the others are conditional,
	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
	! order bit set in the first step, just falling into the regular
	! division loop will mess up the first time around.
	! So we unroll slightly...
do_single_div:
	subcc	%g2, 1, %g2
	bl	end_regular_divide
	nop			! delay slot: the subtract below must not run when branching
	sub	%o3, %o5, %o3
	mov	1, %o2
	b	end_single_divloop
	nop			! delay slot: the shift at single_divloop must not run here
single_divloop:
	sll	%o2, 1, %o2
	bl	1f
	srl	%o5, 1, %o5
	! %o3 >= 0
	sub	%o3, %o5, %o3
	b	2f
	add	%o2, 1, %o2
1:	! %o3 < 0
	add	%o3, %o5, %o3
	sub	%o2, 1, %o2
2:
end_single_divloop:
	subcc	%g2, 1, %g2
	bge	single_divloop
	tst	%o3
	b,a	end_regular_divide

not_really_big:
1:
	sll	%o5, 4, %o5
	cmp	%o5, %o3
	bleu	1b
	addcc	%o4, 1, %o4
	be	got_result
	sub	%o4, 1, %o4

	tst	%o3	! set up for initial iteration
divloop:
	sll	%o2, 4, %o2
	! depth 1, accumulated bits 0
	bl	L1.16
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 2, accumulated bits 1
	bl	L2.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 3, accumulated bits 3
	bl	L3.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits 7
	bl	L4.23
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (7*2+1), %o2

L4.23:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (7*2-1), %o2


L3.19:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits 5
	bl	L4.21
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (5*2+1), %o2

L4.21:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (5*2-1), %o2

L2.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 3, accumulated bits 1
	bl	L3.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits 3
	bl	L4.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (3*2+1), %o2

L4.19:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (3*2-1), %o2

L3.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits 1
	bl	L4.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (1*2+1), %o2

L4.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (1*2-1), %o2

L1.16:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 2, accumulated bits -1
	bl	L2.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 3, accumulated bits -1
	bl	L3.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits -1
	bl	L4.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-1*2+1), %o2

L4.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-1*2-1), %o2

L3.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits -3
	bl	L4.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-3*2+1), %o2

L4.13:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-3*2-1), %o2

L2.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 3, accumulated bits -3
	bl	L3.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits -5
	bl	L4.11
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-5*2+1), %o2

L4.11:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-5*2-1), %o2

L3.13:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits -7
	bl	L4.9
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-7*2+1), %o2

L4.9:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-7*2-1), %o2

9:
end_regular_divide:
	subcc	%o4, 1, %o4
	bge	divloop
	tst	%o3
	bl,a	got_result
	! non-restoring fixup here (one instruction only!)
	sub	%o2, 1, %o2


got_result:
	! check to see if answer should be < 0
	tst	%g3
	bl,a	1f
	sub	%g0, %o2, %o2	! (annulled delay slot) negate only when %g3 < 0
1:
	retl
	mov	%o2, %o0	! (delay slot) quotient to return register
#endif
#ifdef L_modsi3
/* This implementation was taken from glibc:
 *
 * Input: dividend and divisor in %o0 and %o1 respectively.
 *
 * Algorithm parameters:
 *  N		how many bits per iteration we try to get (4)
 *  WORDSIZE	total number of bits (32)
 *
 * Derived constants:
 *  TOPBITS	number of bits in the top decade of a number
 *
 * Important variables:
 *  Q		the partial quotient under development (initially 0)
 *  R		the remainder so far, initially the dividend
 *  ITER	number of main division loop iterations required;
 *		equal to ceil(log2(quotient) / N).  Note that this
 *		is the log base (2^N) of the quotient.
 *  V		the current comparand, initially divisor*2^(ITER*N-1)
 *
 * Cost:
 *  Current estimate for non-large dividend is
 *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
 *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
 *  different path, as the upper bits of the quotient must be developed
 *  one bit at a time.
 */

/*
 * Entry points: .urem (unsigned) and .rem (signed).
 *
 * Register usage, as read from the code below:
 *  %o0	dividend on entry, remainder on return
 *  %o1	divisor on entry (negated in place by .rem when negative);
 *	also added back in the final non-restoring fixup
 *  %o2	Q, the quotient under development
 *  %o3	R, the running remainder
 *  %o4	count of 4-bit divloop passes still to run
 *  %o5	V, the scaled comparand
 *  %g1	the constant 1 << (32 - 4 - 1), later shifted to the high-order bit
 *  %g2	count of single-bit steps for the large-dividend path
 *  %g3	sign word: the remainder is negated on return when %g3 is negative
 *
 * A zero divisor raises trap 0x2 (ST_DIV0); if the trap returns, 0 is returned.
 */
	.text
	.align 4
	.global .urem
	.proc 4
.urem:
	b	divide
	mov	0, %g3		! (delay slot) result always positive

	.align 4
	.global .rem
	.proc 4
.rem:
	! compute sign of result; if neither is negative, no problem
	orcc	%o1, %o0, %g0	! either negative?
	bge	2f		! no, go do the divide
	mov	%o0, %g3	! (delay slot) sign of remainder matches %o0
	tst	%o1
	bge	1f
	tst	%o0		! (delay slot) cc now reflect %o0 on both paths
	! %o1 is definitely negative; %o0 might also be negative
	bge	2f		! if %o0 not negative...
	sub	%g0, %o1, %o1	! in any case, make %o1 nonneg
1:	! %o0 is negative, %o1 is nonnegative
	sub	%g0, %o0, %o0	! make %o0 nonnegative
2:

	! Ready to divide.  Compute size of quotient; scale comparand.
divide:
	orcc	%o1, %g0, %o5
	bne	1f
	mov	%o0, %o3

	! Divide by zero trap.  If it returns, return 0 (about as
	! wrong as possible, but that is what SunOS does...).
	ta	0x2		!ST_DIV0
	retl
	clr	%o0

1:
	cmp	%o3, %o5	! if %o1 exceeds %o0, done
	blu	got_result	! (and algorithm fails otherwise)
	clr	%o2
	sethi	%hi(1 << (32 - 4 - 1)), %g1
	cmp	%o3, %g1
	blu	not_really_big
	clr	%o4

	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
	! as our usual N-at-a-shot divide step will cause overflow and havoc.
	! The number of bits in the result here is N*ITER+SC, where SC <= N.
	! Compute ITER in an unorthodox manner: know we need to shift V into
	! the top decade: so do not even bother to compare to R.
1:
	cmp	%o5, %g1
	bgeu	3f
	mov	1, %g2
	sll	%o5, 4, %o5
	b	1b
	add	%o4, 1, %o4

	! Now compute %g2.
2:	addcc	%o5, %o5, %o5
	bcc	not_too_big
	add	%g2, 1, %g2

	! We get here if the %o1 overflowed while shifting.
	! This means that %o3 has the high-order bit set.
	! Restore %o5 and subtract from %o3.
	sll	%g1, 4, %g1	! high order bit
	srl	%o5, 1, %o5	! rest of %o5
	add	%o5, %g1, %o5
	b	do_single_div
	sub	%g2, 1, %g2

not_too_big:
3:	cmp	%o5, %o3
	blu	2b
	nop			! delay slot: keeps the next branch out of it
	be	do_single_div
	nop			! delay slot
	/* NB: these are commented out in the V8-SPARC manual as well */
	/* (I do not understand this) */
	! %o5 > %o3: went too far: back up 1 step
	!	srl	%o5, 1, %o5
	!	dec	%g2
	! do single-bit divide steps
	!
	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
	! first divide step without thinking.  BUT, the others are conditional,
	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
	! order bit set in the first step, just falling into the regular
	! division loop will mess up the first time around.
	! So we unroll slightly...
do_single_div:
	subcc	%g2, 1, %g2
	bl	end_regular_divide
	nop			! delay slot: the subtract below must not run when branching
	sub	%o3, %o5, %o3
	mov	1, %o2
	b	end_single_divloop
	nop			! delay slot: the shift at single_divloop must not run here
single_divloop:
	sll	%o2, 1, %o2
	bl	1f
	srl	%o5, 1, %o5
	! %o3 >= 0
	sub	%o3, %o5, %o3
	b	2f
	add	%o2, 1, %o2
1:	! %o3 < 0
	add	%o3, %o5, %o3
	sub	%o2, 1, %o2
2:
end_single_divloop:
	subcc	%g2, 1, %g2
	bge	single_divloop
	tst	%o3
	b,a	end_regular_divide

not_really_big:
1:
	sll	%o5, 4, %o5
	cmp	%o5, %o3
	bleu	1b
	addcc	%o4, 1, %o4
	be	got_result
	sub	%o4, 1, %o4

	tst	%o3	! set up for initial iteration
divloop:
	sll	%o2, 4, %o2
	! depth 1, accumulated bits 0
	bl	L1.16
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 2, accumulated bits 1
	bl	L2.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 3, accumulated bits 3
	bl	L3.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits 7
	bl	L4.23
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (7*2+1), %o2
L4.23:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (7*2-1), %o2

L3.19:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits 5
	bl	L4.21
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (5*2+1), %o2

L4.21:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (5*2-1), %o2

L2.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 3, accumulated bits 1
	bl	L3.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits 3
	bl	L4.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (3*2+1), %o2

L4.19:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (3*2-1), %o2

L3.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits 1
	bl	L4.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (1*2+1), %o2

L4.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (1*2-1), %o2

L1.16:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 2, accumulated bits -1
	bl	L2.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 3, accumulated bits -1
	bl	L3.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits -1
	bl	L4.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-1*2+1), %o2

L4.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-1*2-1), %o2

L3.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits -3
	bl	L4.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-3*2+1), %o2

L4.13:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-3*2-1), %o2

L2.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 3, accumulated bits -3
	bl	L3.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	! depth 4, accumulated bits -5
	bl	L4.11
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-5*2+1), %o2

L4.11:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-5*2-1), %o2

L3.13:
	! remainder is negative
	addcc	%o3,%o5,%o3
	! depth 4, accumulated bits -7
	bl	L4.9
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-7*2+1), %o2

L4.9:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-7*2-1), %o2

9:
end_regular_divide:
	subcc	%o4, 1, %o4
	bge	divloop
	tst	%o3
	bl,a	got_result
	! non-restoring fixup here (one instruction only!)
	add	%o3, %o1, %o3

got_result:
	! check to see if answer should be < 0
	tst	%g3
	bl,a	1f
	sub	%g0, %o3, %o3	! (annulled delay slot) negate only when %g3 < 0
1:
	retl
	mov	%o3, %o0	! (delay slot) remainder to return register

#endif