// Merge from the pain train
// [official-gcc.git] / gcc / config / ia64 / lib1funcs.asm
// blob 68ee421ff6575757e1dca556cb0f727b1c53dbfc
#ifdef L__divxf3
// Compute a 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.

	.text
	.align 16
	.global __divxf3
	.global __divtf3
	.proc __divxf3
__divxf3:
__divtf3:
	// p7 starts true; it is cleared below whenever frcpa sets p6.
	// Afterwards p6 = "refine the reciprocal approximation" and
	// p7 = "frcpa already produced the final result (special operands)".
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// Newton-Raphson refinement in status field s1 (non-trapping);
	// only the final fma uses s0 so IEEE rounding/flags are correct.
(p6)	fnma.s1 f11 = farg1, f10, f1
(p6)	fma.s1 f12 = farg0, f10, f0
	;;
(p6)	fma.s1 f13 = f11, f11, f0
(p6)	fma.s1 f14 = f11, f11, f11
	;;
(p6)	fma.s1 f11 = f13, f13, f11
(p6)	fma.s1 f13 = f14, f10, f10
	;;
(p6)	fma.s1 f10 = f13, f11, f10
(p6)	fnma.s1 f11 = farg1, f12, farg0
	;;
(p6)	fma.s1 f11 = f11, f10, f12
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fnma.s1 f12 = farg1, f11, farg0
	;;
(p6)	fma.s0 fret0 = f12, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	.endp __divxf3
#endif
#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
__divdf3:
	// p6 = "refine frcpa's approximation"; p7 = "frcpa handled a
	// special case itself".  The two are made mutually exclusive below.
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// Newton-Raphson: f10 = reciprocal estimate, f11 = quotient
	// estimate, f12 = 1 - b*y error term; refined in status field s1,
	// final rounding to double in the user-visible field.
(p6)	fmpy.s1 f11 = farg0, f10
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fmpy.s1 f13 = f12, f12
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f13, f11, f11
	;;
(p6)	fmpy.s1 f12 = f13, f13
(p6)	fma.s1 f10 = f13, f10, f10
	;;
(p6)	fma.d.s1 f11 = f12, f11, f11
(p6)	fma.s1 f10 = f12, f10, f10
	;;
(p6)	fnma.d.s1 f8 = farg1, f11, farg0
	;;
(p6)	fma.d fret0 = f8, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdf3
#endif
#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
__divsf3:
	// p6 = "refine frcpa's approximation"; p7 = "frcpa handled a
	// special case itself".
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// f8 = quotient estimate, f9 = 1 - b*y error term, squared twice
	// to apply the correction series; fnorm.s.s0 performs the final
	// rounding to single in the user-visible status field.
(p6)	fmpy.s1 f8 = farg0, f10
(p6)	fnma.s1 f9 = farg1, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.d.s1 f10 = f9, f8, f8
	;;
(p6)	fnorm.s.s0 fret0 = f10
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsf3
#endif
#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
__divdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations (enough, per the Intel algorithm,
	// for the truncated quotient of two 64-bit integers to be exact).
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdi3
#endif
#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
__moddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.  The original dividend is kept
	// in f14 for the final  r = q * (-b) + a  fixup.
	setf.sig f14 = in0
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	// Negate the divisor on the integer side while the FP pipe works.
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __moddi3
#endif
#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
__udivdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivdi3
#endif
#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
__umoddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.  The original dividend is kept
	// in f14 for the final  r = q * (-b) + a  fixup.
	setf.sig f14 = in0
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, to avoid FP software assist faults.
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	// Negate the divisor on the integer side while the FP pipe works.
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umoddi3
#endif
#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
__divsi3:
	.regstk 2,0,0,0
	// Sign-extend the 32-bit inputs to 64 bits.
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// 0x0ffdd is the register-format exponent of 2**-34 (bias 0xffff),
	// so setf.exp makes f11 = 2**-34.  Adding it below nudges the
	// single Newton-Raphson step so that truncation gives the exact
	// 32-bit quotient (per the Intel algorithm).
	mov r2 = 0x0ffdd
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsi3
#endif
#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
__modsi3:
	.regstk 2,0,0,0
	// f11 = 2**-34 correction constant; see __divsi3.
	mov r2 = 0x0ffdd
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	// Use the in0/in1 aliases (identical to r32/r33 after .regstk)
	// for consistency with the other routines.  The dividend stays in
	// f13 for the final  r = q * (-b) + a  fixup.
	setf.sig f13 = in0
	setf.sig f9 = in1
	;;
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __modsi3
#endif
#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
__udivsi3:
	.regstk 2,0,0,0
	// f11 = 2**-34 correction constant; see __divsi3.
	mov r2 = 0x0ffdd
	// Zero-extend the 32-bit inputs; the signed fcvt.xf below is then
	// safe because the values fit in 63 bits.
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivsi3
#endif
#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
__umodsi3:
	.regstk 2,0,0,0
	// f11 = 2**-34 correction constant; see __divsi3.
	mov r2 = 0x0ffdd
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	// The dividend stays in f13 for the final  r = q * (-b) + a  fixup.
	setf.sig f13 = in0
	setf.sig f9 = in1
	;;
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umodsi3
#endif
#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
//
// Save-area layout written here:
//   [in0+0]  stack pointer (in1)
//   [in0+8]  ar.bsp (after flushrs)
//   [in0+16] ar.rnat
//   [in0+24] ar.pfs

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	{ .mmf
	  alloc r18 = ar.pfs, 2, 0, 0, 0
	  mov r19 = ar.rsc
	  ;;
	}
	// Flush dirty registers to the backing store and clear the RSC
	// mode bits (enforced lazy) so that ar.rnat can be read safely.
	{ .mmi
	  flushrs
	  st8 [in0] = in1, 24
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmi
	  st8 [in0] = r18, -16
	  mov ar.rsc = r19
	  or r19 = 0x3, r19	// mode bits = 3 (eager) for the restore below
	  ;;
	}
	{ .mmi
	  mov r16 = ar.bsp
	  mov r17 = ar.rnat
	  adds r2 = 8, in0
	  ;;
	}
	{ .mmi
	  st8 [in0] = r16
	  st8 [r2] = r17
	}
	// Restore the RSE mode and return.
	{ .mib
	  mov ar.rsc = r19
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_save_stack_nonlocal
#endif
#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	{ .mmi
	  alloc r20 = ar.pfs, 3, 0, 0, 0
	  ld8 r12 = [in1], 8		// restore stack pointer
	  mov.ret.sptk rp = in0, .L0	// "return" will branch to the label
	  ;;
	}
	{ .mmf
	  ld8 r16 = [in1], 8		// saved ar.bsp
	  mov r19 = ar.rsc
	  ;;
	}
	// Flush dirty registers and clear the RSC mode bits (enforced
	// lazy) before writing ar.bspstore/ar.rnat.
	{ .mmi
	  flushrs
	  ld8 r17 = [in1], 8		// saved ar.rnat
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmi
	  ld8 r18 = [in1]		// saved ar.pfs
	  mov ar.rsc = r19
	  or r19 = 0x3, r19		// mode bits = 3 for the final restore
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  ;;
	}
	{ .mmi
	  loadrs
	  invala
	  mov r15 = in2		// pass the static chain in r15
	  ;;
	}
.L0:	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_nonlocal_goto
#endif
#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)

	.text
	.align 16
	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
	{ .mmf
	  alloc r20 = ar.pfs, 4, 0, 0, 0
	  ld8 r12 = [in0], 8		// restore stack pointer
	  ;;
	}
	{ .mmb
	  ld8 r16 = [in0], 8		// saved ar.bsp
	  mov r19 = ar.rsc
	  ;;
	}
	// Flush dirty registers and clear the RSC mode bits (enforced
	// lazy) before writing ar.bspstore/ar.rnat.
	{ .mmi
	  flushrs
	  ld8 r17 = [in0], 8		// saved ar.rnat
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmf
	  ld8 r18 = [in0]		// saved ar.pfs
	  mov ar.rsc = r19
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  or r19 = 0x3, r19		// mode bits = 3 for the final restore
	  ;;
	}
	{ .mmf
	  loadrs
	  invala
	  ;;
	}
.L0:	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_restore_stack_nonlocal
#endif
#ifdef L__trampoline
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//		+-------------------+ >
//	TRAMP:	| __ia64_trampoline | |
//		+-------------------+ > fake function descriptor
//		| TRAMP+16	    | |
//		+-------------------+ >
//		| target descriptor |
//		+-------------------+
//		| static link	    |
//		+-------------------+

	.text
	.align 16
	.global __ia64_trampoline
	.proc __ia64_trampoline
__ia64_trampoline:
	// On entry r1 (the "gp" from the fake descriptor) = TRAMP+16,
	// i.e. the address of the target-descriptor slot above.
	{ .mmi
	  ld8 r2 = [r1], 8	// r2 = target's function descriptor
	  ;;
	  ld8 r15 = [r1]	// r15 = static link
	}
	{ .mmi
	  ld8 r3 = [r2], 8	// r3 = target entry point
	  ;;
	  ld8 r1 = [r2]	// r1 = target's gp
	  mov b6 = r3
	}
	{ .bbb
	  br.sptk.many b6
	  ;;
	}
	.endp __ia64_trampoline
#endif
// Thunks for backward compatibility.
#ifdef L_fixtfdi
	.text
	.align 16
	.global __fixtfti
	.proc __fixtfti
__fixtfti:
	{ .bbb
	  br.sptk.many __fixxfti
	  ;;
	}
	.endp __fixtfti
#endif
#ifdef L_fixunstfdi
	.align 16
	.global __fixunstfti
	.proc __fixunstfti
__fixunstfti:
	{ .bbb
	  br.sptk.many __fixunsxfti
	  ;;
	}
	.endp __fixunstfti
#endif
// Use #ifdef like the sibling thunks above: with plain "#if" the
// section is assembled only when L_floatditf expands to a nonzero
// value, not merely when it is defined.
#ifdef L_floatditf
	.align 16
	.global __floattitf
	.proc __floattitf
__floattitf:
	{ .bbb
	  br.sptk.many __floattixf
	  ;;
	}
	.endp __floattitf
#endif