// Merged revisions 143552,143554,143557,143560,143562,143564-143567,143570-143573,14357...
// [official-gcc.git] / gcc / config / ia64 / lib1funcs.asm
// blob e29afca9e83cf617ce54568bf5dfd8c91dae7b8a
/* Copyright (C) 2000, 2001, 2003, 2005, 2009 Free Software Foundation, Inc.
   Contributed by James E. Wilson <wilson@cygnus.com>.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
   Boston, MA 02110-1301, USA.  */

/* As a special exception, if you link this library with other files,
   some of which are compiled with GCC, to produce an executable,
   this library does not by itself cause the resulting executable
   to be covered by the GNU General Public License.
   This exception does not however invalidate any other reasons why
   the executable file might be covered by the GNU General Public License.  */
#ifdef L__divxf3
// Compute an 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// In:   farg0 = dividend (a), farg1 = divisor (b).
// Out:  fret0 = a / b.
// Uses: f10-f14, p6, p7 as scratch.
//
// frcpa delivers an initial reciprocal approximation and sets p6 when
// the Newton-Raphson refinement below must run; when frcpa handles the
// operands itself (IEEE special cases such as zero, infinity, NaN),
// p6 is clear and the p7 path returns frcpa's result unchanged.
//
// __divtf3 is an alternate symbol name for backward compatibility.

	.text
	.align 16
	.global __divxf3
	.proc __divxf3
__divxf3:
#ifdef SHARED
	.global __divtf3
__divtf3:
#endif
	cmp.eq p7, p0 = r0, r0			// p7 = 1: assume special-case path
	frcpa.s0 f10, p6 = farg0, farg1		// f10 ~= 1/b; p6 = need refinement
	;;
(p6)	cmp.ne p7, p0 = r0, r0			// refinement path: clear p7
	.pred.rel.mutex p6, p7
(p6)	fnma.s1 f11 = farg1, f10, f1		// f11 = 1 - b*f10   (error term e)
(p6)	fma.s1 f12 = farg0, f10, f0		// f12 = a*f10       (initial quotient)
	;;
(p6)	fma.s1 f13 = f11, f11, f0		// f13 = e^2
(p6)	fma.s1 f14 = f11, f11, f11		// f14 = e^2 + e
	;;
(p6)	fma.s1 f11 = f13, f13, f11		// f11 = e^4 + e
(p6)	fma.s1 f13 = f14, f10, f10		// f13 = f14*f10 + f10  (refined 1/b)
	;;
(p6)	fma.s1 f10 = f13, f11, f10		// f10 = f13*f11 + f10  (refined 1/b)
(p6)	fnma.s1 f11 = farg1, f12, farg0		// f11 = a - b*f12      (remainder)
	;;
(p6)	fma.s1 f11 = f11, f10, f12		// f11 = f11*f10 + f12  (refined quotient)
(p6)	fnma.s1 f12 = farg1, f10, f1		// f12 = 1 - b*f10
	;;
(p6)	fma.s1 f10 = f12, f10, f10		// f10 = f12*f10 + f10  (final 1/b)
(p6)	fnma.s1 f12 = farg1, f11, farg0		// f12 = a - b*f11      (final remainder)
	;;
(p6)	fma.s0 fret0 = f12, f10, f11		// fret0 = f12*f10 + f11, .s0 = user rounding
(p7)	mov fret0 = f10				// special case: frcpa's result is final
	br.ret.sptk rp
	.endp __divxf3
#endif
#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// In:   farg0 = dividend (a), farg1 = divisor (b).
// Out:  fret0 = a / b, rounded to double.
// Uses: f8, f10-f13, p6, p7 as scratch.
//
// Same structure as __divxf3: frcpa seeds the reciprocal and sets p6
// when the refinement must run; otherwise p7 returns frcpa's result.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
__divdf3:
	cmp.eq p7, p0 = r0, r0			// p7 = 1: assume special-case path
	frcpa.s0 f10, p6 = farg0, farg1		// f10 ~= 1/b; p6 = need refinement
	;;
(p6)	cmp.ne p7, p0 = r0, r0			// refinement path: clear p7
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f11 = farg0, f10		// f11 = a*f10       (initial quotient)
(p6)	fnma.s1 f12 = farg1, f10, f1		// f12 = 1 - b*f10   (error term e)
	;;
(p6)	fma.s1 f11 = f12, f11, f11		// quotient += e*quotient
(p6)	fmpy.s1 f13 = f12, f12			// f13 = e^2
	;;
(p6)	fma.s1 f10 = f12, f10, f10		// recip += e*recip
(p6)	fma.s1 f11 = f13, f11, f11		// quotient += e^2*quotient
	;;
(p6)	fmpy.s1 f12 = f13, f13			// f12 = e^4
(p6)	fma.s1 f10 = f13, f10, f10		// recip += e^2*recip
	;;
(p6)	fma.d.s1 f11 = f12, f11, f11		// quotient += e^4*quotient, round to double
(p6)	fma.s1 f10 = f12, f10, f10		// recip += e^4*recip
	;;
(p6)	fnma.d.s1 f8 = farg1, f11, farg0	// f8 = a - b*f11    (remainder)
	;;
(p6)	fma.d fret0 = f8, f10, f11		// fret0 = f8*f10 + f11, user rounding
(p7)	mov fret0 = f10				// special case: frcpa's result is final
	br.ret.sptk rp
	;;
	.endp __divdf3
#endif
#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// In:   farg0 = dividend (a), farg1 = divisor (b).
// Out:  fret0 = a / b, rounded to single.
// Uses: f8-f10, p6, p7 as scratch.
//
// Single precision needs fewer Newton-Raphson steps than the double
// and extended variants; the final fnorm rounds to single precision.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
__divsf3:
	cmp.eq p7, p0 = r0, r0			// p7 = 1: assume special-case path
	frcpa.s0 f10, p6 = farg0, farg1		// f10 ~= 1/b; p6 = need refinement
	;;
(p6)	cmp.ne p7, p0 = r0, r0			// refinement path: clear p7
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f8 = farg0, f10			// f8 = a*f10        (initial quotient)
(p6)	fnma.s1 f9 = farg1, f10, f1		// f9 = 1 - b*f10    (error term e)
	;;
(p6)	fma.s1 f8 = f9, f8, f8			// quotient += e*quotient
(p6)	fmpy.s1 f9 = f9, f9			// f9 = e^2
	;;
(p6)	fma.s1 f8 = f9, f8, f8			// quotient += e^2*quotient
(p6)	fmpy.s1 f9 = f9, f9			// f9 = e^4
	;;
(p6)	fma.d.s1 f10 = f9, f8, f8		// f10 = quotient + e^4*quotient
	;;
(p6)	fnorm.s.s0 fret0 = f10			// round to single in user rounding mode
(p7)	mov fret0 = f10				// special case: frcpa's result is final
	br.ret.sptk rp
	;;
	.endp __divsf3
#endif
#ifdef L__divdi3
// Compute a 64-bit signed integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// In:   in0 = dividend, in1 = divisor.
// Out:  ret0 = in0 / in1 (truncated toward zero).
// Uses: f8-f13, p6, p7 as scratch.  Traps via break on divisor == 0.

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
__divdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	// break immediate 1 is the IA-64 software-convention code for
	// integer divide by zero.
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1		// f11 = 1 - b*recip  (error e)
(p6)	fmpy.s1 f12 = f8, f10			// f12 = a*recip      (quotient)
	;;
(p6)	fmpy.s1 f13 = f11, f11			// f13 = e^2
(p6)	fma.s1 f12 = f11, f12, f12		// quotient += e*quotient
	;;
(p6)	fma.s1 f10 = f11, f10, f10		// recip += e*recip
(p6)	fma.s1 f11 = f13, f12, f12		// quotient += e^2*quotient
	;;
(p6)	fma.s1 f10 = f13, f10, f10		// recip += e^2*recip
(p6)	fnma.s1 f12 = f9, f11, f8		// f12 = a - b*quotient (remainder)
	;;
(p6)	fma.s1 f10 = f12, f10, f11		// quotient += remainder*recip
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdi3
#endif
#ifdef L__moddi3
// Compute a 64-bit signed integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// In:   in0 = dividend (a), in1 = divisor (b).
// Out:  ret0 = a % b, computed as r = trunc(a/b) * (-b) + a.
// Uses: f9-f14, p6, p7 as scratch.  Traps via break on divisor == 0.
// The original dividend is kept in f14 for the final xma.

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
__moddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
	// break immediate 1: integer divide-by-zero trap.
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10			// f12 = a*recip      (quotient)
(p6)	fnma.s1 f11 = f9, f10, f1		// f11 = 1 - b*recip  (error e)
	;;
(p6)	fma.s1 f12 = f11, f12, f12		// quotient += e*quotient
(p6)	fmpy.s1 f13 = f11, f11			// f13 = e^2
	;;
(p6)	fma.s1 f10 = f11, f10, f10		// recip += e*recip
(p6)	fma.s1 f11 = f13, f12, f12		// quotient += e^2*quotient
	;;
	sub in1 = r0, in1			// in1 = -b, for the final multiply-add
(p6)	fma.s1 f10 = f13, f10, f10		// recip += e^2*recip
(p6)	fnma.s1 f12 = f9, f11, f8		// f12 = a - b*quotient (remainder)
	;;
	setf.sig f9 = in1			// f9 = -b as a 64-bit significand
(p6)	fma.s1 f10 = f12, f10, f11		// quotient += remainder*recip
	;;
	fcvt.fx.trunc.s1 f10 = f10		// truncate quotient to integer
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __moddi3
#endif
#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// In:   in0 = dividend, in1 = divisor.
// Out:  ret0 = in0 / in1.
// Uses: f8-f13, p6, p7 as scratch.  Traps via break on divisor == 0.
// Same structure as __divdi3, but with unsigned conversions.

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
__udivdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
	// break immediate 1: integer divide-by-zero trap.
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1		// f11 = 1 - b*recip  (error e)
(p6)	fmpy.s1 f12 = f8, f10			// f12 = a*recip      (quotient)
	;;
(p6)	fmpy.s1 f13 = f11, f11			// f13 = e^2
(p6)	fma.s1 f12 = f11, f12, f12		// quotient += e*quotient
	;;
(p6)	fma.s1 f10 = f11, f10, f10		// recip += e*recip
(p6)	fma.s1 f11 = f13, f12, f12		// quotient += e^2*quotient
	;;
(p6)	fma.s1 f10 = f13, f10, f10		// recip += e^2*recip
(p6)	fnma.s1 f12 = f9, f11, f8		// f12 = a - b*quotient (remainder)
	;;
(p6)	fma.s1 f10 = f12, f10, f11		// quotient += remainder*recip
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivdi3
#endif
#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// In:   in0 = dividend (a), in1 = divisor (b).
// Out:  ret0 = a % b, computed as r = trunc(a/b) * (-b) + a.
// Uses: f9-f14, p6, p7 as scratch.  Traps via break on divisor == 0.
// The original dividend is kept in f14 for the final xma.

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
__umoddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software assist faults.
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
	// break immediate 1: integer divide-by-zero trap.
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10			// f12 = a*recip      (quotient)
(p6)	fnma.s1 f11 = f9, f10, f1		// f11 = 1 - b*recip  (error e)
	;;
(p6)	fma.s1 f12 = f11, f12, f12		// quotient += e*quotient
(p6)	fmpy.s1 f13 = f11, f11			// f13 = e^2
	;;
(p6)	fma.s1 f10 = f11, f10, f10		// recip += e*recip
(p6)	fma.s1 f11 = f13, f12, f12		// quotient += e^2*quotient
	;;
	sub in1 = r0, in1			// in1 = -b, for the final multiply-add
(p6)	fma.s1 f10 = f13, f10, f10		// recip += e^2*recip
(p6)	fnma.s1 f12 = f9, f11, f8		// f12 = a - b*quotient (remainder)
	;;
	setf.sig f9 = in1			// f9 = -b as a 64-bit significand
(p6)	fma.s1 f10 = f12, f10, f11		// quotient += remainder*recip
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umoddi3
#endif
#ifdef L__divsi3
// Compute a 32-bit signed integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// In:   in0 = dividend, in1 = divisor (each used as sign-extended 32-bit).
// Out:  ret0 = in0 / in1 (truncated toward zero).
// Uses: r2, f8-f11, p6, p7 as scratch.  Traps via break on divisor == 0.
//
// 32-bit operands need only one refinement step; f11 = 2^-34
// (setf.exp with biased exponent 0x0ffdd) is added to the error term
// so the single step still truncates to the correct 32-bit quotient.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
__divsi3:
	.regstk 2,0,0,0
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	// break immediate 1: integer divide-by-zero trap.
(p7)	break 1
	;;
	mov r2 = 0x0ffdd			// biased exponent of 2^-34
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2			// f11 = 2^-34
	frcpa.s1 f10, p6 = f8, f9		// f10 ~= 1/b
	;;
(p6)	fmpy.s1 f8 = f8, f10			// f8 = a*recip       (quotient)
(p6)	fnma.s1 f9 = f9, f10, f1		// f9 = 1 - b*recip   (error e)
	;;
(p6)	fma.s1 f8 = f9, f8, f8			// quotient += e*quotient
(p6)	fma.s1 f9 = f9, f9, f11			// f9 = e^2 + 2^-34
	;;
(p6)	fma.s1 f10 = f9, f8, f8			// final quotient approximation
	;;
	fcvt.fx.trunc.s1 f10 = f10		// truncate to integer
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsi3
#endif
#ifdef L__modsi3
// Compute a 32-bit signed integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// In:   in0 = dividend (a), in1 = divisor (b), sign-extended to 64 bits.
// Out:  ret0 = a % b, computed as r = trunc(a/b) * (-b) + a.
// Uses: r2, f8-f13, p6, p7 as scratch.  Traps via break on divisor == 0.
// f13 keeps the original dividend for the final xma; f11 = 2^-34
// compensates the single refinement step (see __divsi3).

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
__modsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd			// biased exponent of 2^-34
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f13 = r32			// r32 == in0: keep dividend
	setf.sig f9 = r33			// r33 == in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	sub in1 = r0, in1			// in1 = -b, for the final multiply-add
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2			// f11 = 2^-34
	frcpa.s1 f10, p6 = f8, f9		// f10 ~= 1/b
	// break immediate 1: integer divide-by-zero trap.
(p7)	break 1
	;;
(p6)	fmpy.s1 f12 = f8, f10			// f12 = a*recip      (quotient)
(p6)	fnma.s1 f10 = f9, f10, f1		// f10 = 1 - b*recip  (error e)
	;;
	setf.sig f9 = in1			// f9 = -b as a 64-bit significand
(p6)	fma.s1 f12 = f10, f12, f12		// quotient += e*quotient
(p6)	fma.s1 f10 = f10, f10, f11		// f10 = e^2 + 2^-34
	;;
(p6)	fma.s1 f10 = f10, f12, f12		// final quotient approximation
	;;
	fcvt.fx.trunc.s1 f10 = f10		// truncate to integer
	;;
	xma.l f10 = f10, f9, f13		// r = q * (-b) + a
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __modsi3
#endif
#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// In:   in0 = dividend, in1 = divisor (each used as zero-extended 32-bit).
// Out:  ret0 = in0 / in1.
// Uses: r2, f8-f11, p6, p7 as scratch.  Traps via break on divisor == 0.
// The zero-extended values fit in the signed convert (fcvt.xf);
// f11 = 2^-34 compensates the single refinement step (see __divsi3).

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
__udivsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd			// biased exponent of 2^-34
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	// break immediate 1: integer divide-by-zero trap.
(p7)	break 1
	;;
	setf.exp f11 = r2			// f11 = 2^-34
	frcpa.s1 f10, p6 = f8, f9		// f10 ~= 1/b
	;;
(p6)	fmpy.s1 f8 = f8, f10			// f8 = a*recip       (quotient)
(p6)	fnma.s1 f9 = f9, f10, f1		// f9 = 1 - b*recip   (error e)
	;;
(p6)	fma.s1 f8 = f9, f8, f8			// quotient += e*quotient
(p6)	fma.s1 f9 = f9, f9, f11			// f9 = e^2 + 2^-34
	;;
(p6)	fma.s1 f10 = f9, f8, f8			// final quotient approximation
	;;
	fcvt.fxu.trunc.s1 f10 = f10		// truncate to unsigned integer
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivsi3
#endif
#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// In:   in0 = dividend (a), in1 = divisor (b), zero-extended to 64 bits.
// Out:  ret0 = a % b, computed as r = trunc(a/b) * (-b) + a.
// Uses: r2, f8-f13, p6, p7 as scratch.  Traps via break on divisor == 0.
// f13 keeps the original dividend for the final xma; f11 = 2^-34
// compensates the single refinement step (see __divsi3).

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
__umodsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd			// biased exponent of 2^-34
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f13 = in0			// keep dividend for final xma
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	sub in1 = r0, in1			// in1 = -b, for the final multiply-add
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2			// f11 = 2^-34
	frcpa.s1 f10, p6 = f8, f9		// f10 ~= 1/b
	// break immediate 1: integer divide-by-zero trap.
(p7)	break 1
	;;
(p6)	fmpy.s1 f12 = f8, f10			// f12 = a*recip      (quotient)
(p6)	fnma.s1 f10 = f9, f10, f1		// f10 = 1 - b*recip  (error e)
	;;
	setf.sig f9 = in1			// f9 = -b as a 64-bit significand
(p6)	fma.s1 f12 = f10, f12, f12		// quotient += e*quotient
(p6)	fma.s1 f10 = f10, f10, f11		// f10 = e^2 + 2^-34
	;;
(p6)	fma.s1 f10 = f10, f12, f12		// final quotient approximation
	;;
	fcvt.fxu.trunc.s1 f10 = f10		// truncate to unsigned integer
	;;
	xma.l f10 = f10, f9, f13		// r = q * (-b) + a
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umodsi3
#endif
#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".
//
// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
//
// Save-area layout written here (and read by the restore/goto routines):
//   [0]  stack pointer (in1)
//   [8]  ar.bsp (RSE backing store pointer, after flushrs)
//   [16] ar.rnat
//   [24] ar.pfs
//
// ar.rsc handling: "and 0x1c" clears the mode bits (bits 0-1), putting
// the RSE in enforced-lazy mode, which is required before reading
// ar.rnat consistently; "or 0x3" restores eager mode on exit.

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	{ .mmf
	  alloc r18 = ar.pfs, 2, 0, 0, 0
	  mov r19 = ar.rsc		// r19 = current RSE configuration
	  ;;
	}
	{ .mmi
	  flushrs			// force dirty regs to backing store; ar.bsp==ar.bspstore
	  st8 [in0] = in1, 24		// save[0] = stack pointer; in0 -> save+24
	  and r19 = 0x1c, r19		// clear rsc.mode -> enforced lazy
	  ;;
	}
	{ .mmi
	  st8 [in0] = r18, -16		// save[24] = ar.pfs; in0 -> save+8
	  mov ar.rsc = r19		// switch RSE to enforced-lazy mode
	  or r19 = 0x3, r19		// prepare eager-mode value for exit
	  ;;
	}
	{ .mmi
	  mov r16 = ar.bsp		// backing store pointer to save
	  mov r17 = ar.rnat		// NaT collection to save
	  adds r2 = 8, in0		// r2 -> save+16
	  ;;
	}
	{ .mmi
	  st8 [in0] = r16		// save[8]  = ar.bsp
	  st8 [r2] = r17		// save[16] = ar.rnat
	}
	{ .mib
	  mov ar.rsc = r19		// restore eager RSE mode
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_save_stack_nonlocal
#endif
#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);
//
// Jump to target_label, restoring the register-stack / memory-stack
// state recorded by __ia64_save_stack_nonlocal from save_area:
//   [0] sp, [8] ar.bsp, [16] ar.rnat, [24] ar.pfs.
// The static chain is delivered in r15.  Does not return.

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	{ .mmi
	  alloc r20 = ar.pfs, 3, 0, 0, 0
	  ld8 r12 = [in1], 8		// restore stack pointer
	  mov.ret.sptk rp = in0, .L0	// return will branch to target_label
	  ;;
	}
	{ .mmf
	  ld8 r16 = [in1], 8		// saved ar.bsp
	  mov r19 = ar.rsc		// current RSE configuration
	  ;;
	}
	{ .mmi
	  flushrs			// flush dirty regs before rewriting bspstore
	  ld8 r17 = [in1], 8		// saved ar.rnat
	  and r19 = 0x1c, r19		// clear rsc.mode (and loadrs) -> enforced lazy
	  ;;
	}
	{ .mmi
	  ld8 r18 = [in1]		// saved ar.pfs
	  mov ar.rsc = r19		// enforced-lazy mode: bspstore/rnat writable
	  or r19 = 0x3, r19		// prepare eager-mode value for exit
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16	// restore backing store pointer
	  ;;
	  mov ar.rnat = r17		// restore NaT collection
	  ;;
	}
	{ .mmi
	  loadrs			// rsc.loadrs is 0: invalidate stale stacked regs
	  invala			// invalidate ALAT
	  mov r15 = in2		// static chain register
	  ;;
	}
.L0:	{ .mib
	  mov ar.rsc = r19		// back to eager RSE mode
	  mov ar.pfs = r18
	  br.ret.sptk.few rp		// "return" to target_label
	  ;;
	}
	.endp __ia64_nonlocal_goto
#endif
#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.
//
// void __ia64_restore_stack_nonlocal(void *save_area)
//
// Restores sp, ar.bspstore, ar.rnat and ar.pfs from the save area
// written by __ia64_save_stack_nonlocal:
//   [0] sp, [8] ar.bsp, [16] ar.rnat, [24] ar.pfs.

	.text
	.align 16
	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
	{ .mmf
	  alloc r20 = ar.pfs, 4, 0, 0, 0
	  ld8 r12 = [in0], 8		// restore stack pointer
	  ;;
	}
	{ .mmb
	  ld8 r16 = [in0], 8		// saved ar.bsp
	  mov r19 = ar.rsc		// current RSE configuration
	  ;;
	}
	{ .mmi
	  flushrs			// flush dirty regs before rewriting bspstore
	  ld8 r17 = [in0], 8		// saved ar.rnat
	  and r19 = 0x1c, r19		// clear rsc.mode (and loadrs) -> enforced lazy
	  ;;
	}
	{ .mmf
	  ld8 r18 = [in0]		// saved ar.pfs
	  mov ar.rsc = r19		// enforced-lazy mode: bspstore/rnat writable
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16	// restore backing store pointer
	  ;;
	  mov ar.rnat = r17		// restore NaT collection
	  or r19 = 0x3, r19		// prepare eager-mode value for exit
	  ;;
	}
	{ .mmf
	  loadrs			// rsc.loadrs is 0: invalidate stale stacked regs
	  invala			// invalidate ALAT
	  ;;
	}
.L0:	{ .mib
	  mov ar.rsc = r19		// back to eager RSE mode
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_restore_stack_nonlocal
#endif
#ifdef L__trampoline
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//		+-------------------+ >
//	TRAMP:	| __ia64_trampoline | |
//		+-------------------+  > fake function descriptor
//		| TRAMP+16	    | |
//		+-------------------+ >
//		| target descriptor |
//		+-------------------+
//		| static link	    |
//		+-------------------+
//
// On entry, the fake descriptor has loaded r1 (gp) with TRAMP+16,
// i.e. the address of the target-descriptor slot.  We load the real
// target's entry point and gp from its descriptor, put the static
// link in r15, and tail-jump to the target.

	.text
	.align 16
	.global __ia64_trampoline
	.proc __ia64_trampoline
__ia64_trampoline:
	{ .mmi
	  ld8 r2 = [r1], 8		// r2 = address of target descriptor
	  ;;
	  ld8 r15 = [r1]		// r15 = static link
	}
	{ .mmi
	  ld8 r3 = [r2], 8		// r3 = target entry point
	  ;;
	  ld8 r1 = [r2]		// r1 = target gp
	  mov b6 = r3
	}
	{ .bbb
	  br.sptk.many b6		// tail-jump to the real function
	  ;;
	}
	.endp __ia64_trampoline
#endif
762 #ifdef SHARED
763 // Thunks for backward compatibility.
#ifdef L_fixtfdi
// Backward-compatibility thunk: the old __fixtfti entry point
// tail-jumps to the current __fixxfti implementation.
// (The L_fixtfdi guard macro name is historical.)
	.text
	.align 16
	.global __fixtfti
	.proc __fixtfti
__fixtfti:
	{ .bbb
	  br.sptk.many __fixxfti
	  ;;
	}
	.endp __fixtfti
#endif
#ifdef L_fixunstfdi
// Backward-compatibility thunk: the old __fixunstfti entry point
// tail-jumps to the current __fixunsxfti implementation.
	.align 16
	.global __fixunstfti
	.proc __fixunstfti
__fixunstfti:
	{ .bbb
	  br.sptk.many __fixunsxfti
	  ;;
	}
	.endp __fixunstfti
#endif
#ifdef L_floatditf
// Backward-compatibility thunk: the old __floattitf entry point
// tail-jumps to the current __floattixf implementation.
// (The L_floatditf guard macro name is historical.)
	.align 16
	.global __floattitf
	.proc __floattitf
__floattitf:
	{ .bbb
	  br.sptk.many __floattixf
	  ;;
	}
	.endp __floattitf
#endif
798 #endif