Remove old autovect-branch by moving to "dead" directory.
[official-gcc.git] / old-autovect-branch / gcc / config / ia64 / lib1funcs.asm
blob 245a8bb1595a9575423cb2f27b6ce93333d49334
1 /* Copyright (C) 2000, 2001, 2003, 2005 Free Software Foundation, Inc.
2 Contributed by James E. Wilson <wilson@cygnus.com>.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING. If not, write to
18 the Free Software Foundation, 51 Franklin Street, Fifth Floor,
19 Boston, MA 02110-1301, USA. */
21 /* As a special exception, if you link this library with other files,
22 some of which are compiled with GCC, to produce an executable,
23 this library does not by itself cause the resulting executable
24 to be covered by the GNU General Public License.
25 This exception does not however invalidate any other reasons why
26 the executable file might be covered by the GNU General Public License. */
28 #ifdef L__divxf3
29 // Compute a 80-bit IEEE double-extended quotient.
31 // From the Intel IA-64 Optimization Guide, choose the minimum latency
32 // alternative.
34 // farg0 holds the dividend. farg1 holds the divisor.
36 // __divtf3 is an alternate symbol name for backward compatibility.
38 .text
39 .align 16
40 .global __divxf3
41 .global __divtf3
42 .proc __divxf3
43 __divxf3:
44 __divtf3:
// p7 starts true and is cleared below exactly when frcpa sets p6, so at
// the end p7 == !p6: when frcpa already produced the final result itself
// (special-case operands), return its output f10 unrefined.
45 cmp.eq p7, p0 = r0, r0
46 frcpa.s0 f10, p6 = farg0, farg1
48 (p6) cmp.ne p7, p0 = r0, r0
49 .pred.rel.mutex p6, p7
// (p6) path: refine the reciprocal approximation f10 and the quotient
// estimate (f12/f11) with fma/fnma chains in status field s1; the final
// fma.s0 rounds the result in the user's rounding mode into fret0.
50 (p6) fnma.s1 f11 = farg1, f10, f1
51 (p6) fma.s1 f12 = farg0, f10, f0
53 (p6) fma.s1 f13 = f11, f11, f0
54 (p6) fma.s1 f14 = f11, f11, f11
56 (p6) fma.s1 f11 = f13, f13, f11
57 (p6) fma.s1 f13 = f14, f10, f10
59 (p6) fma.s1 f10 = f13, f11, f10
60 (p6) fnma.s1 f11 = farg1, f12, farg0
62 (p6) fma.s1 f11 = f11, f10, f12
63 (p6) fnma.s1 f12 = farg1, f10, f1
65 (p6) fma.s1 f10 = f12, f10, f10
66 (p6) fnma.s1 f12 = farg1, f11, farg0
68 (p6) fma.s0 fret0 = f12, f10, f11
69 (p7) mov fret0 = f10
70 br.ret.sptk rp
71 .endp __divxf3
72 #endif
74 #ifdef L__divdf3
75 // Compute a 64-bit IEEE double quotient.
77 // From the Intel IA-64 Optimization Guide, choose the minimum latency
78 // alternative.
80 // farg0 holds the dividend. farg1 holds the divisor.
82 .text
83 .align 16
84 .global __divdf3
85 .proc __divdf3
86 __divdf3:
// p7 == !p6 after the next two instructions: frcpa sets p6 when its
// approximation needs refinement; otherwise f10 already holds the final
// (special-case) result and the (p7) mov at the end returns it as-is.
87 cmp.eq p7, p0 = r0, r0
88 frcpa.s0 f10, p6 = farg0, farg1
90 (p6) cmp.ne p7, p0 = r0, r0
91 .pred.rel.mutex p6, p7
// (p6) path: refine reciprocal (f10) and quotient (f11) estimates in s1;
// the .d-suffixed ops round intermediate results to double, and the final
// fma.d computes the rounded double quotient into fret0.
92 (p6) fmpy.s1 f11 = farg0, f10
93 (p6) fnma.s1 f12 = farg1, f10, f1
95 (p6) fma.s1 f11 = f12, f11, f11
96 (p6) fmpy.s1 f13 = f12, f12
98 (p6) fma.s1 f10 = f12, f10, f10
99 (p6) fma.s1 f11 = f13, f11, f11
101 (p6) fmpy.s1 f12 = f13, f13
102 (p6) fma.s1 f10 = f13, f10, f10
104 (p6) fma.d.s1 f11 = f12, f11, f11
105 (p6) fma.s1 f10 = f12, f10, f10
107 (p6) fnma.d.s1 f8 = farg1, f11, farg0
109 (p6) fma.d fret0 = f8, f10, f11
110 (p7) mov fret0 = f10
111 br.ret.sptk rp
113 .endp __divdf3
114 #endif
116 #ifdef L__divsf3
117 // Compute a 32-bit IEEE float quotient.
119 // From the Intel IA-64 Optimization Guide, choose the minimum latency
120 // alternative.
122 // farg0 holds the dividend. farg1 holds the divisor.
124 .text
125 .align 16
126 .global __divsf3
127 .proc __divsf3
128 __divsf3:
// p7 == !p6: when frcpa handles the operands itself (special cases) the
// refinement below is skipped and f10 is returned directly via (p7) mov.
129 cmp.eq p7, p0 = r0, r0
130 frcpa.s0 f10, p6 = farg0, farg1
132 (p6) cmp.ne p7, p0 = r0, r0
133 .pred.rel.mutex p6, p7
// (p6) path: two refinement steps on the quotient estimate f8 (single
// precision needs fewer iterations); fnorm.s.s0 performs the final
// rounding to float in the user's status field.
134 (p6) fmpy.s1 f8 = farg0, f10
135 (p6) fnma.s1 f9 = farg1, f10, f1
137 (p6) fma.s1 f8 = f9, f8, f8
138 (p6) fmpy.s1 f9 = f9, f9
140 (p6) fma.s1 f8 = f9, f8, f8
141 (p6) fmpy.s1 f9 = f9, f9
143 (p6) fma.d.s1 f10 = f9, f8, f8
145 (p6) fnorm.s.s0 fret0 = f10
146 (p7) mov fret0 = f10
147 br.ret.sptk rp
149 .endp __divsf3
150 #endif
152 #ifdef L__divdi3
153 // Compute a 64-bit integer quotient.
155 // From the Intel IA-64 Optimization Guide, choose the minimum latency
156 // alternative.
158 // in0 holds the dividend. in1 holds the divisor.
160 .text
161 .align 16
162 .global __divdi3
163 .proc __divdi3
164 __divdi3:
165 .regstk 2,0,0,0
166 // Transfer inputs to FP registers.
167 setf.sig f8 = in0
168 setf.sig f9 = in1
169 // Check divide by zero.
// p7 = (in1 == 0): the .unc form clears p7 first, then sets it only when
// the compare 0 != in1 is FALSE.
170 cmp.ne.unc p0,p7=0,in1
172 // Convert the inputs to FP, so that they won't be treated as unsigned.
173 fcvt.xf f8 = f8
174 fcvt.xf f9 = f9
// Divide-by-zero trap; break immediate 1 is the conventional IA-64
// integer-divide fault (mapped to SIGFPE by the OS -- TODO confirm).
175 (p7) break 1
177 // Compute the reciprocal approximation.
178 frcpa.s1 f10, p6 = f8, f9
180 // 3 Newton-Raphson iterations.
181 (p6) fnma.s1 f11 = f9, f10, f1
182 (p6) fmpy.s1 f12 = f8, f10
184 (p6) fmpy.s1 f13 = f11, f11
185 (p6) fma.s1 f12 = f11, f12, f12
187 (p6) fma.s1 f10 = f11, f10, f10
188 (p6) fma.s1 f11 = f13, f12, f12
190 (p6) fma.s1 f10 = f13, f10, f10
191 (p6) fnma.s1 f12 = f9, f11, f8
193 (p6) fma.s1 f10 = f12, f10, f11
195 // Round quotient to an integer.
// Truncation toward zero matches C signed-division semantics.
196 fcvt.fx.trunc.s1 f10 = f10
198 // Transfer result to GP registers.
199 getf.sig ret0 = f10
200 br.ret.sptk rp
202 .endp __divdi3
203 #endif
205 #ifdef L__moddi3
206 // Compute a 64-bit integer modulus.
208 // From the Intel IA-64 Optimization Guide, choose the minimum latency
209 // alternative.
211 // in0 holds the dividend (a). in1 holds the divisor (b).
213 .text
214 .align 16
215 .global __moddi3
216 .proc __moddi3
217 __moddi3:
218 .regstk 2,0,0,0
219 // Transfer inputs to FP registers.
// f14 keeps the original integer dividend a for the final xma below.
220 setf.sig f14 = in0
221 setf.sig f9 = in1
222 // Check divide by zero.
// p7 = (in1 == 0); traps via break below.
223 cmp.ne.unc p0,p7=0,in1
225 // Convert the inputs to FP, so that they won't be treated as unsigned.
226 fcvt.xf f8 = f14
227 fcvt.xf f9 = f9
228 (p7) break 1
230 // Compute the reciprocal approximation.
231 frcpa.s1 f10, p6 = f8, f9
233 // 3 Newton-Raphson iterations.
234 (p6) fmpy.s1 f12 = f8, f10
235 (p6) fnma.s1 f11 = f9, f10, f1
237 (p6) fma.s1 f12 = f11, f12, f12
238 (p6) fmpy.s1 f13 = f11, f11
240 (p6) fma.s1 f10 = f11, f10, f10
241 (p6) fma.s1 f11 = f13, f12, f12
// Negate the divisor in the integer unit in parallel: in1 = -b, reloaded
// into f9 for the fixed-point multiply-add at the end.
243 sub in1 = r0, in1
244 (p6) fma.s1 f10 = f13, f10, f10
245 (p6) fnma.s1 f12 = f9, f11, f8
247 setf.sig f9 = in1
248 (p6) fma.s1 f10 = f12, f10, f11
// Truncate the quotient toward zero (C semantics), then
250 fcvt.fx.trunc.s1 f10 = f10
252 // r = q * (-b) + a
253 xma.l f10 = f10, f9, f14
255 // Transfer result to GP registers.
256 getf.sig ret0 = f10
257 br.ret.sptk rp
259 .endp __moddi3
260 #endif
262 #ifdef L__udivdi3
263 // Compute a 64-bit unsigned integer quotient.
265 // From the Intel IA-64 Optimization Guide, choose the minimum latency
266 // alternative.
268 // in0 holds the dividend. in1 holds the divisor.
270 .text
271 .align 16
272 .global __udivdi3
273 .proc __udivdi3
274 __udivdi3:
275 .regstk 2,0,0,0
276 // Transfer inputs to FP registers.
277 setf.sig f8 = in0
278 setf.sig f9 = in1
279 // Check divide by zero.
// p7 = (in1 == 0): .unc clears p7 first, sets it only when 0 != in1 fails.
280 cmp.ne.unc p0,p7=0,in1
282 // Convert the inputs to FP, to avoid FP software-assist faults.
// fcvt.xuf (unsigned) here, vs. fcvt.xf in the signed __divdi3.
283 fcvt.xuf.s1 f8 = f8
284 fcvt.xuf.s1 f9 = f9
// Divide-by-zero trap.
285 (p7) break 1
287 // Compute the reciprocal approximation.
288 frcpa.s1 f10, p6 = f8, f9
290 // 3 Newton-Raphson iterations.
291 (p6) fnma.s1 f11 = f9, f10, f1
292 (p6) fmpy.s1 f12 = f8, f10
294 (p6) fmpy.s1 f13 = f11, f11
295 (p6) fma.s1 f12 = f11, f12, f12
297 (p6) fma.s1 f10 = f11, f10, f10
298 (p6) fma.s1 f11 = f13, f12, f12
300 (p6) fma.s1 f10 = f13, f10, f10
301 (p6) fnma.s1 f12 = f9, f11, f8
303 (p6) fma.s1 f10 = f12, f10, f11
305 // Round quotient to an unsigned integer.
306 fcvt.fxu.trunc.s1 f10 = f10
308 // Transfer result to GP registers.
309 getf.sig ret0 = f10
310 br.ret.sptk rp
312 .endp __udivdi3
313 #endif
315 #ifdef L__umoddi3
316 // Compute a 64-bit unsigned integer modulus.
318 // From the Intel IA-64 Optimization Guide, choose the minimum latency
319 // alternative.
321 // in0 holds the dividend (a). in1 holds the divisor (b).
323 .text
324 .align 16
325 .global __umoddi3
326 .proc __umoddi3
327 __umoddi3:
328 .regstk 2,0,0,0
329 // Transfer inputs to FP registers.
// f14 keeps the original dividend a for the final xma below.
330 setf.sig f14 = in0
331 setf.sig f9 = in1
332 // Check divide by zero.
333 cmp.ne.unc p0,p7=0,in1
335 // Convert the inputs to FP, to avoid FP software assist faults.
336 fcvt.xuf.s1 f8 = f14
337 fcvt.xuf.s1 f9 = f9
// NOTE(review): stray ';' after the break -- harmless (statement
// separator) but inconsistent with the other sections' "break 1".
338 (p7) break 1;
340 // Compute the reciprocal approximation.
341 frcpa.s1 f10, p6 = f8, f9
343 // 3 Newton-Raphson iterations.
344 (p6) fmpy.s1 f12 = f8, f10
345 (p6) fnma.s1 f11 = f9, f10, f1
347 (p6) fma.s1 f12 = f11, f12, f12
348 (p6) fmpy.s1 f13 = f11, f11
350 (p6) fma.s1 f10 = f11, f10, f10
351 (p6) fma.s1 f11 = f13, f12, f12
// in1 = -b, computed in the integer unit in parallel and reloaded into
// f9 for the fixed-point multiply-add that forms the remainder.
353 sub in1 = r0, in1
354 (p6) fma.s1 f10 = f13, f10, f10
355 (p6) fnma.s1 f12 = f9, f11, f8
357 setf.sig f9 = in1
358 (p6) fma.s1 f10 = f12, f10, f11
360 // Round quotient to an unsigned integer.
361 fcvt.fxu.trunc.s1 f10 = f10
363 // r = q * (-b) + a
364 xma.l f10 = f10, f9, f14
366 // Transfer result to GP registers.
367 getf.sig ret0 = f10
368 br.ret.sptk rp
370 .endp __umoddi3
371 #endif
373 #ifdef L__divsi3
374 // Compute a 32-bit integer quotient.
376 // From the Intel IA-64 Optimization Guide, choose the minimum latency
377 // alternative.
379 // in0 holds the dividend. in1 holds the divisor.
381 .text
382 .align 16
383 .global __divsi3
384 .proc __divsi3
385 __divsi3:
386 .regstk 2,0,0,0
387 // Check divide by zero.
// p7 = (in1 == 0), tested before the sign extensions clobber in1's form.
388 cmp.ne.unc p0,p7=0,in1
389 sxt4 in0 = in0
390 sxt4 in1 = in1
392 setf.sig f8 = in0
393 setf.sig f9 = in1
394 (p7) break 1
// 0x0ffdd is a biased FP exponent (register-format bias 0xffff), so
// setf.exp builds f11 = 2**-34: a fudge term added to the correction so
// a single refinement step yields a correctly truncated 32-bit quotient
// (per Intel's published IA-64 division algorithms).
396 mov r2 = 0x0ffdd
397 fcvt.xf f8 = f8
398 fcvt.xf f9 = f9
400 setf.exp f11 = r2
401 frcpa.s1 f10, p6 = f8, f9
// One Newton-Raphson style refinement (enough for 32-bit operands).
403 (p6) fmpy.s1 f8 = f8, f10
404 (p6) fnma.s1 f9 = f9, f10, f1
406 (p6) fma.s1 f8 = f9, f8, f8
407 (p6) fma.s1 f9 = f9, f9, f11
409 (p6) fma.s1 f10 = f9, f8, f8
// Truncate toward zero (C semantics) and move back to the integer side.
411 fcvt.fx.trunc.s1 f10 = f10
413 getf.sig ret0 = f10
414 br.ret.sptk rp
416 .endp __divsi3
417 #endif
419 #ifdef L__modsi3
420 // Compute a 32-bit integer modulus.
422 // From the Intel IA-64 Optimization Guide, choose the minimum latency
423 // alternative.
425 // in0 holds the dividend. in1 holds the divisor.
427 .text
428 .align 16
429 .global __modsi3
430 .proc __modsi3
431 __modsi3:
432 .regstk 2,0,0,0
// 0x0ffdd: biased exponent for f11 = 2**-34 (see __divsi3) -- fudge term
// that makes one refinement step sufficient for 32-bit operands.
433 mov r2 = 0x0ffdd
434 sxt4 in0 = in0
435 sxt4 in1 = in1
// r32/r33 are the raw stacked-register names for in0/in1; f13 keeps the
// integer dividend a for the final xma.
437 setf.sig f13 = r32
438 setf.sig f9 = r33
439 // Check divide by zero.
440 cmp.ne.unc p0,p7=0,in1
// in1 = -b for the remainder computation r = q * (-b) + a below.
442 sub in1 = r0, in1
443 fcvt.xf f8 = f13
444 fcvt.xf f9 = f9
446 setf.exp f11 = r2
447 frcpa.s1 f10, p6 = f8, f9
448 (p7) break 1
450 (p6) fmpy.s1 f12 = f8, f10
451 (p6) fnma.s1 f10 = f9, f10, f1
453 setf.sig f9 = in1
454 (p6) fma.s1 f12 = f10, f12, f12
455 (p6) fma.s1 f10 = f10, f10, f11
457 (p6) fma.s1 f10 = f10, f12, f12
// Truncate quotient toward zero, then r = q * (-b) + a.
459 fcvt.fx.trunc.s1 f10 = f10
461 xma.l f10 = f10, f9, f13
463 getf.sig ret0 = f10
464 br.ret.sptk rp
466 .endp __modsi3
467 #endif
469 #ifdef L__udivsi3
470 // Compute a 32-bit unsigned integer quotient.
472 // From the Intel IA-64 Optimization Guide, choose the minimum latency
473 // alternative.
475 // in0 holds the dividend. in1 holds the divisor.
477 .text
478 .align 16
479 .global __udivsi3
480 .proc __udivsi3
481 __udivsi3:
482 .regstk 2,0,0,0
// 0x0ffdd: biased exponent for f11 = 2**-34 (see __divsi3) -- fudge term
// so a single refinement step yields an exact 32-bit quotient.
483 mov r2 = 0x0ffdd
// zxt4 (zero-extend) instead of sxt4: unsigned operands.
484 zxt4 in0 = in0
485 zxt4 in1 = in1
487 setf.sig f8 = in0
488 setf.sig f9 = in1
489 // Check divide by zero.
490 cmp.ne.unc p0,p7=0,in1
492 fcvt.xf f8 = f8
493 fcvt.xf f9 = f9
494 (p7) break 1
496 setf.exp f11 = r2
497 frcpa.s1 f10, p6 = f8, f9
// One Newton-Raphson style refinement, as in __divsi3.
499 (p6) fmpy.s1 f8 = f8, f10
500 (p6) fnma.s1 f9 = f9, f10, f1
502 (p6) fma.s1 f8 = f9, f8, f8
503 (p6) fma.s1 f9 = f9, f9, f11
505 (p6) fma.s1 f10 = f9, f8, f8
// Unsigned truncation, then back to the integer side.
507 fcvt.fxu.trunc.s1 f10 = f10
509 getf.sig ret0 = f10
510 br.ret.sptk rp
512 .endp __udivsi3
513 #endif
515 #ifdef L__umodsi3
516 // Compute a 32-bit unsigned integer modulus.
518 // From the Intel IA-64 Optimization Guide, choose the minimum latency
519 // alternative.
521 // in0 holds the dividend. in1 holds the divisor.
523 .text
524 .align 16
525 .global __umodsi3
526 .proc __umodsi3
527 __umodsi3:
528 .regstk 2,0,0,0
// 0x0ffdd: biased exponent for f11 = 2**-34 (see __divsi3).
529 mov r2 = 0x0ffdd
530 zxt4 in0 = in0
531 zxt4 in1 = in1
// f13 keeps the integer dividend a for the final xma.
533 setf.sig f13 = in0
534 setf.sig f9 = in1
535 // Check divide by zero.
536 cmp.ne.unc p0,p7=0,in1
// in1 = -b for the remainder computation r = q * (-b) + a below.
538 sub in1 = r0, in1
539 fcvt.xf f8 = f13
540 fcvt.xf f9 = f9
542 setf.exp f11 = r2
543 frcpa.s1 f10, p6 = f8, f9
// NOTE(review): stray ';' after the break -- harmless but inconsistent
// with the other sections' "break 1".
544 (p7) break 1;
546 (p6) fmpy.s1 f12 = f8, f10
547 (p6) fnma.s1 f10 = f9, f10, f1
549 setf.sig f9 = in1
550 (p6) fma.s1 f12 = f10, f12, f12
551 (p6) fma.s1 f10 = f10, f10, f11
553 (p6) fma.s1 f10 = f10, f12, f12
555 fcvt.fxu.trunc.s1 f10 = f10
557 xma.l f10 = f10, f9, f13
559 getf.sig ret0 = f10
560 br.ret.sptk rp
562 .endp __umodsi3
563 #endif
565 #ifdef L__save_stack_nonlocal
566 // Notes on save/restore stack nonlocal: We read ar.bsp but write
567 // ar.bspstore. This is because ar.bsp can be read at all times
568 // (independent of the RSE mode) but since it's read-only we need to
569 // restore the value via ar.bspstore. This is OK because
570 // ar.bsp==ar.bspstore after executing "flushrs".
572 // void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
574 .text
575 .align 16
576 .global __ia64_save_stack_nonlocal
577 .proc __ia64_save_stack_nonlocal
578 __ia64_save_stack_nonlocal:
// Save-area layout written here (from the post-increments on in0 and
// r2 = in0+8): [0]=stack_pointer, [8]=ar.bsp, [16]=ar.rnat, [24]=ar.pfs.
579 { .mmf
580 alloc r18 = ar.pfs, 2, 0, 0, 0
581 mov r19 = ar.rsc
584 { .mmi
585 flushrs
586 st8 [in0] = in1, 24
// Mask 0x1c clears the rsc.mode bits (1:0) -> enforced-lazy RSE so
// ar.rnat can be read coherently; 0x3 is OR'ed back in before returning.
// NOTE(review): assumes the original mode was eager (0x3) -- the saved
// mode bits are discarded, matching the restore path's assumption.
587 and r19 = 0x1c, r19
590 { .mmi
591 st8 [in0] = r18, -16
592 mov ar.rsc = r19
593 or r19 = 0x3, r19
596 { .mmi
597 mov r16 = ar.bsp
598 mov r17 = ar.rnat
599 adds r2 = 8, in0
602 { .mmi
603 st8 [in0] = r16
604 st8 [r2] = r17
606 { .mib
607 mov ar.rsc = r19
608 br.ret.sptk.few rp
611 .endp __ia64_save_stack_nonlocal
612 #endif
614 #ifdef L__nonlocal_goto
615 // void __ia64_nonlocal_goto(void *target_label, void *save_area,
616 // void *static_chain);
// Restores from the save-area layout produced by
// __ia64_save_stack_nonlocal: [0]=sp, [8]=bsp, [16]=rnat, [24]=ar.pfs
// (in1 is post-incremented by 8 through the loads).
618 .text
619 .align 16
620 .global __ia64_nonlocal_goto
621 .proc __ia64_nonlocal_goto
622 __ia64_nonlocal_goto:
623 { .mmi
624 alloc r20 = ar.pfs, 3, 0, 0, 0
625 ld8 r12 = [in1], 8
// Retarget rp to the destination label so the final br.ret jumps there.
626 mov.ret.sptk rp = in0, .L0
629 { .mmf
630 ld8 r16 = [in1], 8
631 mov r19 = ar.rsc
634 { .mmi
635 flushrs
636 ld8 r17 = [in1], 8
// Enforced-lazy RSE mode (clear rsc.mode bits 1:0) while rewriting
// ar.bspstore/ar.rnat; eager mode (0x3) is restored at .L0.
637 and r19 = 0x1c, r19
640 { .mmi
641 ld8 r18 = [in1]
642 mov ar.rsc = r19
643 or r19 = 0x3, r19
646 { .mmi
647 mov ar.bspstore = r16
649 mov ar.rnat = r17
652 { .mmi
653 loadrs
654 invala
// r15 receives the static chain (third argument) for the target frame.
655 mov r15 = in2
658 .L0: { .mib
659 mov ar.rsc = r19
660 mov ar.pfs = r18
661 br.ret.sptk.few rp
664 .endp __ia64_nonlocal_goto
665 #endif
667 #ifdef L__restore_stack_nonlocal
668 // This is mostly the same as nonlocal_goto above.
669 // ??? This has not been tested yet.
671 // void __ia64_restore_stack_nonlocal(void *save_area)
// Same save-area layout as __ia64_save_stack_nonlocal:
// [0]=sp, [8]=bsp, [16]=rnat, [24]=ar.pfs; unlike nonlocal_goto, rp is
// left alone so control returns to the caller normally.
673 .text
674 .align 16
675 .global __ia64_restore_stack_nonlocal
676 .proc __ia64_restore_stack_nonlocal
677 __ia64_restore_stack_nonlocal:
678 { .mmf
// NOTE(review): alloc declares 4 inputs though the signature has one --
// harmless over-allocation, kept parallel with nonlocal_goto.
679 alloc r20 = ar.pfs, 4, 0, 0, 0
680 ld8 r12 = [in0], 8
683 { .mmb
684 ld8 r16=[in0], 8
685 mov r19 = ar.rsc
688 { .mmi
689 flushrs
690 ld8 r17 = [in0], 8
// Enforced-lazy RSE (clear rsc.mode bits 1:0) while rewriting
// ar.bspstore/ar.rnat; eager mode (0x3) restored at .L0.
691 and r19 = 0x1c, r19
694 { .mmf
695 ld8 r18 = [in0]
696 mov ar.rsc = r19
699 { .mmi
700 mov ar.bspstore = r16
702 mov ar.rnat = r17
703 or r19 = 0x3, r19
706 { .mmf
707 loadrs
708 invala
711 .L0: { .mib
712 mov ar.rsc = r19
713 mov ar.pfs = r18
714 br.ret.sptk.few rp
717 .endp __ia64_restore_stack_nonlocal
718 #endif
720 #ifdef L__trampoline
721 // Implement the nested function trampoline. This is out of line
722 // so that we don't have to bother with flushing the icache, as
723 // well as making the on-stack trampoline smaller.
725 // The trampoline has the following form:
727 // +-------------------+ >
728 // TRAMP: | __ia64_trampoline | |
729 // +-------------------+ > fake function descriptor
730 // | TRAMP+16 | |
731 // +-------------------+ >
732 // | target descriptor |
733 // +-------------------+
734 // | static link |
735 // +-------------------+
737 .text
738 .align 16
739 .global __ia64_trampoline
740 .proc __ia64_trampoline
741 __ia64_trampoline:
// Entered through the fake descriptor above, so gp (r1) = TRAMP+16,
// pointing at the target-descriptor word followed by the static link.
742 { .mmi
// r2 = address of the target's real function descriptor.
743 ld8 r2 = [r1], 8
// r15 = static link (the static-chain register for the nested function).
745 ld8 r15 = [r1]
747 { .mmi
// Unpack the target descriptor: entry point into b6, target gp into r1.
748 ld8 r3 = [r2], 8
750 ld8 r1 = [r2]
751 mov b6 = r3
753 { .bbb
// Tail-branch: the target returns directly to the original caller.
754 br.sptk.many b6
757 .endp __ia64_trampoline
758 #endif
760 // Thunks for backward compatibility.
761 #ifdef L_fixtfdi
// Backward-compatibility thunk: old objects reference __fixtfti from
// when TFmode was the 80-bit extended type; forward (tail-branch, same
// register ABI) to the XFmode implementation __fixxfti.
// NOTE(review): guard macro says "di" while the symbol is "ti" --
// presumably a historical naming quirk kept for the build machinery.
762 .text
763 .align 16
764 .global __fixtfti
765 .proc __fixtfti
766 __fixtfti:
767 { .bbb
768 br.sptk.many __fixxfti
771 .endp __fixtfti
772 #endif
773 #ifdef L_fixunstfdi
// Backward-compatibility thunk: tail-branch __fixunstfti to the XFmode
// implementation __fixunsxfti (same register ABI, no frame needed).
// NOTE(review): no .text directive here, unlike the sibling thunks --
// relies on .text being the default section.
774 .align 16
775 .global __fixunstfti
776 .proc __fixunstfti
777 __fixunstfti:
778 { .bbb
779 br.sptk.many __fixunsxfti
782 .endp __fixunstfti
783 #endif
// Backward-compatibility thunk: tail-branch __floattitf to the XFmode
// implementation __floattixf (same register ABI, no frame needed).
// Fixed: use #ifdef like every other L_* section -- "#if L_floatditf"
// only worked because -DL_floatditf defines the macro to 1, and breaks
// if the macro is defined empty.
784 #ifdef L_floatditf
785 .align 16
786 .global __floattitf
787 .proc __floattitf
788 __floattitf:
789 { .bbb
790 br.sptk.many __floattixf
793 .endp __floattitf
794 #endif