1 /* Copyright (C) 2000-2024 Free Software Foundation, Inc.
2 Contributed by James E. Wilson <wilson@cygnus.com>.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 <http://www.gnu.org/licenses/>. */
26 // Compute an 80-bit IEEE double-extended quotient.
28 // From the Intel IA-64 Optimization Guide, choose the minimum latency
31 // farg0 holds the dividend. farg1 holds the divisor.
33 // __divtf3 is an alternate symbol name for backward compatibility.
// Notation used below: a = farg0, b = farg1, y = reciprocal estimate,
// q = quotient estimate, e = 1 - b*y, r = a - b*q.  f0 and f1 are the
// architectural constants 0.0 and 1.0.  The .s0/.s1 completers pick the
// FPSR status field that governs each operation.
// p7 = true (r0 == r0 always holds).
44 cmp.eq p7, p0 = r0, r0
// Normal case: f10 = y0 ~= 1/b and p6 = 1.  Special-case operands:
// f10 = the IEEE quotient a/b and p6 = 0, so every (p6) step is skipped.
45 frcpa.s0 f10, p6 = farg0, farg1
// When p6 is set, clear p7, leaving p7 = !p6.
47 (p6) cmp.ne p7, p0 = r0, r0
// Tell the assembler that p6 and p7 are never both true.
48 .pred.rel.mutex p6, p7
// f11 = e = 1 - b*y0
49 (p6) fnma.s1 f11 = farg1, f10, f1
// f12 = q0 = a*y0
50 (p6) fma.s1 f12 = farg0, f10, f0
// f13 = e^2
52 (p6) fma.s1 f13 = f11, f11, f0
// f14 = e + e^2
53 (p6) fma.s1 f14 = f11, f11, f11
// f11 = e + e^4
55 (p6) fma.s1 f11 = f13, f13, f11
// f13 = y1 = y0 + y0*(e + e^2)
56 (p6) fma.s1 f13 = f14, f10, f10
// f10 = y2 = y0 + y1*(e + e^4)
58 (p6) fma.s1 f10 = f13, f11, f10
// f11 = r0 = a - b*q0
59 (p6) fnma.s1 f11 = farg1, f12, farg0
// f11 = q1 = q0 + r0*y2
61 (p6) fma.s1 f11 = f11, f10, f12
// f12 = e2 = 1 - b*y2
62 (p6) fnma.s1 f12 = farg1, f10, f1
// f10 = y3 = y2 + e2*y2
64 (p6) fma.s1 f10 = f12, f10, f10
// f12 = r1 = a - b*q1
65 (p6) fnma.s1 f12 = farg1, f11, farg0
// Final correction in status field 0: fret0 = q1 + r1*y3.
67 (p6) fma.s0 fret0 = f12, f10, f11
74 // Compute a 64-bit IEEE double quotient.
76 // From the Intel IA-64 Optimization Guide, choose the minimum latency
79 // farg0 holds the dividend. farg1 holds the divisor.
// Notation used below: a = farg0, b = farg1, y = reciprocal estimate,
// q = quotient estimate, e = 1 - b*y0.  f1 is the architectural
// constant 1.0.  The .d completer rounds a result to double precision.
// p7 = true (r0 == r0 always holds).
86 cmp.eq p7, p0 = r0, r0
// Normal case: f10 = y0 ~= 1/b and p6 = 1.  Special-case operands:
// f10 = the IEEE quotient a/b and p6 = 0, so every (p6) step is skipped.
87 frcpa.s0 f10, p6 = farg0, farg1
// When p6 is set, clear p7, leaving p7 = !p6.
89 (p6) cmp.ne p7, p0 = r0, r0
// Tell the assembler that p6 and p7 are never both true.
90 .pred.rel.mutex p6, p7
// f11 = q0 = a*y0
91 (p6) fmpy.s1 f11 = farg0, f10
// f12 = e = 1 - b*y0
92 (p6) fnma.s1 f12 = farg1, f10, f1
// f11 = q1 = q0 + e*q0
94 (p6) fma.s1 f11 = f12, f11, f11
// f13 = e^2
95 (p6) fmpy.s1 f13 = f12, f12
// f10 = y1 = y0 + e*y0
97 (p6) fma.s1 f10 = f12, f10, f10
// f11 = q2 = q1 + e^2*q1
98 (p6) fma.s1 f11 = f13, f11, f11
// f12 = e^4
100 (p6) fmpy.s1 f12 = f13, f13
// f10 = y2 = y1 + e^2*y1
101 (p6) fma.s1 f10 = f13, f10, f10
// f11 = q3 = q2 + e^4*q2, rounded to double precision
103 (p6) fma.d.s1 f11 = f12, f11, f11
// f10 = y3 = y2 + e^4*y2
104 (p6) fma.s1 f10 = f12, f10, f10
// f8 = r = a - b*q3, rounded to double precision
106 (p6) fnma.d.s1 f8 = farg1, f11, farg0
// Final correction, rounded to double: fret0 = q3 + r*y3.  No status-field
// completer is given, so the assembler default applies.
108 (p6) fma.d fret0 = f8, f10, f11
116 // Compute a 32-bit IEEE float quotient.
118 // From the Intel IA-64 Optimization Guide, choose the minimum latency
121 // farg0 holds the dividend. farg1 holds the divisor.
// Notation used below: a = farg0, b = farg1, y0 = reciprocal estimate,
// q = quotient estimate, e = 1 - b*y0.  f1 is the architectural
// constant 1.0.
// p7 = true (r0 == r0 always holds).
128 cmp.eq p7, p0 = r0, r0
// Normal case: f10 = y0 ~= 1/b and p6 = 1.  Special-case operands:
// f10 = the IEEE quotient a/b and p6 = 0, so every (p6) step is skipped.
129 frcpa.s0 f10, p6 = farg0, farg1
// When p6 is set, clear p7, leaving p7 = !p6.
131 (p6) cmp.ne p7, p0 = r0, r0
// Tell the assembler that p6 and p7 are never both true.
132 .pred.rel.mutex p6, p7
// f8 = q0 = a*y0
133 (p6) fmpy.s1 f8 = farg0, f10
// f9 = e = 1 - b*y0
134 (p6) fnma.s1 f9 = farg1, f10, f1
// f8 = q1 = q0 + e*q0
136 (p6) fma.s1 f8 = f9, f8, f8
// f9 = e^2
137 (p6) fmpy.s1 f9 = f9, f9
// f8 = q2 = q1 + e^2*q1
139 (p6) fma.s1 f8 = f9, f8, f8
// f9 = e^4
140 (p6) fmpy.s1 f9 = f9, f9
// f10 = q3 = q2 + e^4*q2, rounded to double precision
142 (p6) fma.d.s1 f10 = f9, f8, f8
// Round q3 to single precision under status field 0 to form the result.
144 (p6) fnorm.s.s0 fret0 = f10
152 // Compute a 64-bit integer quotient.
154 // From the Intel IA-64 Optimization Guide, choose the minimum latency
157 // in0 holds the dividend. in1 holds the divisor.
// Notation used below: a = f8 and b = f9 (the numerator and denominator
// given to frcpa), y = reciprocal estimate, q = quotient estimate,
// e = 1 - b*y0.  f1 is the architectural constant 1.0.
165 // Transfer inputs to FP registers.
168 // Check divide by zero.
// p7 = (in1 == 0); the complementary result targets p0 and is discarded.
169 cmp.ne.unc p0,p7=0,in1
171 // Convert the inputs to FP, so that they won't be treated as unsigned.
176 // Compute the reciprocal approximation.
// f10 = y0 ~= 1/b with p6 set in the normal case; p6 is cleared for
// operands frcpa treats as special cases, which skips the refinement.
177 frcpa.s1 f10, p6 = f8, f9
179 // 3 Newton-Raphson iterations.
// f11 = e = 1 - b*y0
180 (p6) fnma.s1 f11 = f9, f10, f1
// f12 = q0 = a*y0
181 (p6) fmpy.s1 f12 = f8, f10
// f13 = e^2
183 (p6) fmpy.s1 f13 = f11, f11
// f12 = q1 = q0 + e*q0
184 (p6) fma.s1 f12 = f11, f12, f12
// f10 = y1 = y0 + e*y0
186 (p6) fma.s1 f10 = f11, f10, f10
// f11 = q2 = q1 + e^2*q1
187 (p6) fma.s1 f11 = f13, f12, f12
// f10 = y2 = y1 + e^2*y1
189 (p6) fma.s1 f10 = f13, f10, f10
// f12 = r = a - b*q2
190 (p6) fnma.s1 f12 = f9, f11, f8
// f10 = q3 = q2 + r*y2
192 (p6) fma.s1 f10 = f12, f10, f11
194 // Round quotient to an integer.
// Not predicated: converts f10 to a signed 64-bit integer, truncating
// toward zero, whether or not p6 was set.
195 fcvt.fx.trunc.s1 f10 = f10
197 // Transfer result to GP registers.
205 // Compute a 64-bit integer modulus.
207 // From the Intel IA-64 Optimization Guide, choose the minimum latency
210 // in0 holds the dividend (a). in1 holds the divisor (b).
// Notation used below: a = f8 and b = f9 (the numerator and denominator
// given to frcpa), y = reciprocal estimate, q = quotient estimate,
// e = 1 - b*y0.  f1 is the architectural constant 1.0.
218 // Transfer inputs to FP registers.
221 // Check divide by zero.
// p7 = (in1 == 0); the complementary result targets p0 and is discarded.
222 cmp.ne.unc p0,p7=0,in1
224 // Convert the inputs to FP, so that they won't be treated as unsigned.
229 // Compute the reciprocal approximation.
// f10 = y0 ~= 1/b with p6 set in the normal case; p6 is cleared for
// operands frcpa treats as special cases, which skips the refinement.
230 frcpa.s1 f10, p6 = f8, f9
232 // 3 Newton-Raphson iterations.
// f12 = q0 = a*y0
233 (p6) fmpy.s1 f12 = f8, f10
// f11 = e = 1 - b*y0
234 (p6) fnma.s1 f11 = f9, f10, f1
// f12 = q1 = q0 + e*q0
236 (p6) fma.s1 f12 = f11, f12, f12
// f13 = e^2
237 (p6) fmpy.s1 f13 = f11, f11
// f10 = y1 = y0 + e*y0
239 (p6) fma.s1 f10 = f11, f10, f10
// f11 = q2 = q1 + e^2*q1
240 (p6) fma.s1 f11 = f13, f12, f12
// f10 = y2 = y1 + e^2*y1
243 (p6) fma.s1 f10 = f13, f10, f10
// f12 = r = a - b*q2
244 (p6) fnma.s1 f12 = f9, f11, f8
// f10 = q3 = q2 + r*y2
247 (p6) fma.s1 f10 = f12, f10, f11
// Not predicated: converts the quotient in f10 to a signed 64-bit
// integer, truncating toward zero.
249 fcvt.fx.trunc.s1 f10 = f10
// Integer multiply-add: f10 = low 64 bits of (f10 * f9 + f14).
// NOTE(review): for this to be the remainder, f9 must by now hold the
// negated integer divisor and f14 the integer dividend -- confirm
// against the instructions that load them.
252 xma.l f10 = f10, f9, f14
254 // Transfer result to GP registers.
262 // Compute a 64-bit unsigned integer quotient.
264 // From the Intel IA-64 Optimization Guide, choose the minimum latency
267 // in0 holds the dividend. in1 holds the divisor.
// Notation used below: a = f8 and b = f9 (the numerator and denominator
// given to frcpa), y = reciprocal estimate, q = quotient estimate,
// e = 1 - b*y0.  f1 is the architectural constant 1.0.
275 // Transfer inputs to FP registers.
278 // Check divide by zero.
// p7 = (in1 == 0); the complementary result targets p0 and is discarded.
279 cmp.ne.unc p0,p7=0,in1
281 // Convert the inputs to FP, to avoid FP software-assist faults.
286 // Compute the reciprocal approximation.
// f10 = y0 ~= 1/b with p6 set in the normal case; p6 is cleared for
// operands frcpa treats as special cases, which skips the refinement.
287 frcpa.s1 f10, p6 = f8, f9
289 // 3 Newton-Raphson iterations.
// f11 = e = 1 - b*y0
290 (p6) fnma.s1 f11 = f9, f10, f1
// f12 = q0 = a*y0
291 (p6) fmpy.s1 f12 = f8, f10
// f13 = e^2
293 (p6) fmpy.s1 f13 = f11, f11
// f12 = q1 = q0 + e*q0
294 (p6) fma.s1 f12 = f11, f12, f12
// f10 = y1 = y0 + e*y0
296 (p6) fma.s1 f10 = f11, f10, f10
// f11 = q2 = q1 + e^2*q1
297 (p6) fma.s1 f11 = f13, f12, f12
// f10 = y2 = y1 + e^2*y1
299 (p6) fma.s1 f10 = f13, f10, f10
// f12 = r = a - b*q2
300 (p6) fnma.s1 f12 = f9, f11, f8
// f10 = q3 = q2 + r*y2
302 (p6) fma.s1 f10 = f12, f10, f11
304 // Round quotient to an unsigned integer.
// Not predicated: converts f10 to an unsigned 64-bit integer, truncating
// toward zero, whether or not p6 was set.
305 fcvt.fxu.trunc.s1 f10 = f10
307 // Transfer result to GP registers.
315 // Compute a 64-bit unsigned integer modulus.
317 // From the Intel IA-64 Optimization Guide, choose the minimum latency
320 // in0 holds the dividend (a). in1 holds the divisor (b).
// Notation used below: a = f8 and b = f9 (the numerator and denominator
// given to frcpa), y = reciprocal estimate, q = quotient estimate,
// e = 1 - b*y0.  f1 is the architectural constant 1.0.
328 // Transfer inputs to FP registers.
331 // Check divide by zero.
// p7 = (in1 == 0); the complementary result targets p0 and is discarded.
332 cmp.ne.unc p0,p7=0,in1
334 // Convert the inputs to FP, to avoid FP software assist faults.
339 // Compute the reciprocal approximation.
// f10 = y0 ~= 1/b with p6 set in the normal case; p6 is cleared for
// operands frcpa treats as special cases, which skips the refinement.
340 frcpa.s1 f10, p6 = f8, f9
342 // 3 Newton-Raphson iterations.
// f12 = q0 = a*y0
343 (p6) fmpy.s1 f12 = f8, f10
// f11 = e = 1 - b*y0
344 (p6) fnma.s1 f11 = f9, f10, f1
// f12 = q1 = q0 + e*q0
346 (p6) fma.s1 f12 = f11, f12, f12
// f13 = e^2
347 (p6) fmpy.s1 f13 = f11, f11
// f10 = y1 = y0 + e*y0
349 (p6) fma.s1 f10 = f11, f10, f10
// f11 = q2 = q1 + e^2*q1
350 (p6) fma.s1 f11 = f13, f12, f12
// f10 = y2 = y1 + e^2*y1
353 (p6) fma.s1 f10 = f13, f10, f10
// f12 = r = a - b*q2
354 (p6) fnma.s1 f12 = f9, f11, f8
// f10 = q3 = q2 + r*y2
357 (p6) fma.s1 f10 = f12, f10, f11
359 // Round quotient to an unsigned integer.
// Not predicated: converts f10 to an unsigned 64-bit integer, truncating
// toward zero.
360 fcvt.fxu.trunc.s1 f10 = f10
// Integer multiply-add: f10 = low 64 bits of (f10 * f9 + f14).
// NOTE(review): for this to be the remainder, f9 must by now hold the
// negated integer divisor and f14 the integer dividend -- confirm
// against the instructions that load them.
363 xma.l f10 = f10, f9, f14
365 // Transfer result to GP registers.
373 // Compute a 32-bit integer quotient.
375 // From the Intel IA-64 Optimization Guide, choose the minimum latency
378 // in0 holds the dividend. in1 holds the divisor.
// Notation used below: a = f8 and b = f9 as given to frcpa,
// y0 = reciprocal estimate, q = quotient estimate, e = 1 - b*y0.
// f1 is the architectural constant 1.0.
386 // Check divide by zero.
// p7 = (in1 == 0); the complementary result targets p0 and is discarded.
387 cmp.ne.unc p0,p7=0,in1
// f10 = y0 ~= 1/b with p6 set in the normal case; p6 is cleared for
// operands frcpa treats as special cases, which skips the refinement.
400 frcpa.s1 f10, p6 = f8, f9
// f8 = q0 = a*y0 (overwrites the dividend)
402 (p6) fmpy.s1 f8 = f8, f10
// f9 = e = 1 - b*y0 (overwrites the divisor)
403 (p6) fnma.s1 f9 = f9, f10, f1
// f8 = q1 = q0 + e*q0
405 (p6) fma.s1 f8 = f9, f8, f8
// f9 = e^2 + f11.
// NOTE(review): f11 is presumably a small correction constant loaded
// before this sequence -- confirm its value.
406 (p6) fma.s1 f9 = f9, f9, f11
// f10 = q2 = q1 + f9*q1
408 (p6) fma.s1 f10 = f9, f8, f8
// Not predicated: converts f10 to a signed 64-bit integer, truncating
// toward zero.
410 fcvt.fx.trunc.s1 f10 = f10
419 // Compute a 32-bit integer modulus.
421 // From the Intel IA-64 Optimization Guide, choose the minimum latency
424 // in0 holds the dividend. in1 holds the divisor.
// Notation used below: a = f8 and b = f9 as given to frcpa,
// y0 = reciprocal estimate, q = quotient estimate, e = 1 - b*y0.
// f1 is the architectural constant 1.0.
438 // Check divide by zero.
// p7 = (in1 == 0); the complementary result targets p0 and is discarded.
439 cmp.ne.unc p0,p7=0,in1
// f10 = y0 ~= 1/b with p6 set in the normal case; p6 is cleared for
// operands frcpa treats as special cases, which skips the refinement.
446 frcpa.s1 f10, p6 = f8, f9
// f12 = q0 = a*y0
449 (p6) fmpy.s1 f12 = f8, f10
// f10 = e = 1 - b*y0 (overwrites the reciprocal estimate)
450 (p6) fnma.s1 f10 = f9, f10, f1
// f12 = q1 = q0 + e*q0
453 (p6) fma.s1 f12 = f10, f12, f12
// f10 = e^2 + f11.
// NOTE(review): f11 is presumably a small correction constant loaded
// before this sequence -- confirm its value.
454 (p6) fma.s1 f10 = f10, f10, f11
// f10 = q2 = q1 + f10*q1
456 (p6) fma.s1 f10 = f10, f12, f12
// Not predicated: converts f10 to a signed 64-bit integer, truncating
// toward zero.
458 fcvt.fx.trunc.s1 f10 = f10
// Integer multiply-add: f10 = low 64 bits of (f10 * f9 + f13).
// NOTE(review): for this to be the remainder, f9 must by now hold the
// negated integer divisor and f13 the integer dividend -- confirm
// against the instructions that load them.
460 xma.l f10 = f10, f9, f13
469 // Compute a 32-bit unsigned integer quotient.
471 // From the Intel IA-64 Optimization Guide, choose the minimum latency
474 // in0 holds the dividend. in1 holds the divisor.
// Notation used below: a = f8 and b = f9 as given to frcpa,
// y0 = reciprocal estimate, q = quotient estimate, e = 1 - b*y0.
// f1 is the architectural constant 1.0.
488 // Check divide by zero.
// p7 = (in1 == 0); the complementary result targets p0 and is discarded.
489 cmp.ne.unc p0,p7=0,in1
// f10 = y0 ~= 1/b with p6 set in the normal case; p6 is cleared for
// operands frcpa treats as special cases, which skips the refinement.
496 frcpa.s1 f10, p6 = f8, f9
// f8 = q0 = a*y0 (overwrites the dividend)
498 (p6) fmpy.s1 f8 = f8, f10
// f9 = e = 1 - b*y0 (overwrites the divisor)
499 (p6) fnma.s1 f9 = f9, f10, f1
// f8 = q1 = q0 + e*q0
501 (p6) fma.s1 f8 = f9, f8, f8
// f9 = e^2 + f11.
// NOTE(review): f11 is presumably a small correction constant loaded
// before this sequence -- confirm its value.
502 (p6) fma.s1 f9 = f9, f9, f11
// f10 = q2 = q1 + f9*q1
504 (p6) fma.s1 f10 = f9, f8, f8
// Not predicated: converts f10 to an unsigned 64-bit integer, truncating
// toward zero.
506 fcvt.fxu.trunc.s1 f10 = f10
515 // Compute a 32-bit unsigned integer modulus.
517 // From the Intel IA-64 Optimization Guide, choose the minimum latency
520 // in0 holds the dividend. in1 holds the divisor.
// Notation used below: a = f8 and b = f9 as given to frcpa,
// y0 = reciprocal estimate, q = quotient estimate, e = 1 - b*y0.
// f1 is the architectural constant 1.0.
534 // Check divide by zero.
// p7 = (in1 == 0); the complementary result targets p0 and is discarded.
535 cmp.ne.unc p0,p7=0,in1
// f10 = y0 ~= 1/b with p6 set in the normal case; p6 is cleared for
// operands frcpa treats as special cases, which skips the refinement.
542 frcpa.s1 f10, p6 = f8, f9
// f12 = q0 = a*y0
545 (p6) fmpy.s1 f12 = f8, f10
// f10 = e = 1 - b*y0 (overwrites the reciprocal estimate)
546 (p6) fnma.s1 f10 = f9, f10, f1
// f12 = q1 = q0 + e*q0
549 (p6) fma.s1 f12 = f10, f12, f12
// f10 = e^2 + f11.
// NOTE(review): f11 is presumably a small correction constant loaded
// before this sequence -- confirm its value.
550 (p6) fma.s1 f10 = f10, f10, f11
// f10 = q2 = q1 + f10*q1
552 (p6) fma.s1 f10 = f10, f12, f12
// Not predicated: converts f10 to an unsigned 64-bit integer, truncating
// toward zero.
554 fcvt.fxu.trunc.s1 f10 = f10
// Integer multiply-add: f10 = low 64 bits of (f10 * f9 + f13).
// NOTE(review): for this to be the remainder, f9 must by now hold the
// negated integer divisor and f13 the integer dividend -- confirm
// against the instructions that load them.
556 xma.l f10 = f10, f9, f13
564 #ifdef L__save_stack_nonlocal
565 // Notes on save/restore stack nonlocal: We read ar.bsp but write
566 // ar.bspstore. This is because ar.bsp can be read at all times
567 // (independent of the RSE mode) but since it's read-only we need to
568 // restore the value via ar.bspstore. This is OK because
569 // ar.bsp==ar.bspstore after executing "flushrs".
571 // void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
// Export the symbol; .proc/.endp bracket the procedure body.
575 .global __ia64_save_stack_nonlocal
576 .proc __ia64_save_stack_nonlocal
577 __ia64_save_stack_nonlocal:
// Set up a register frame with 2 input registers (one per prototype
// argument) and no local, output or rotating registers; the previous
// ar.pfs value is copied into r18.
579 alloc r18 = ar.pfs, 2, 0, 0, 0
610 .endp __ia64_save_stack_nonlocal
613 #ifdef L__nonlocal_goto
614 // void __ia64_nonlocal_goto(void *target_label, void *save_area,
615 // void *static_chain);
// Export the symbol; .proc/.endp bracket the procedure body.
619 .global __ia64_nonlocal_goto
620 .proc __ia64_nonlocal_goto
621 __ia64_nonlocal_goto:
// Set up a register frame with 3 input registers (one per prototype
// argument) and no local, output or rotating registers; the previous
// ar.pfs value is copied into r20.
623 alloc r20 = ar.pfs, 3, 0, 0, 0
// Load branch register rp from in0, the first argument (target_label in
// the prototype).  .ret and .sptk are prediction hints (return-type use,
// statically predicted taken); .L0 tags the branch that will consume rp.
625 mov.ret.sptk rp = in0, .L0
// Set the RSE backing-store pointer from r16.
// NOTE(review): r16 must already hold the saved backing-store pointer
// here -- confirm against the load that precedes this instruction.
646 mov ar.bspstore = r16
663 .endp __ia64_nonlocal_goto
666 #ifdef L__restore_stack_nonlocal
667 // This is mostly the same as nonlocal_goto above.
668 // ??? This has not been tested yet.
670 // void __ia64_restore_stack_nonlocal(void *save_area)
// Export the symbol; .proc/.endp bracket the procedure body.
674 .global __ia64_restore_stack_nonlocal
675 .proc __ia64_restore_stack_nonlocal
676 __ia64_restore_stack_nonlocal:
// Set up a register frame with 4 input registers and no local, output or
// rotating registers; the previous ar.pfs value is copied into r20.
// NOTE(review): the prototype above lists a single argument, yet four
// input registers are declared -- confirm this is intentional.
678 alloc r20 = ar.pfs, 4, 0, 0, 0
// Set the RSE backing-store pointer from r16.
// NOTE(review): r16 must already hold the saved backing-store pointer
// here -- confirm against the load that precedes this instruction.
699 mov ar.bspstore = r16
716 .endp __ia64_restore_stack_nonlocal
720 // Implement the nested function trampoline. This is out of line
721 // so that we don't have to bother with flushing the icache, as
722 // well as making the on-stack trampoline smaller.
724 // The trampoline has the following form:
726 // +-------------------+ >
727 // TRAMP: | __ia64_trampoline | |
728 // +-------------------+ > fake function descriptor
730 // +-------------------+ >
731 // | target descriptor |
732 // +-------------------+
734 // +-------------------+
// Export the shared entry point whose address occupies the first word of
// the fake function descriptor drawn above; .proc/.endp bracket its body.
738 .global __ia64_trampoline
739 .proc __ia64_trampoline
756 .endp __ia64_trampoline
760 // Thunks for backward compatibility.
// Each thunk is a plain branch (not br.call), so rp is left untouched and
// the target returns straight to the thunk's caller.  .sptk and .many
// are branch-prediction and prefetch hints only.
// Forward to __fixxfti.
768 br.sptk.many __fixxfti
// Forward to __fixunsxfti.
779 br.sptk.many __fixunsxfti
// Forward to __floattixf.
790 br.sptk.many __floattixf