/* Copyright (C) 2000, 2001, 2003, 2005, 2009 Free Software Foundation, Inc.
   Contributed by James E. Wilson <wilson@cygnus.com>.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
   Boston, MA 02110-1301, USA.  */

/* As a special exception, if you link this library with other files,
   some of which are compiled with GCC, to produce an executable,
   this library does not by itself cause the resulting executable
   to be covered by the GNU General Public License.
   This exception does not however invalidate any other reasons why
   the executable file might be covered by the GNU General Public License.  */
// Compute a 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.
//
// NOTE(review): the .proc/.global directives and the function label for
// this routine fall in lines elided from this extract.

	// p7 starts true (r0 == r0 always holds); it is cleared below when
	// frcpa requests software refinement, so p7 marks the "frcpa already
	// produced the final result" path.
	cmp.eq p7, p0 = r0, r0
	// f10 = initial approximation of 1/farg1; p6 = 1 when the
	// Newton-Raphson refinement below must run.
	frcpa.s0 f10, p6 = farg0, farg1
	(p6) cmp.ne p7, p0 = r0, r0	// refinement path: clear p7
	.pred.rel.mutex p6, p7		// tell the assembler p6/p7 never both set
	// Newton-Raphson refinement, all predicated on p6.  With y0 = f10:
	(p6) fnma.s1 f11 = farg1, f10, f1	// f11 = 1 - b*y0   (error e)
	(p6) fma.s1 f12 = farg0, f10, f0	// f12 = a*y0       (quotient est. q0)
	(p6) fma.s1 f13 = f11, f11, f0		// f13 = e^2
	(p6) fma.s1 f14 = f11, f11, f11		// f14 = e^2 + e
	// Accumulate higher-order error terms and fold them into the
	// reciprocal approximation.
	(p6) fma.s1 f11 = f13, f13, f11
	(p6) fma.s1 f13 = f14, f10, f10
	(p6) fma.s1 f10 = f13, f11, f10		// refined reciprocal
	// Correct the quotient estimate with the exact remainder.
	(p6) fnma.s1 f11 = farg1, f12, farg0	// r = a - b*q0
	(p6) fma.s1 f11 = f11, f10, f12		// q = q0 + r*y
	(p6) fnma.s1 f12 = farg1, f10, f1	// remaining reciprocal error
	(p6) fma.s1 f10 = f12, f10, f10
	// Final remainder step; .s0 delivers the correctly rounded result in
	// the user-visible status field.
	(p6) fnma.s1 f12 = farg1, f11, farg0
	(p6) fma.s0 fret0 = f12, f10, f11
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// NOTE(review): the .proc/.global directives and the function label for
// this routine fall in lines elided from this extract.

	// p7 starts true; cleared below iff the refinement path (p6) runs.
	cmp.eq p7, p0 = r0, r0
	// f10 = initial approximation of 1/farg1; p6 => refine below.
	frcpa.s0 f10, p6 = farg0, farg1
	(p6) cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// Newton-Raphson refinement (y0 = f10):
	(p6) fmpy.s1 f11 = farg0, f10		// q = a*y0
	(p6) fnma.s1 f12 = farg1, f10, f1	// e = 1 - b*y0
	(p6) fma.s1 f11 = f12, f11, f11		// q = q + e*q
	(p6) fmpy.s1 f13 = f12, f12		// e^2
	(p6) fma.s1 f10 = f12, f10, f10		// y = y + e*y
	(p6) fma.s1 f11 = f13, f11, f11		// q = q + e^2*q
	(p6) fmpy.s1 f12 = f13, f13		// e^4
	(p6) fma.s1 f10 = f13, f10, f10		// y = y + e^2*y
	(p6) fma.d.s1 f11 = f12, f11, f11	// q = q + e^4*q, rounded to double
	(p6) fma.s1 f10 = f12, f10, f10		// y = y + e^4*y
	// Final correction: exact remainder in double precision, then the
	// correctly rounded double result in the user status field.
	(p6) fnma.d.s1 f8 = farg1, f11, farg0	// r = a - b*q
	(p6) fma.d fret0 = f8, f10, f11		// fret0 = q + r*y
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// NOTE(review): the .proc/.global directives and the function label for
// this routine fall in lines elided from this extract.

	// p7 starts true; cleared below iff the refinement path (p6) runs.
	cmp.eq p7, p0 = r0, r0
	// f8 = initial quotient path; f10 = approx 1/farg1; p6 => refine.
	frcpa.s0 f10, p6 = farg0, farg1
	(p6) cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// Newton-Raphson refinement of the quotient only (single precision
	// needs fewer terms than double/extended):
	(p6) fmpy.s1 f8 = farg0, f10		// q = a*y0
	(p6) fnma.s1 f9 = farg1, f10, f1	// e = 1 - b*y0
	(p6) fma.s1 f8 = f9, f8, f8		// q = q + e*q
	(p6) fmpy.s1 f9 = f9, f9		// e^2
	(p6) fma.s1 f8 = f9, f8, f8		// q = q + e^2*q
	(p6) fmpy.s1 f9 = f9, f9		// e^4
	(p6) fma.d.s1 f10 = f9, f8, f8		// q = q + e^4*q (double rounding)
	// Normalize/round to single precision in the user status field.
	(p6) fnorm.s.s0 fret0 = f10
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the .proc/.global scaffolding and several setup/teardown
// instructions are elided from this extract.

	// Transfer inputs to FP registers.
	// (setf.sig instructions elided here.)

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Convert the inputs to FP, so that they won't be treated as unsigned.
	// (fcvt.xf instructions elided here.)

	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// 3 Newton-Raphson iterations.
	(p6) fnma.s1 f11 = f9, f10, f1		// e = 1 - b*y0
	(p6) fmpy.s1 f12 = f8, f10		// q = a*y0
	(p6) fmpy.s1 f13 = f11, f11		// e^2
	(p6) fma.s1 f12 = f11, f12, f12		// q = q + e*q
	(p6) fma.s1 f10 = f11, f10, f10		// y = y + e*y
	(p6) fma.s1 f11 = f13, f12, f12		// q = q + e^2*q
	(p6) fma.s1 f10 = f13, f10, f10		// y = y + e^2*y
	(p6) fnma.s1 f12 = f9, f11, f8		// r = a - b*q
	(p6) fma.s1 f10 = f12, f10, f11		// q = q + r*y

	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10

	// Transfer result to GP registers.
	// (getf.sig and return elided here.)
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
//
// NOTE(review): the .proc/.global scaffolding and several setup/teardown
// instructions are elided from this extract.

	// Transfer inputs to FP registers.
	// (setf.sig instructions elided here.)

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Convert the inputs to FP, so that they won't be treated as unsigned.
	// (fcvt.xf instructions elided here.)

	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// 3 Newton-Raphson iterations.
	(p6) fmpy.s1 f12 = f8, f10		// q = a*y0
	(p6) fnma.s1 f11 = f9, f10, f1		// e = 1 - b*y0
	(p6) fma.s1 f12 = f11, f12, f12		// q = q + e*q
	(p6) fmpy.s1 f13 = f11, f11		// e^2
	(p6) fma.s1 f10 = f11, f10, f10		// y = y + e*y
	(p6) fma.s1 f11 = f13, f12, f12		// q = q + e^2*q
	(p6) fma.s1 f10 = f13, f10, f10		// y = y + e^2*y
	(p6) fnma.s1 f12 = f9, f11, f8		// r = a - b*q
	(p6) fma.s1 f10 = f12, f10, f11		// q = q + r*y

	fcvt.fx.trunc.s1 f10 = f10		// q = trunc(a/b), fixed point
	// Remainder: f10 = q*f9 + f14.  NOTE(review): f9 and f14 hold
	// fixed-point operands set up in elided lines (the full source
	// negates the divisor so this yields a - q*b) — confirm against the
	// complete file.
	xma.l f10 = f10, f9, f14

	// Transfer result to GP registers.
	// (getf.sig and return elided here.)
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the .proc/.global scaffolding and several setup/teardown
// instructions are elided from this extract.

	// Transfer inputs to FP registers.
	// (setf.sig instructions elided here.)

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Convert the inputs to FP, to avoid FP software-assist faults.
	// (fcvt.xuf instructions elided here.)

	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// 3 Newton-Raphson iterations.
	(p6) fnma.s1 f11 = f9, f10, f1		// e = 1 - b*y0
	(p6) fmpy.s1 f12 = f8, f10		// q = a*y0
	(p6) fmpy.s1 f13 = f11, f11		// e^2
	(p6) fma.s1 f12 = f11, f12, f12		// q = q + e*q
	(p6) fma.s1 f10 = f11, f10, f10		// y = y + e*y
	(p6) fma.s1 f11 = f13, f12, f12		// q = q + e^2*q
	(p6) fma.s1 f10 = f13, f10, f10		// y = y + e^2*y
	(p6) fnma.s1 f12 = f9, f11, f8		// r = a - b*q
	(p6) fma.s1 f10 = f12, f10, f11		// q = q + r*y

	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10

	// Transfer result to GP registers.
	// (getf.sig and return elided here.)
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
//
// NOTE(review): the .proc/.global scaffolding and several setup/teardown
// instructions are elided from this extract.

	// Transfer inputs to FP registers.
	// (setf.sig instructions elided here.)

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Convert the inputs to FP, to avoid FP software assist faults.
	// (fcvt.xuf instructions elided here.)

	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// 3 Newton-Raphson iterations.
	(p6) fmpy.s1 f12 = f8, f10		// q = a*y0
	(p6) fnma.s1 f11 = f9, f10, f1		// e = 1 - b*y0
	(p6) fma.s1 f12 = f11, f12, f12		// q = q + e*q
	(p6) fmpy.s1 f13 = f11, f11		// e^2
	(p6) fma.s1 f10 = f11, f10, f10		// y = y + e*y
	(p6) fma.s1 f11 = f13, f12, f12		// q = q + e^2*q
	(p6) fma.s1 f10 = f13, f10, f10		// y = y + e^2*y
	(p6) fnma.s1 f12 = f9, f11, f8		// r = a - b*q
	(p6) fma.s1 f10 = f12, f10, f11		// q = q + r*y

	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10

	// Remainder: f10 = q*f9 + f14.  NOTE(review): f9 and f14 hold
	// fixed-point operands set up in elided lines (the full source
	// negates the divisor so this yields a - q*b) — confirm against the
	// complete file.
	xma.l f10 = f10, f9, f14

	// Transfer result to GP registers.
	// (getf.sig and return elided here.)
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the .proc/.global scaffolding, the sign-extension and
// setf/fcvt setup, and the result transfer are elided from this extract.

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Compute the reciprocal approximation (f8/f9 = FP copies of the
	// inputs, set up in elided lines).
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// Shortened Newton-Raphson: 32-bit operands need less precision.
	(p6) fmpy.s1 f8 = f8, f10		// q = a*y0
	(p6) fnma.s1 f9 = f9, f10, f1		// e = 1 - b*y0
	(p6) fma.s1 f8 = f9, f8, f8		// q = q + e*q
	// NOTE(review): f11 is loaded in elided lines (a small exponent
	// constant via setf.exp in the full source) — confirm.
	(p6) fma.s1 f9 = f9, f9, f11		// e^2 + rounding correction
	(p6) fma.s1 f10 = f9, f8, f8		// q = q + (e^2+c)*q

	fcvt.fx.trunc.s1 f10 = f10		// round quotient to integer
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the .proc/.global scaffolding, the sign-extension and
// setf/fcvt setup, and the result transfer are elided from this extract.

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Compute the reciprocal approximation (f8/f9 = FP copies of the
	// inputs, set up in elided lines).
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// Shortened Newton-Raphson: 32-bit operands need less precision.
	(p6) fmpy.s1 f12 = f8, f10		// q = a*y0
	(p6) fnma.s1 f10 = f9, f10, f1		// e = 1 - b*y0
	(p6) fma.s1 f12 = f10, f12, f12		// q = q + e*q
	// NOTE(review): f11 is loaded in elided lines (a small exponent
	// constant via setf.exp in the full source) — confirm.
	(p6) fma.s1 f10 = f10, f10, f11		// e^2 + rounding correction
	(p6) fma.s1 f10 = f10, f12, f12		// q = q + (e^2+c)*q

	fcvt.fx.trunc.s1 f10 = f10		// q = trunc(a/b), fixed point
	// Remainder: f10 = q*f9 + f13.  NOTE(review): f13 set up in elided
	// lines (holds the dividend with negated divisor in the full source,
	// yielding a - q*b) — confirm.
	xma.l f10 = f10, f9, f13
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the .proc/.global scaffolding, the zero-extension and
// setf/fcvt setup, and the result transfer are elided from this extract.

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Compute the reciprocal approximation (f8/f9 = FP copies of the
	// inputs, set up in elided lines).
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// Shortened Newton-Raphson: 32-bit operands need less precision.
	(p6) fmpy.s1 f8 = f8, f10		// q = a*y0
	(p6) fnma.s1 f9 = f9, f10, f1		// e = 1 - b*y0
	(p6) fma.s1 f8 = f9, f8, f8		// q = q + e*q
	// NOTE(review): f11 is loaded in elided lines (a small exponent
	// constant via setf.exp in the full source) — confirm.
	(p6) fma.s1 f9 = f9, f9, f11		// e^2 + rounding correction
	(p6) fma.s1 f10 = f9, f8, f8		// q = q + (e^2+c)*q

	fcvt.fxu.trunc.s1 f10 = f10	// round quotient to unsigned integer
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the .proc/.global scaffolding, the zero-extension and
// setf/fcvt setup, and the result transfer are elided from this extract.

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Compute the reciprocal approximation (f8/f9 = FP copies of the
	// inputs, set up in elided lines).
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// Shortened Newton-Raphson: 32-bit operands need less precision.
	(p6) fmpy.s1 f12 = f8, f10		// q = a*y0
	(p6) fnma.s1 f10 = f9, f10, f1		// e = 1 - b*y0
	(p6) fma.s1 f12 = f10, f12, f12		// q = q + e*q
	// NOTE(review): f11 is loaded in elided lines (a small exponent
	// constant via setf.exp in the full source) — confirm.
	(p6) fma.s1 f10 = f10, f10, f11		// e^2 + rounding correction
	(p6) fma.s1 f10 = f10, f12, f12		// q = q + (e^2+c)*q

	fcvt.fxu.trunc.s1 f10 = f10	// q = trunc(a/b), fixed point
	// Remainder: f10 = q*f9 + f13.  NOTE(review): f13 set up in elided
	// lines (holds the dividend with negated divisor in the full source,
	// yielding a - q*b) — confirm.
	xma.l f10 = f10, f9, f13
#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)

	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	// Fixed register frame: 2 inputs, no locals/outputs; r18 keeps the
	// previous ar.pfs.
	alloc r18 = ar.pfs, 2, 0, 0, 0
	// NOTE(review): the function body and the matching #endif are elided
	// from this extract.
	.endp __ia64_save_stack_nonlocal
#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);

	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	alloc r20 = ar.pfs, 3, 0, 0, 0	// 3 inputs; r20 = previous ar.pfs
	// Move the target label (in0) into rp with a return hint; .L0 is the
	// hinted branch site (label defined in elided lines).
	mov.ret.sptk rp = in0, .L0
	// ... (intervening instructions elided from this extract) ...
	mov ar.bspstore = r16		// restore the RSE backing-store pointer
	// ... (remainder of the body and #endif elided) ...
	.endp __ia64_nonlocal_goto
#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)

	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
	alloc r20 = ar.pfs, 4, 0, 0, 0	// r20 = previous ar.pfs
	// ... (intervening instructions elided from this extract) ...
	mov ar.bspstore = r16		// restore the RSE backing-store pointer
	// ... (remainder of the body and #endif elided) ...
	.endp __ia64_restore_stack_nonlocal
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form (diagram partially elided from
// this extract):
//
//        +-------------------+ >
// TRAMP: | __ia64_trampoline | |
//        +-------------------+ > fake function descriptor
//        +-------------------+ >
//        | target descriptor |
//        +-------------------+
//        +-------------------+

	.global __ia64_trampoline
	.proc __ia64_trampoline
	// NOTE(review): the trampoline label and body are elided from this
	// extract.
	.endp __ia64_trampoline
// Thunks for backward compatibility: each old entry point tail-branches to
// the current libgcc implementation.
// NOTE(review): the .global/.proc/label scaffolding for each thunk is
// elided from this extract; only the branch instructions are visible.
	br.sptk.many __fixxfti
	br.sptk.many __fixunsxfti
	br.sptk.many __floattixf