// Compute a 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.
//
// NOTE(review): the .proc/label prologue and the (p7) fallback +
// br.ret epilogue of this routine are elided from this excerpt.

        // p7 <- true.  frcpa produces y0 ~= 1/farg1 in f10 and sets p6,
        // unless the operands need software assistance (then p6 is
        // cleared and p7 stays set for the assist path).
        cmp.eq p7, p0 = r0, r0
        frcpa.s0 f10, p6 = farg0, farg1
        ;;
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
        // e0 = 1 - b*y0, q0 = a*y0
(p6)    fnma.s1 f11 = farg1, f10, f1
(p6)    fma.s1 f12 = farg0, f10, f0
        ;;
        // Newton-Raphson refinement of the reciprocal using powers of e0.
(p6)    fma.s1 f13 = f11, f11, f0
(p6)    fma.s1 f14 = f11, f11, f11
        ;;
(p6)    fma.s1 f11 = f13, f13, f11
(p6)    fma.s1 f13 = f14, f10, f10
        ;;
(p6)    fma.s1 f10 = f13, f11, f10
        // Residual a - b*q0 feeds the quotient correction.
(p6)    fnma.s1 f11 = farg1, f12, farg0
        ;;
(p6)    fma.s1 f11 = f11, f10, f12
(p6)    fnma.s1 f12 = farg1, f10, f1
        ;;
(p6)    fma.s1 f10 = f12, f10, f10
(p6)    fnma.s1 f12 = farg1, f11, farg0
        ;;
        // Final fma in .s0 delivers the rounded quotient in fret0.
(p6)    fma.s0 fret0 = f12, f10, f11
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// NOTE(review): the .proc/label prologue and the (p7) fallback +
// br.ret epilogue of this routine are elided from this excerpt.

        // p7 <- true; frcpa gives y0 ~= 1/farg1 and sets p6 unless
        // software assistance is required.
        cmp.eq p7, p0 = r0, r0
        frcpa.s0 f10, p6 = farg0, farg1
        ;;
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
        // q0 = a*y0, e0 = 1 - b*y0
(p6)    fmpy.s1 f11 = farg0, f10
(p6)    fnma.s1 f12 = farg1, f10, f1
        ;;
        // Refine quotient and reciprocal with e0, then e0^2, then e0^4.
(p6)    fma.s1 f11 = f12, f11, f11
(p6)    fmpy.s1 f13 = f12, f12
        ;;
(p6)    fma.s1 f10 = f12, f10, f10
(p6)    fma.s1 f11 = f13, f11, f11
        ;;
(p6)    fmpy.s1 f12 = f13, f13
(p6)    fma.s1 f10 = f13, f10, f10
        ;;
        // .d completers round these results to double precision.
(p6)    fma.d.s1 f11 = f12, f11, f11
(p6)    fma.s1 f10 = f12, f10, f10
        ;;
        // Final correction: fret0 = (a - b*q)*y + q, rounded to double.
(p6)    fnma.d.s1 f8 = farg1, f11, farg0
        ;;
(p6)    fma.d fret0 = f8, f10, f11
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// NOTE(review): the .proc/label prologue and the (p7) fallback +
// br.ret epilogue of this routine are elided from this excerpt.

        // p7 <- true; frcpa gives y0 ~= 1/farg1 and sets p6 unless
        // software assistance is required.
        cmp.eq p7, p0 = r0, r0
        frcpa.s0 f10, p6 = farg0, farg1
        ;;
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
        // q0 = a*y0, e0 = 1 - b*y0
(p6)    fmpy.s1 f8 = farg0, f10
(p6)    fnma.s1 f9 = farg1, f10, f1
        ;;
        // Two refinement steps: first with e, then with e^2.
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fmpy.s1 f9 = f9, f9
        ;;
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fmpy.s1 f9 = f9, f9
        ;;
(p6)    fma.d.s1 f10 = f9, f8, f8
        ;;
        // Normalize/round the result to single precision.
(p6)    fnorm.s.s0 fret0 = f10
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the prologue, the setf/fcvt instructions that belong
// under the transfer/convert comments below, and the getf/return
// epilogue are elided from this excerpt.

        // Transfer inputs to FP registers.
        // Convert the inputs to FP, so that they won't be treated as unsigned.
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
(p6)    fnma.s1 f11 = f9, f10, f1
(p6)    fmpy.s1 f12 = f8, f10
        ;;
(p6)    fmpy.s1 f13 = f11, f11
(p6)    fma.s1 f12 = f11, f12, f12
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an integer.
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        // Transfer result to GP registers.
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
//
// NOTE(review): the prologue, the setf/fcvt transfer and conversion
// code, and the getf/return epilogue are elided from this excerpt.
// The contents of f9 and f14 at the final xma.l come from that elided
// setup -- verify against the full source.

        // Transfer inputs to FP registers.
        // Convert the inputs to FP, so that they won't be treated as unsigned.
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f11 = f9, f10, f1
        ;;
(p6)    fma.s1 f12 = f11, f12, f12
(p6)    fmpy.s1 f13 = f11, f11
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient toward zero, then rebuild the remainder with a
        // fixed-point multiply-add: f10 = q*f9 + f14.
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        xma.l f10 = f10, f9, f14
        ;;
        // Transfer result to GP registers.
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the prologue, the setf/fcvt transfer and conversion
// code, and the getf/return epilogue are elided from this excerpt.

        // Transfer inputs to FP registers.
        // Convert the inputs to FP, to avoid FP software-assist faults.
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
(p6)    fnma.s1 f11 = f9, f10, f1
(p6)    fmpy.s1 f12 = f8, f10
        ;;
(p6)    fmpy.s1 f13 = f11, f11
(p6)    fma.s1 f12 = f11, f12, f12
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        // Transfer result to GP registers.
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
//
// NOTE(review): the prologue, the setf/fcvt transfer and conversion
// code, and the getf/return epilogue are elided from this excerpt.
// The contents of f9 and f14 at the final xma.l come from that elided
// setup -- verify against the full source.

        // Transfer inputs to FP registers.
        // Convert the inputs to FP, to avoid FP software assist faults.
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f11 = f9, f10, f1
        ;;
(p6)    fma.s1 f12 = f11, f12, f12
(p6)    fmpy.s1 f13 = f11, f11
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an unsigned integer, then rebuild the
        // remainder with a fixed-point multiply-add: f10 = q*f9 + f14.
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        xma.l f10 = f10, f9, f14
        ;;
        // Transfer result to GP registers.
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the prologue and setup (operand transfer/conversion and
// the constant placed in f11 that is used below), plus the getf/return
// epilogue, are elided from this excerpt.

        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // q0 = a*y0, e0 = 1 - b*y0
(p6)    fmpy.s1 f8 = f8, f10
(p6)    fnma.s1 f9 = f9, f10, f1
        ;;
        // One refinement step; f11 comes from the elided setup code.
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fma.s1 f9 = f9, f9, f11
        ;;
(p6)    fma.s1 f10 = f9, f8, f8
        ;;
        // Round quotient toward zero.
        fcvt.fx.trunc.s1 f10 = f10
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the prologue and setup (operand transfer/conversion and
// the values placed in f11 and f13 that are used below), plus the
// getf/return epilogue, are elided from this excerpt.

        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // q0 = a*y0, e0 = 1 - b*y0
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f10 = f9, f10, f1
        ;;
        // One refinement step; f11 comes from the elided setup code.
(p6)    fma.s1 f12 = f10, f12, f12
(p6)    fma.s1 f10 = f10, f10, f11
        ;;
(p6)    fma.s1 f10 = f10, f12, f12
        ;;
        // Round quotient toward zero, then rebuild the remainder with a
        // fixed-point multiply-add: f10 = q*f9 + f13.
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        xma.l f10 = f10, f9, f13
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the prologue and setup (operand transfer/conversion and
// the constant placed in f11 that is used below), plus the getf/return
// epilogue, are elided from this excerpt.

        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // q0 = a*y0, e0 = 1 - b*y0
(p6)    fmpy.s1 f8 = f8, f10
(p6)    fnma.s1 f9 = f9, f10, f1
        ;;
        // One refinement step; f11 comes from the elided setup code.
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fma.s1 f9 = f9, f9, f11
        ;;
(p6)    fma.s1 f10 = f9, f8, f8
        ;;
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc.s1 f10 = f10
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the prologue and setup (operand transfer/conversion and
// the values placed in f11 and f13 that are used below), plus the
// getf/return epilogue, are elided from this excerpt.

        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // q0 = a*y0, e0 = 1 - b*y0
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f10 = f9, f10, f1
        ;;
        // One refinement step; f11 comes from the elided setup code.
(p6)    fma.s1 f12 = f10, f12, f12
(p6)    fma.s1 f10 = f10, f10, f11
        ;;
(p6)    fma.s1 f10 = f10, f12, f12
        ;;
        // Round quotient to an unsigned integer, then rebuild the
        // remainder with a fixed-point multiply-add: f10 = q*f9 + f13.
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        xma.l f10 = f10, f9, f13
#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)

        .global __ia64_save_stack_nonlocal
        .proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
        // Two incoming GP args, no locals/outputs in the register frame.
        alloc r18 = ar.pfs, 2, 0, 0, 0
        // NOTE(review): the rest of the body and the matching #endif are
        // elided from this excerpt.
        .endp __ia64_save_stack_nonlocal
#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//                           void *static_chain);

        .global __ia64_nonlocal_goto
        .proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
        // Three incoming GP args, no locals/outputs in the register frame.
        alloc r20 = ar.pfs, 3, 0, 0, 0
        // Return branch will go to the target label (in0) via .L0.
        mov.ret.sptk rp = in0, .L0
        // NOTE(review): most of the body, the .L0 label, and the matching
        // #endif are elided from this excerpt.
        mov ar.bspstore = r16
        .endp __ia64_nonlocal_goto
#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)

        .global __ia64_restore_stack_nonlocal
        .proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
        alloc r20 = ar.pfs, 4, 0, 0, 0
        // NOTE(review): most of the body and the matching #endif are
        // elided from this excerpt.
        mov ar.bspstore = r16
        .endp __ia64_restore_stack_nonlocal
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//   +-------------------+ >
//   TRAMP: | __ia64_trampoline | |
//   +-------------------+ > fake function descriptor
//   +-------------------+ >
//   | target descriptor |
//   +-------------------+
//   +-------------------+
//
// NOTE(review): parts of the layout diagram and the entire body of the
// routine between .proc and .endp are elided from this excerpt.

        .global __ia64_trampoline
        .proc __ia64_trampoline
        // (body elided)
        .endp __ia64_trampoline
// Thunks for backward compatibility.
//
// NOTE(review): the .proc/.endp wrappers and the entry labels for these
// tail branches are elided from this excerpt; each thunk just branches
// to the corresponding conversion routine.

        br.sptk.many __fixxfti
        br.sptk.many __fixunsxfti
        br.sptk.many __floattixf