1 /* From the Intel IA-64 Optimization Guide, choose the minimum latency
7 #include <shlib-compat.h>
9 #if SHLIB_COMPAT(libc, GLIBC_2_2, GLIBC_2_2_6)
12 Compute an 80-bit IEEE double-extended quotient.
13 farg0 holds the dividend. farg1 holds the divisor. */
/* ___divtf3 body: turn the frcpa reciprocal estimate into the quotient
   a / b, where a = farg0 and b = farg1.  Notation used below:
   y = estimate of 1/b, e = 1 - b*y, q = estimate of a/b, rem = a - b*q.
   f0 and f1 are the architectural constants 0.0 and 1.0.  Every
   refinement step is predicated on p6 and uses status field .s1; only
   the last fma, which writes fret0, uses .s0.  The trailing .symver
   binds ___divtf3 to the versioned symbol __divtf3@GLIBC_2.2.  */
/* p7 = 1, since r0 == r0 always holds.  */
16 cmp.eq p7, p0 = r0, r0
/* f10 = y0, the initial estimate; p6 gates all refinement steps below.
   NOTE(review): per the Itanium ISA, a cleared p6 means f10 already
   holds the final result for a special-case operand -- confirm.  */
17 frcpa.s0 f10, p6 = farg0, farg1
/* When p6 is set, clear p7 (r0 != r0 never holds), so p7 == !p6.  */
19 (p6) cmp.ne p7, p0 = r0, r0
/* Tell the assembler that p6 and p7 are never set together.  */
20 .pred.rel.mutex p6, p7
/* f11 = e0 = 1 - b*y0;  f12 = q0 = a*y0.  */
21 (p6) fnma.s1 f11 = farg1, f10, f1
22 (p6) fma.s1 f12 = farg0, f10, f0
/* f13 = e0^2;  f14 = e0 + e0^2.  */
24 (p6) fma.s1 f13 = f11, f11, f0
25 (p6) fma.s1 f14 = f11, f11, f11
/* f11 = e0 + e0^4;  f13 = y0 * (1 + e0 + e0^2).  */
27 (p6) fma.s1 f11 = f13, f13, f11
28 (p6) fma.s1 f13 = f14, f10, f10
/* f10 = y1 = y0 + f13*f11 = y0 * (1 + e0 + ... + e0^6);
   f11 = rem0 = a - b*q0.  */
30 (p6) fma.s1 f10 = f13, f11, f10
31 (p6) fnma.s1 f11 = farg1, f12, farg0
/* f11 = q1 = q0 + rem0*y1;  f12 = e1 = 1 - b*y1.  */
33 (p6) fma.s1 f11 = f11, f10, f12
34 (p6) fnma.s1 f12 = farg1, f10, f1
/* f10 = y2 = y1 + e1*y1;  f12 = rem1 = a - b*q1.  */
36 (p6) fma.s1 f10 = f12, f10, f10
37 (p6) fnma.s1 f12 = farg1, f11, farg0
/* Result under status field .s0: fret0 = q1 + rem1*y2.  */
39 (p6) fma.s0 fret0 = f12, f10, f11
43 .symver ___divtf3, __divtf3@GLIBC_2.2
46 Compute a 64-bit IEEE double quotient.
47 farg0 holds the dividend. farg1 holds the divisor. */
/* ___divdf3 body: refine the frcpa estimate into the double-precision
   quotient a / b, where a = farg0 and b = farg1.  Notation used below:
   y = estimate of 1/b, e = error term, q = estimate of a/b.
   f1 is the architectural constant 1.0.  All refinement steps are
   predicated on p6.  The trailing .symver binds ___divdf3 to the
   versioned symbol __divdf3@GLIBC_2.2.  */
/* p7 = 1, since r0 == r0 always holds.  */
50 cmp.eq p7, p0 = r0, r0
/* f10 = y0, the initial estimate; p6 gates the refinement.
   NOTE(review): per the Itanium ISA, a cleared p6 means f10 already
   holds the final result for a special-case operand -- confirm.  */
51 frcpa.s0 f10, p6 = farg0, farg1
/* When p6 is set, clear p7, so p7 == !p6.  */
53 (p6) cmp.ne p7, p0 = r0, r0
/* Tell the assembler that p6 and p7 are never set together.  */
54 .pred.rel.mutex p6, p7
/* f11 = q0 = a*y0;  f12 = e0 = 1 - b*y0.  */
55 (p6) fmpy.s1 f11 = farg0, f10
56 (p6) fnma.s1 f12 = farg1, f10, f1
/* f11 = q1 = q0 + e0*q0;  f13 = e1 = e0^2.  */
58 (p6) fma.s1 f11 = f12, f11, f11
59 (p6) fmpy.s1 f13 = f12, f12
/* f10 = y1 = y0 + e0*y0;  f11 = q2 = q1 + e1*q1.  */
61 (p6) fma.s1 f10 = f12, f10, f10
62 (p6) fma.s1 f11 = f13, f11, f11
/* f12 = e2 = e1^2;  f10 = y2 = y1 + e1*y1.  */
64 (p6) fmpy.s1 f12 = f13, f13
65 (p6) fma.s1 f10 = f13, f10, f10
/* f11 = q3 = q2 + e2*q2, computed with the .d (double) precision
   completer;  f10 = y3 = y2 + e2*y2.  */
67 (p6) fma.d.s1 f11 = f12, f11, f11
68 (p6) fma.s1 f10 = f12, f10, f10
/* f8 = rem = a - b*q3, with the .d completer.  */
70 (p6) fnma.d.s1 f8 = farg1, f11, farg0
/* fret0 = q3 + rem*y3, with the .d completer and no explicit status
   field.  NOTE(review): presumably this defaults to .s0 -- confirm.  */
72 (p6) fma.d fret0 = f8, f10, f11
77 .symver ___divdf3, __divdf3@GLIBC_2.2
80 Compute a 32-bit IEEE float quotient.
81 farg0 holds the dividend. farg1 holds the divisor. */
/* ___divsf3 body: refine the frcpa estimate into the single-precision
   quotient a / b, where a = farg0 and b = farg1.  Notation used below:
   y0 = initial estimate of 1/b, e = 1 - b*y0, q = estimate of a/b.
   f1 is the architectural constant 1.0.  All refinement steps are
   predicated on p6.  The trailing .symver binds ___divsf3 to the
   versioned symbol __divsf3@GLIBC_2.2.  */
/* p7 = 1, since r0 == r0 always holds.  */
84 cmp.eq p7, p0 = r0, r0
/* f10 = y0, the initial estimate; p6 gates the refinement.
   NOTE(review): per the Itanium ISA, a cleared p6 means f10 already
   holds the final result for a special-case operand -- confirm.  */
85 frcpa.s0 f10, p6 = farg0, farg1
/* When p6 is set, clear p7, so p7 == !p6.  */
87 (p6) cmp.ne p7, p0 = r0, r0
/* Tell the assembler that p6 and p7 are never set together.  */
88 .pred.rel.mutex p6, p7
/* f8 = q0 = a*y0;  f9 = e = 1 - b*y0.  */
89 (p6) fmpy.s1 f8 = farg0, f10
90 (p6) fnma.s1 f9 = farg1, f10, f1
/* f8 = q1 = q0 + e*q0;  f9 = e^2.  */
92 (p6) fma.s1 f8 = f9, f8, f8
93 (p6) fmpy.s1 f9 = f9, f9
/* f8 = q2 = q1 + e^2*q1;  f9 = e^4.  */
95 (p6) fma.s1 f8 = f9, f8, f8
96 (p6) fmpy.s1 f9 = f9, f9
/* f10 = q3 = q2 + e^4*q2, with the .d (double) precision completer.  */
98 (p6) fma.d.s1 f10 = f9, f8, f8
/* fret0 = q3 normalized with the .s (single) completer under .s0.  */
100 (p6) fnorm.s.s0 fret0 = f10
105 .symver ___divsf3, __divsf3@GLIBC_2.2
108 Compute a 64-bit integer quotient.
109 in0 holds the dividend. in1 holds the divisor. */
113 /* Transfer inputs to FP registers. */
117 /* Convert the inputs to FP, so that they won't be treated as
122 /* Compute the reciprocal approximation. */
/* ___divdi3 core.  frcpa takes f8 as the dividend a and f9 as the
   divisor b.  NOTE(review): the instructions that load and convert
   f8/f9 from in0/in1 are outside this block -- confirm against the
   full routine.  Notation used below: y = estimate of 1/b,
   e = 1 - b*y0, q = estimate of a/b.  f1 is the architectural
   constant 1.0.  The trailing .symver binds ___divdi3 to the versioned
   symbol __divdi3@GLIBC_2.2.  */
/* f10 = y0; the refinement steps run only when p6 is set.  */
123 frcpa.s1 f10, p6 = f8, f9
125 /* 3 Newton-Raphson iterations. */
/* f11 = e = 1 - b*y0;  f12 = q0 = a*y0.  */
126 (p6) fnma.s1 f11 = f9, f10, f1
127 (p6) fmpy.s1 f12 = f8, f10
/* f13 = e^2;  f12 = q1 = q0 + e*q0.  */
129 (p6) fmpy.s1 f13 = f11, f11
130 (p6) fma.s1 f12 = f11, f12, f12
/* f10 = y1 = y0 + e*y0;  f11 = q2 = q1 + e^2*q1.  */
132 (p6) fma.s1 f10 = f11, f10, f10
133 (p6) fma.s1 f11 = f13, f12, f12
/* f10 = y2 = y1 + e^2*y1;  f12 = rem = a - b*q2.  */
135 (p6) fma.s1 f10 = f13, f10, f10
136 (p6) fnma.s1 f12 = f9, f11, f8
/* f10 = q3 = q2 + rem*y2.  */
138 (p6) fma.s1 f10 = f12, f10, f11
140 /* Round quotient to an integer. */
/* The conversion is to a signed integer with truncation (.fx.trunc)
   and is not predicated on p6.  */
141 fcvt.fx.trunc.s1 f10 = f10
143 /* Transfer result to GP registers. */
148 .symver ___divdi3, __divdi3@GLIBC_2.2
151 Compute a 64-bit integer modulus.
152 in0 holds the dividend (a). in1 holds the divisor (b). */
156 /* Transfer inputs to FP registers. */
160 /* Convert the inputs to FP, so that they won't be treated as
165 /* Compute the reciprocal approximation. */
/* ___moddi3 core.  frcpa takes f8 as the dividend a and f9 as the
   divisor b.  NOTE(review): the instructions that load and convert
   f8/f9 from in0/in1 are outside this block -- confirm against the
   full routine.  Notation used below: y = estimate of 1/b,
   e = 1 - b*y0, q = estimate of a/b.  f1 is the architectural
   constant 1.0.  The trailing .symver binds ___moddi3 to the versioned
   symbol __moddi3@GLIBC_2.2.  */
/* f10 = y0; the refinement steps run only when p6 is set.  */
166 frcpa.s1 f10, p6 = f8, f9
168 /* 3 Newton-Raphson iterations. */
/* f12 = q0 = a*y0;  f11 = e = 1 - b*y0.  */
169 (p6) fmpy.s1 f12 = f8, f10
170 (p6) fnma.s1 f11 = f9, f10, f1
/* f12 = q1 = q0 + e*q0;  f13 = e^2.  */
172 (p6) fma.s1 f12 = f11, f12, f12
173 (p6) fmpy.s1 f13 = f11, f11
/* f10 = y1 = y0 + e*y0;  f11 = q2 = q1 + e^2*q1.  */
175 (p6) fma.s1 f10 = f11, f10, f10
176 (p6) fma.s1 f11 = f13, f12, f12
/* f10 = y2 = y1 + e^2*y1;  f12 = rem = a - b*q2.  */
179 (p6) fma.s1 f10 = f13, f10, f10
180 (p6) fnma.s1 f12 = f9, f11, f8
/* f10 = q3 = q2 + rem*y2.  */
183 (p6) fma.s1 f10 = f12, f10, f11
/* Convert the quotient to a signed integer with truncation; this step
   is not predicated on p6.  */
185 fcvt.fx.trunc.s1 f10 = f10
187 /* r = q * (-b) + a */
/* Integer multiply-add keeping the low 64 bits: f10 = f10*f9 + f14.
   NOTE(review): to match the formula above, f9 must have been reloaded
   with -b and f14 must hold a as integers; neither load is part of
   this block -- confirm against the full routine.  */
188 xma.l f10 = f10, f9, f14
190 /* Transfer result to GP registers. */
195 .symver ___moddi3, __moddi3@GLIBC_2.2
198 Compute a 64-bit unsigned integer quotient.
199 in0 holds the dividend. in1 holds the divisor. */
203 /* Transfer inputs to FP registers. */
207 /* Convert the inputs to FP, to avoid FP software-assist faults. */
211 /* Compute the reciprocal approximation. */
/* ___udivdi3 core.  frcpa takes f8 as the dividend a and f9 as the
   divisor b.  NOTE(review): the instructions that load and convert
   f8/f9 from in0/in1 are outside this block -- confirm against the
   full routine.  Notation used below: y = estimate of 1/b,
   e = 1 - b*y0, q = estimate of a/b.  f1 is the architectural
   constant 1.0.  The trailing .symver binds ___udivdi3 to the
   versioned symbol __udivdi3@GLIBC_2.2.  */
/* f10 = y0; the refinement steps run only when p6 is set.  */
212 frcpa.s1 f10, p6 = f8, f9
214 /* 3 Newton-Raphson iterations. */
/* f11 = e = 1 - b*y0;  f12 = q0 = a*y0.  */
215 (p6) fnma.s1 f11 = f9, f10, f1
216 (p6) fmpy.s1 f12 = f8, f10
/* f13 = e^2;  f12 = q1 = q0 + e*q0.  */
218 (p6) fmpy.s1 f13 = f11, f11
219 (p6) fma.s1 f12 = f11, f12, f12
/* f10 = y1 = y0 + e*y0;  f11 = q2 = q1 + e^2*q1.  */
221 (p6) fma.s1 f10 = f11, f10, f10
222 (p6) fma.s1 f11 = f13, f12, f12
/* f10 = y2 = y1 + e^2*y1;  f12 = rem = a - b*q2.  */
224 (p6) fma.s1 f10 = f13, f10, f10
225 (p6) fnma.s1 f12 = f9, f11, f8
/* f10 = q3 = q2 + rem*y2.  */
227 (p6) fma.s1 f10 = f12, f10, f11
229 /* Round quotient to an unsigned integer. */
/* The conversion uses the unsigned form (.fxu) with truncation and is
   not predicated on p6.  */
230 fcvt.fxu.trunc.s1 f10 = f10
232 /* Transfer result to GP registers. */
237 .symver ___udivdi3, __udivdi3@GLIBC_2.2
240 Compute a 64-bit unsigned integer modulus.
241 in0 holds the dividend (a). in1 holds the divisor (b). */
245 /* Transfer inputs to FP registers. */
249 /* Convert the inputs to FP, to avoid FP software-assist faults. */
253 /* Compute the reciprocal approximation. */
/* ___umoddi3 core.  frcpa takes f8 as the dividend a and f9 as the
   divisor b.  NOTE(review): the instructions that load and convert
   f8/f9 from in0/in1 are outside this block -- confirm against the
   full routine.  Notation used below: y = estimate of 1/b,
   e = 1 - b*y0, q = estimate of a/b.  f1 is the architectural
   constant 1.0.  The trailing .symver binds ___umoddi3 to the
   versioned symbol __umoddi3@GLIBC_2.2.  */
/* f10 = y0; the refinement steps run only when p6 is set.  */
254 frcpa.s1 f10, p6 = f8, f9
256 /* 3 Newton-Raphson iterations. */
/* f12 = q0 = a*y0;  f11 = e = 1 - b*y0.  */
257 (p6) fmpy.s1 f12 = f8, f10
258 (p6) fnma.s1 f11 = f9, f10, f1
/* f12 = q1 = q0 + e*q0;  f13 = e^2.  */
260 (p6) fma.s1 f12 = f11, f12, f12
261 (p6) fmpy.s1 f13 = f11, f11
/* f10 = y1 = y0 + e*y0;  f11 = q2 = q1 + e^2*q1.  */
263 (p6) fma.s1 f10 = f11, f10, f10
264 (p6) fma.s1 f11 = f13, f12, f12
/* f10 = y2 = y1 + e^2*y1;  f12 = rem = a - b*q2.  */
267 (p6) fma.s1 f10 = f13, f10, f10
268 (p6) fnma.s1 f12 = f9, f11, f8
/* f10 = q3 = q2 + rem*y2.  */
271 (p6) fma.s1 f10 = f12, f10, f11
273 /* Round quotient to an unsigned integer. */
/* The conversion uses the unsigned form (.fxu) with truncation and is
   not predicated on p6.  */
274 fcvt.fxu.trunc.s1 f10 = f10
276 /* r = q * (-b) + a */
/* Integer multiply-add keeping the low 64 bits: f10 = f10*f9 + f14.
   NOTE(review): to match the formula above, f9 must have been reloaded
   with -b and f14 must hold a as integers; neither load is part of
   this block -- confirm against the full routine.  */
277 xma.l f10 = f10, f9, f14
279 /* Transfer result to GP registers. */
284 .symver ___umoddi3, __umoddi3@GLIBC_2.2
287 Compute a 128-bit multiply of 128-bit multiplicands.
288 in0/in1 holds one multiplicand (a), in2/in3 holds the other one (b). */
/* Pieces of the ___multi3 128-bit multiply.  NOTE(review): the
   producers of f6-f11, r17, r18 and r21 are not part of this block, so
   the comments below describe only what each instruction itself does.
   The trailing .symver binds ___multi3 to the versioned symbol
   __multi3@GLIBC_2.2.  */
/* r19 = 0x00000000ffffffff, loaded as a 64-bit immediate.  */
293 movl r19 = 0xffffffff
/* Integer multiply-add keeping the low 64 bits: f11 = f10*f8 + f11.  */
324 xma.l f11 = f10, f8, f11
/* Integer multiply-add keeping the low 64 bits: f6 = f6*f7 + f9.  */
325 xma.l f6 = f6, f7, f9
/* Unsigned compare: p7 = (r18 < r17), p6 = the complement.  */
332 cmp.ltu p7, p6 = r18, r17
/* When p7 is set: r14 = r19 + 1.  NOTE(review): this equals 1 << 32 if
   r19 still holds 0xffffffff here; presumably a carry into bit 32 of
   r21 -- confirm against the full routine.  */
335 (p7) adds r14 = 1, r19
/* When p7 is set: r21 = r21 + r14.  */
337 (p7) add r21 = r21, r14
348 .symver ___multi3, __multi3@GLIBC_2.2