localedata: dz_BT, bo_CN: convert to UTF-8
[glibc.git] / sysdeps / ia64 / fpu / s_asinhl.S
blob30c5acb1a13e70ce458fab27658d4d80cc658df2
1 .file "asinhl.s"
4 // Copyright (c) 2000 - 2003, Intel Corporation
5 // All rights reserved.
6 //
7 //
8 // Redistribution and use in source and binary forms, with or without
9 // modification, are permitted provided that the following conditions are
10 // met:
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
15 // * Redistributions in binary form must reproduce the above copyright
16 // notice, this list of conditions and the following disclaimer in the
17 // documentation and/or other materials provided with the distribution.
19 // * The name of Intel Corporation may not be used to endorse or promote
20 // products derived from this software without specific prior written
21 // permission.
23 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 // Intel Corporation is the author of this code, and requests that all
36 // problem reports or change requests be submitted to it directly at
37 // http://www.intel.com/software/products/opensource/libraries/num.htm.
39 //*********************************************************************
41 // History:
42 // 09/04/01 Initial version
43 // 09/13/01 Performance improved, symmetry problems fixed
44 // 10/10/01 Performance improved, split issues removed
45 // 12/11/01 Changed huges_logp to not be global
46 // 05/20/02 Cleaned up namespace and sf0 syntax
47 // 02/10/03 Reordered header: .section, .global, .proc, .align;
48 //          used data8 for long double table values
50 //*********************************************************************
52 // API
53 //==============================================================
54 // long double asinhl(long double);
56 // Overview of operation
57 //==============================================================
59 // There are 6 paths:
60 // 1. x = 0, [S,Q]Nan or +/-INF
61 //    Return asinhl(x) = x + x;
63 // 2. x = + denormal
64 //    Return asinhl(x) = x - x^2;
66 // 3. x = - denormal
67 //    Return asinhl(x) = x + x^2;
69 // 4. 'Near 0': max denormal < |x| < 1/128
70 //    Return asinhl(x) = sign(x)*(x+x^3*(c3+x^2*(c5+x^2*(c7+x^2*(c9)))));
72 // 5. 'Huges': |x| > 2^63
73 //    Return asinhl(x) = sign(x)*(logl(2*x));
75 // 6. 'Main path': 1/128 < |x| < 2^63
76 //    b_hi + b_lo = x + sqrt(x^2 + 1);
77 //    asinhl(x) = sign(x)*(log_special(b_hi, b_lo));
79 // Algorithm description
80 //==============================================================
82 // Main path algorithm
83 // ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! )
84 // *************************************************************************
86 // There are 3 parts of x+sqrt(x^2+1) computation:
88 //  1) p2 = (p2_hi+p2_lo) = x^2+1 obtaining
89 //     ------------------------------------
90 //     p2_hi = x2_hi + 1, where x2_hi = x * x;
91 //     p2_lo = x2_lo + p1_lo, where
92 //                            x2_lo = FMS(x*x-x2_hi),
93 //                            p1_lo = (1 - p2_hi) + x2_hi;
95 //  2) g = (g_hi+g_lo) = sqrt(p2) = sqrt(p2_hi+p2_lo)
96 //     ----------------------------------------------
97 //     r = invsqrt(p2_hi) (8-bit reciprocal square root approximation);
98 //     g = p2_hi * r (first 8 bit-approximation of sqrt);
100 //     h = 0.5 * r;
101 //     e = 0.5 - g * h;
102 //     g = g * e + g (second 16 bit-approximation of sqrt);
104 //     h = h * e + h;
105 //     e = 0.5 - g * h;
106 //     g = g * e + g (third 32 bit-approximation of sqrt);
108 //     h = h * e + h;
109 //     e = 0.5 - g * h;
110 //     g_hi = g * e + g (fourth 64 bit-approximation of sqrt);
112 //     Remainder computation:
113 //     h = h * e + h;
114 //     d = (p2_hi - g_hi * g_hi) + p2_lo;
115 //     g_lo = d * h;
117 //  3) b = (b_hi + b_lo) = x + g, where g = (g_hi + g_lo) = sqrt(x^2+1)
118 //     -------------------------------------------------------------------
119 //     b_hi = (g_hi + x) + gl;
120 //     b_lo = (g_hi - b_hi) + x + gl;
122 //  Now we pass b presented as sum b_hi + b_lo to special version
123 //  of logl function which accept a pair of arguments as
124 //  'mutiprecision' value.
126 //  Special log algorithm overview
127 //  ================================
128 //   Here we use a table lookup method. The basic idea is that in
129 //   order to compute logl(Arg) = logl (Arg-1) for an argument Arg in [1,2),
130 //   we construct a value G such that G*Arg is close to 1 and that
131 //   logl(1/G) is obtainable easily from a table of values calculated
132 //   beforehand. Thus
134 //      logl(Arg) = logl(1/G) + logl((G*Arg - 1))
136 //   Because |G*Arg - 1| is small, the second term on the right hand
137 //   side can be approximated by a short polynomial. We elaborate
138 //   this method in four steps.
140 //   Step 0: Initialization
142 //   We need to calculate logl( X ). Obtain N, S_hi such that
144 //      X = 2^N * ( S_hi + S_lo )   exactly
146 //   where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
147 //   that |S_lo| <= ulp(S_hi).
149 //   For the special version of logl: S_lo = b_lo
150 //   !-----------------------------------------------!
152 //   Step 1: Argument Reduction
154 //   Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
156 //      G := G_1 * G_2 * G_3
157 //      r := (G * S_hi - 1) + G * S_lo
159 //   These G_j's have the property that the product is exactly
160 //   representable and that |r| < 2^(-12) as a result.
162 //   Step 2: Approximation
164 //   logl(1 + r) is approximated by a short polynomial poly(r).
166 //   Step 3: Reconstruction
168 //   Finally,
170 //   logl( X )   =   logl( 2^N * (S_hi + S_lo) )
171 //                 ~=~  N*logl(2) + logl(1/G) + logl(1 + r)
172 //                 ~=~  N*logl(2) + logl(1/G) + poly(r).
174 //   For detailed description see logl or log1pl function, regular path.
176 // Registers used
177 //==============================================================
178 // Floating Point registers used:
179 // f8, input
180 // f32 -> f101 (70 registers)
182 // General registers used:
183 // r32 -> r57 (26 registers)
185 // Predicate registers used:
186 // p6 -> p11
187 // p6  for '0, NaNs, Inf' path
188 // p7  for '+ denormals' path
189 // p8  for 'near 0' path
190 // p9  for 'huges' path
191 // p10 for '- denormals' path
192 // p11 for negative values
194 // Data tables
195 //==============================================================
197 RODATA
198 .align 64
200 // C7, C9 'near 0' polynomial coefficients
201 LOCAL_OBJECT_START(Poly_C_near_0_79)
202 data8 0xF8DC939BBEDD5A54, 0x00003FF9
203 data8 0xB6DB6DAB21565AC5, 0x0000BFFA
204 LOCAL_OBJECT_END(Poly_C_near_0_79)
206 // C3, C5 'near 0' polynomial coefficients
207 LOCAL_OBJECT_START(Poly_C_near_0_35)
208 data8 0x999999999991D582, 0x00003FFB
209 data8 0xAAAAAAAAAAAAAAA9, 0x0000BFFC
210 LOCAL_OBJECT_END(Poly_C_near_0_35)
212 // Q coeffs
213 LOCAL_OBJECT_START(Constants_Q)
214 data4  0x00000000,0xB1721800,0x00003FFE,0x00000000
215 data4  0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
216 data4  0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
217 data4  0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
218 data4  0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
219 data4  0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
220 LOCAL_OBJECT_END(Constants_Q)
222 // Z1 - 16 bit fixed
223 LOCAL_OBJECT_START(Constants_Z_1)
224 data4  0x00008000
225 data4  0x00007879
226 data4  0x000071C8
227 data4  0x00006BCB
228 data4  0x00006667
229 data4  0x00006187
230 data4  0x00005D18
231 data4  0x0000590C
232 data4  0x00005556
233 data4  0x000051EC
234 data4  0x00004EC5
235 data4  0x00004BDB
236 data4  0x00004925
237 data4  0x0000469F
238 data4  0x00004445
239 data4  0x00004211
240 LOCAL_OBJECT_END(Constants_Z_1)
242 // G1 and H1 - IEEE single and h1 - IEEE double
243 LOCAL_OBJECT_START(Constants_G_H_h1)
244 data4  0x3F800000,0x00000000
245 data8  0x0000000000000000
246 data4  0x3F70F0F0,0x3D785196
247 data8  0x3DA163A6617D741C
248 data4  0x3F638E38,0x3DF13843
249 data8  0x3E2C55E6CBD3D5BB
250 data4  0x3F579430,0x3E2FF9A0
251 data8  0xBE3EB0BFD86EA5E7
252 data4  0x3F4CCCC8,0x3E647FD6
253 data8  0x3E2E6A8C86B12760
254 data4  0x3F430C30,0x3E8B3AE7
255 data8  0x3E47574C5C0739BA
256 data4  0x3F3A2E88,0x3EA30C68
257 data8  0x3E20E30F13E8AF2F
258 data4  0x3F321640,0x3EB9CEC8
259 data8  0xBE42885BF2C630BD
260 data4  0x3F2AAAA8,0x3ECF9927
261 data8  0x3E497F3497E577C6
262 data4  0x3F23D708,0x3EE47FC5
263 data8  0x3E3E6A6EA6B0A5AB
264 data4  0x3F1D89D8,0x3EF8947D
265 data8  0xBDF43E3CD328D9BE
266 data4  0x3F17B420,0x3F05F3A1
267 data8  0x3E4094C30ADB090A
268 data4  0x3F124920,0x3F0F4303
269 data8  0xBE28FBB2FC1FE510
270 data4  0x3F0D3DC8,0x3F183EBF
271 data8  0x3E3A789510FDE3FA
272 data4  0x3F088888,0x3F20EC80
273 data8  0x3E508CE57CC8C98F
274 data4  0x3F042108,0x3F29516A
275 data8  0xBE534874A223106C
276 LOCAL_OBJECT_END(Constants_G_H_h1)
278 // Z2 - 16 bit fixed
279 LOCAL_OBJECT_START(Constants_Z_2)
280 data4  0x00008000
281 data4  0x00007F81
282 data4  0x00007F02
283 data4  0x00007E85
284 data4  0x00007E08
285 data4  0x00007D8D
286 data4  0x00007D12
287 data4  0x00007C98
288 data4  0x00007C20
289 data4  0x00007BA8
290 data4  0x00007B31
291 data4  0x00007ABB
292 data4  0x00007A45
293 data4  0x000079D1
294 data4  0x0000795D
295 data4  0x000078EB
296 LOCAL_OBJECT_END(Constants_Z_2)
298 // G2 and H2 - IEEE single and h2 - IEEE double
299 LOCAL_OBJECT_START(Constants_G_H_h2)
300 data4  0x3F800000,0x00000000
301 data8  0x0000000000000000
302 data4  0x3F7F00F8,0x3B7F875D
303 data8  0x3DB5A11622C42273
304 data4  0x3F7E03F8,0x3BFF015B
305 data8  0x3DE620CF21F86ED3
306 data4  0x3F7D08E0,0x3C3EE393
307 data8  0xBDAFA07E484F34ED
308 data4  0x3F7C0FC0,0x3C7E0586
309 data8  0xBDFE07F03860BCF6
310 data4  0x3F7B1880,0x3C9E75D2
311 data8  0x3DEA370FA78093D6
312 data4  0x3F7A2328,0x3CBDC97A
313 data8  0x3DFF579172A753D0
314 data4  0x3F792FB0,0x3CDCFE47
315 data8  0x3DFEBE6CA7EF896B
316 data4  0x3F783E08,0x3CFC15D0
317 data8  0x3E0CF156409ECB43
318 data4  0x3F774E38,0x3D0D874D
319 data8  0xBE0B6F97FFEF71DF
320 data4  0x3F766038,0x3D1CF49B
321 data8  0xBE0804835D59EEE8
322 data4  0x3F757400,0x3D2C531D
323 data8  0x3E1F91E9A9192A74
324 data4  0x3F748988,0x3D3BA322
325 data8  0xBE139A06BF72A8CD
326 data4  0x3F73A0D0,0x3D4AE46F
327 data8  0x3E1D9202F8FBA6CF
328 data4  0x3F72B9D0,0x3D5A1756
329 data8  0xBE1DCCC4BA796223
330 data4  0x3F71D488,0x3D693B9D
331 data8  0xBE049391B6B7C239
332 LOCAL_OBJECT_END(Constants_G_H_h2)
334 // G3 and H3 - IEEE single and h3 - IEEE double
335 LOCAL_OBJECT_START(Constants_G_H_h3)
336 data4  0x3F7FFC00,0x38800100
337 data8  0x3D355595562224CD
338 data4  0x3F7FF400,0x39400480
339 data8  0x3D8200A206136FF6
340 data4  0x3F7FEC00,0x39A00640
341 data8  0x3DA4D68DE8DE9AF0
342 data4  0x3F7FE400,0x39E00C41
343 data8  0xBD8B4291B10238DC
344 data4  0x3F7FDC00,0x3A100A21
345 data8  0xBD89CCB83B1952CA
346 data4  0x3F7FD400,0x3A300F22
347 data8  0xBDB107071DC46826
348 data4  0x3F7FCC08,0x3A4FF51C
349 data8  0x3DB6FCB9F43307DB
350 data4  0x3F7FC408,0x3A6FFC1D
351 data8  0xBD9B7C4762DC7872
352 data4  0x3F7FBC10,0x3A87F20B
353 data8  0xBDC3725E3F89154A
354 data4  0x3F7FB410,0x3A97F68B
355 data8  0xBD93519D62B9D392
356 data4  0x3F7FAC18,0x3AA7EB86
357 data8  0x3DC184410F21BD9D
358 data4  0x3F7FA420,0x3AB7E101
359 data8  0xBDA64B952245E0A6
360 data4  0x3F7F9C20,0x3AC7E701
361 data8  0x3DB4B0ECAABB34B8
362 data4  0x3F7F9428,0x3AD7DD7B
363 data8  0x3D9923376DC40A7E
364 data4  0x3F7F8C30,0x3AE7D474
365 data8  0x3DC6E17B4F2083D3
366 data4  0x3F7F8438,0x3AF7CBED
367 data8  0x3DAE314B811D4394
368 data4  0x3F7F7C40,0x3B03E1F3
369 data8  0xBDD46F21B08F2DB1
370 data4  0x3F7F7448,0x3B0BDE2F
371 data8  0xBDDC30A46D34522B
372 data4  0x3F7F6C50,0x3B13DAAA
373 data8  0x3DCB0070B1F473DB
374 data4  0x3F7F6458,0x3B1BD766
375 data8  0xBDD65DDC6AD282FD
376 data4  0x3F7F5C68,0x3B23CC5C
377 data8  0xBDCDAB83F153761A
378 data4  0x3F7F5470,0x3B2BC997
379 data8  0xBDDADA40341D0F8F
380 data4  0x3F7F4C78,0x3B33C711
381 data8  0x3DCD1BD7EBC394E8
382 data4  0x3F7F4488,0x3B3BBCC6
383 data8  0xBDC3532B52E3E695
384 data4  0x3F7F3C90,0x3B43BAC0
385 data8  0xBDA3961EE846B3DE
386 data4  0x3F7F34A0,0x3B4BB0F4
387 data8  0xBDDADF06785778D4
388 data4  0x3F7F2CA8,0x3B53AF6D
389 data8  0x3DCC3ED1E55CE212
390 data4  0x3F7F24B8,0x3B5BA620
391 data8  0xBDBA31039E382C15
392 data4  0x3F7F1CC8,0x3B639D12
393 data8  0x3D635A0B5C5AF197
394 data4  0x3F7F14D8,0x3B6B9444
395 data8  0xBDDCCB1971D34EFC
396 data4  0x3F7F0CE0,0x3B7393BC
397 data8  0x3DC7450252CD7ADA
398 data4  0x3F7F04F0,0x3B7B8B6D
399 data8  0xBDB68F177D7F2A42
400 LOCAL_OBJECT_END(Constants_G_H_h3)
402 // Assembly macros
403 //==============================================================
405 // Floating Point Registers
407 FR_Arg          = f8
408 FR_Res          = f8
409 FR_AX           = f32
410 FR_XLog_Hi      = f33
411 FR_XLog_Lo      = f34
413     // Special logl registers
414 FR_Y_hi         = f35
415 FR_Y_lo         = f36
417 FR_Scale        = f37
418 FR_X_Prime      = f38
419 FR_S_hi         = f39
420 FR_W            = f40
421 FR_G            = f41
423 FR_H            = f42
424 FR_wsq          = f43
425 FR_w4           = f44
426 FR_h            = f45
427 FR_w6           = f46
429 FR_G2           = f47
430 FR_H2           = f48
431 FR_poly_lo      = f49
432 FR_P8           = f50
433 FR_poly_hi      = f51
435 FR_P7           = f52
436 FR_h2           = f53
437 FR_rsq          = f54
438 FR_P6           = f55
439 FR_r            = f56
441 FR_log2_hi      = f57
442 FR_log2_lo      = f58
444 FR_float_N      = f59
445 FR_Q4           = f60
447 FR_G3           = f61
448 FR_H3           = f62
449 FR_h3           = f63
451 FR_Q3           = f64
452 FR_Q2           = f65
453 FR_1LN10_hi     = f66
455 FR_Q1           = f67
456 FR_1LN10_lo     = f68
457 FR_P5           = f69
458 FR_rcub         = f70
460 FR_Neg_One      = f71
461 FR_Z            = f72
462 FR_AA           = f73
463 FR_BB           = f74
464 FR_S_lo         = f75
465 FR_2_to_minus_N = f76
468     // Huge & Main path prolog registers
469 FR_Half         = f77
470 FR_Two          = f78
471 FR_X2           = f79
472 FR_P2           = f80
473 FR_P2L          = f81
474 FR_Rcp          = f82
475 FR_GG           = f83
476 FR_HH           = f84
477 FR_EE           = f85
478 FR_DD           = f86
479 FR_GL           = f87
480 FR_A            = f88
481 FR_AL           = f89
482 FR_B            = f90
483 FR_BL           = f91
484 FR_Tmp          = f92
486     // Near 0 & Huges path prolog registers
487 FR_C3           = f93
488 FR_C5           = f94
489 FR_C7           = f95
490 FR_C9           = f96
492 FR_X3           = f97
493 FR_X4           = f98
494 FR_P9           = f99
495 FR_P5           = f100
496 FR_P3           = f101
499 // General Purpose Registers
501     // General prolog registers
502 GR_PFS          = r32
503 GR_TwoN7        = r40
504 GR_TwoP63       = r41
505 GR_ExpMask      = r42
506 GR_ArgExp       = r43
507 GR_Half         = r44
509     // Near 0 path prolog registers
510 GR_Poly_C_35    = r45
511 GR_Poly_C_79    = r46
513     // Special logl registers
514 GR_Index1       = r34
515 GR_Index2       = r35
516 GR_signif       = r36
517 GR_X_0          = r37
518 GR_X_1          = r38
519 GR_X_2          = r39
520 GR_Z_1          = r40
521 GR_Z_2          = r41
522 GR_N            = r42
523 GR_Bias         = r43
524 GR_M            = r44
525 GR_Index3       = r45
526 GR_exp_2tom80   = r45
527 GR_exp_mask     = r47
528 GR_exp_2tom7    = r48
529 GR_ad_ln10      = r49
530 GR_ad_tbl_1     = r50
531 GR_ad_tbl_2     = r51
532 GR_ad_tbl_3     = r52
533 GR_ad_q         = r53
534 GR_ad_z_1       = r54
535 GR_ad_z_2       = r55
536 GR_ad_z_3       = r56
537 GR_minus_N      = r57
541 .section .text
542 GLOBAL_LIBM_ENTRY(asinhl)
544 { .mfi
545       alloc     GR_PFS        = ar.pfs,0,27,0,0
546       fma.s1    FR_P2         = FR_Arg, FR_Arg, f1  // p2 = x^2 + 1
547       mov       GR_Half       = 0xfffe              // 0.5's exp
549 { .mfi
550       addl      GR_Poly_C_79  = @ltoff(Poly_C_near_0_79), gp // C7, C9 coeffs
551       fma.s1    FR_X2         = FR_Arg, FR_Arg, f0           // Obtain x^2
552       addl      GR_Poly_C_35  = @ltoff(Poly_C_near_0_35), gp // C3, C5 coeffs
555 { .mfi
556       getf.exp  GR_ArgExp     = FR_Arg        // get arument's exponent
557       fabs      FR_AX         = FR_Arg        // absolute value of argument
558       mov       GR_TwoN7      = 0xfff8        // 2^-7 exp
560 { .mfi
561       ld8       GR_Poly_C_79  = [GR_Poly_C_79] // get actual coeff table address
562       fma.s0       FR_Two        = f1, f1, f1        // construct 2.0
563       mov       GR_ExpMask    = 0x1ffff        // mask for exp
566 { .mfi
567       ld8       GR_Poly_C_35  = [GR_Poly_C_35] // get actual coeff table address
568       fclass.m  p6,p0         = FR_Arg, 0xe7   // if arg NaN inf zero
569       mov       GR_TwoP63     = 0x1003e        // 2^63 exp
571 { .mfi
572       addl      GR_ad_z_1     = @ltoff(Constants_Z_1#),gp
573       nop.f 0
574       nop.i 0
577 { .mfi
578       setf.exp  FR_Half       = GR_Half              // construct 0.5
579       fclass.m  p7,p0         = FR_Arg, 0x09  //  if arg + denorm
580       and       GR_ArgExp     = GR_ExpMask, GR_ArgExp // select exp
582 { .mfb
583       ld8       GR_ad_z_1     = [GR_ad_z_1]   // Get pointer to Constants_Z_1
584       nop.f 0
585       nop.b 0
587 { .mfi
588       ldfe      FR_C9         = [GR_Poly_C_79],16 // load C9
589       fclass.m  p10,p0        = FR_Arg, 0x0a    //  if arg - denorm
590       cmp.gt    p8, p0        = GR_TwoN7,  GR_ArgExp // if arg < 2^-7 ('near 0')
592 { .mfb
593       cmp.le    p9, p0        = GR_TwoP63, GR_ArgExp  // if arg > 2^63 ('huges')
594 (p6)  fma.s0    FR_Res        = FR_Arg,f1,FR_Arg     // r = a + a
595 (p6)  br.ret.spnt b0                            // return
597 // (X^2 + 1) computation
598 { .mfi
599 (p8)  ldfe      FR_C5         = [GR_Poly_C_35],16        // load C5
600       fms.s1    FR_Tmp        = f1, f1, FR_P2           // Tmp = 1 - p2
601       add       GR_ad_tbl_1   = 0x040, GR_ad_z_1    // Point to Constants_G_H_h1
603 { .mfb
604 (p8)  ldfe      FR_C7         = [GR_Poly_C_79],16        // load C7
605 (p7)  fnma.s0   FR_Res        =  FR_Arg,FR_Arg,FR_Arg // r = a - a*a
606 (p7)  br.ret.spnt b0                              // return
609 { .mfi
610 (p8)  ldfe      FR_C3         = [GR_Poly_C_35],16     // load C3
611       fcmp.lt.s1 p11, p12      = FR_Arg, f0     // if arg is negative
612       add       GR_ad_q       = -0x60, GR_ad_z_1    // Point to Constants_P
614 { .mfb
615       add       GR_ad_z_2     = 0x140, GR_ad_z_1    // Point to Constants_Z_2
616 (p10) fma.s0    FR_Res        =  FR_Arg,FR_Arg,FR_Arg // r = a + a*a
617 (p10) br.ret.spnt b0                             // return
620 { .mfi
621       add       GR_ad_tbl_2   = 0x180, GR_ad_z_1    // Point to Constants_G_H_h2
622       frsqrta.s1 FR_Rcp, p0   = FR_P2           // Rcp = 1/p2 reciprocal appr.
623       add       GR_ad_tbl_3   = 0x280, GR_ad_z_1    // Point to Constants_G_H_h3
625 { .mfi
626       nop.m 0
627       fms.s1    FR_P2L        = FR_AX, FR_AX, FR_X2 //low part of p2=fma(X*X-p2)
628       mov       GR_Bias       = 0x0FFFF            // Create exponent bias
631 { .mfb
632       nop.m 0
633 (p9)  fms.s1    FR_XLog_Hi    = FR_Two, FR_AX, f0  // Hi  of log1p arg = 2*X - 1
634 (p9)  br.cond.spnt huges_logl                      // special version of log1p
637 { .mfb
638       ldfe      FR_log2_hi    = [GR_ad_q],16      // Load log2_hi
639 (p8)  fma.s1    FR_X3         = FR_X2, FR_Arg, f0        // x^3 = x^2 * x
640 (p8)  br.cond.spnt near_0                                // Go to near 0 branch
643 { .mfi
644       ldfe      FR_log2_lo    = [GR_ad_q],16      // Load log2_lo
645       nop.f 0
646       nop.i 0
649 { .mfi
650       ldfe      FR_Q4         = [GR_ad_q],16          // Load Q4
651       fma.s1    FR_Tmp        = FR_Tmp, f1, FR_X2       // Tmp = Tmp + x^2
652       mov       GR_exp_mask   = 0x1FFFF        // Create exponent mask
655 { .mfi
656       ldfe      FR_Q3         = [GR_ad_q],16   // Load Q3
657       fma.s1    FR_GG         = FR_Rcp, FR_P2, f0        // g = Rcp * p2
658                                                // 8 bit Newton Raphson iteration
659       nop.i 0
661 { .mfi
662       nop.m 0
663       fma.s1    FR_HH         = FR_Half, FR_Rcp, f0      // h = 0.5 * Rcp
664       nop.i 0
666 { .mfi
667       ldfe      FR_Q2         = [GR_ad_q],16      // Load Q2
668       fnma.s1   FR_EE         = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h
669       nop.i 0
671 { .mfi
672       nop.m 0
673       fma.s1    FR_P2L        = FR_Tmp, f1, FR_P2L // low part of p2 = Tmp + p2l
674       nop.i 0
677 { .mfi
678       ldfe      FR_Q1         = [GR_ad_q]                // Load Q1
679       fma.s1    FR_GG         = FR_GG, FR_EE, FR_GG     // g = g * e + g
680                                               // 16 bit Newton Raphson iteration
681       nop.i 0
683 { .mfi
684       nop.m 0
685       fma.s1    FR_HH         = FR_HH, FR_EE, FR_HH     // h = h * e + h
686       nop.i 0
689 { .mfi
690       nop.m 0
691       fnma.s1   FR_EE         = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h
692       nop.i 0
695 { .mfi
696       nop.m 0
697       fma.s1    FR_GG         = FR_GG, FR_EE, FR_GG     // g = g * e + g
698                                               // 32 bit Newton Raphson iteration
699       nop.i 0
701 { .mfi
702       nop.m 0
703       fma.s1    FR_HH         = FR_HH, FR_EE, FR_HH     // h = h * e + h
704       nop.i 0
707 { .mfi
708       nop.m 0
709       fnma.s1   FR_EE         = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h
710       nop.i 0
713 { .mfi
714       nop.m 0
715       fma.s1    FR_GG         = FR_GG, FR_EE, FR_GG     // g = g * e + g
716                                               // 64 bit Newton Raphson iteration
717       nop.i 0
719 { .mfi
720       nop.m 0
721       fma.s1    FR_HH         = FR_HH, FR_EE, FR_HH     // h = h * e + h
722       nop.i 0
725 { .mfi
726       nop.m 0
727       fnma.s1   FR_DD         = FR_GG, FR_GG, FR_P2  // Remainder d = g * g - p2
728       nop.i 0
730 { .mfi
731       nop.m 0
732       fma.s1    FR_XLog_Hi     = FR_AX, f1, FR_GG // bh = z + gh
733       nop.i 0
736 { .mfi
737       nop.m 0
738       fma.s1    FR_DD         = FR_DD, f1, FR_P2L       // add p2l: d = d + p2l
739       nop.i 0
743 { .mfi
744       getf.sig  GR_signif     = FR_XLog_Hi     // Get significand of x+1
745       fmerge.ns FR_Neg_One    = f1, f1         // Form -1.0
746       mov       GR_exp_2tom7  = 0x0fff8        // Exponent of 2^-7
749 { .mfi
750       nop.m 0
751       fma.s1    FR_GL         = FR_DD, FR_HH, f0        // gl = d * h
752       extr.u    GR_Index1     = GR_signif, 59, 4    // Get high 4 bits of signif
754 { .mfi
755       nop.m 0
756       fma.s1    FR_XLog_Hi     = FR_DD,  FR_HH, FR_XLog_Hi // bh = bh + gl
757       nop.i 0
760 { .mmi
761       shladd    GR_ad_z_1     = GR_Index1, 2, GR_ad_z_1  // Point to Z_1
762       shladd    GR_ad_tbl_1   = GR_Index1, 4, GR_ad_tbl_1  // Point to G_1
763       extr.u    GR_X_0        = GR_signif, 49, 15 // Get high 15 bits of signif.
766 { .mmi
767       ld4       GR_Z_1        = [GR_ad_z_1]    // Load Z_1
768       nop.m 0
769       nop.i 0
772 { .mmi
773       ldfps     FR_G, FR_H    = [GR_ad_tbl_1],8     // Load G_1, H_1
774       nop.m 0
775       nop.i 0
778 { .mfi
779       nop.m 0
780       fms.s1    FR_XLog_Lo     = FR_GG,  f1,   FR_XLog_Hi // bl = gh - bh
781       pmpyshr2.u GR_X_1       = GR_X_0,GR_Z_1,15  // Get bits 30-15 of X_0 * Z_1
784 // WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
785 // "DEAD" ZONE!
787 { .mfi
788       nop.m 0
789       nop.f 0
790       nop.i 0
793 { .mfi
794       nop.m 0
795       fmerge.se FR_S_hi       =  f1,FR_XLog_Hi            // Form |x+1|
796       nop.i 0
799 { .mmi
800       getf.exp  GR_N          =  FR_XLog_Hi    // Get N = exponent of x+1
801       ldfd      FR_h          = [GR_ad_tbl_1]        // Load h_1
802       nop.i 0
805 { .mfi
806       nop.m 0
807       nop.f 0
808       extr.u    GR_Index2     = GR_X_1, 6, 4      // Extract bits 6-9 of X_1
812 { .mfi
813       shladd    GR_ad_tbl_2   = GR_Index2, 4, GR_ad_tbl_2  // Point to G_2
814       fma.s1    FR_XLog_Lo    = FR_XLog_Lo, f1, FR_AX // bl = bl + x
815       mov       GR_exp_2tom80 = 0x0ffaf           // Exponent of 2^-80
817 { .mfi
818       shladd    GR_ad_z_2     = GR_Index2, 2, GR_ad_z_2  // Point to Z_2
819       nop.f 0
820       sub       GR_N          = GR_N, GR_Bias // sub bias from exp
823 { .mmi
824       ldfps     FR_G2, FR_H2  = [GR_ad_tbl_2],8       // Load G_2, H_2
825       ld4       GR_Z_2        = [GR_ad_z_2]                // Load Z_2
826       sub       GR_minus_N    = GR_Bias, GR_N         // Form exponent of 2^(-N)
829 { .mmi
830       ldfd      FR_h2         = [GR_ad_tbl_2]             // Load h_2
831       nop.m 0
832       nop.i 0
835 { .mmi
836       setf.sig  FR_float_N    = GR_N        // Put integer N into rightmost sign
837       setf.exp  FR_2_to_minus_N = GR_minus_N   // Form 2^(-N)
838       pmpyshr2.u GR_X_2       = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
841 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
842 // BECAUSE OF POSSIBLE 10 CLOCKS STALL!
843 // So we can negate Q coefficients there for negative values
845 { .mfi
846       nop.m 0
847 (p11) fma.s1    FR_Q1         = FR_Q1, FR_Neg_One, f0 // Negate Q1
848       nop.i 0
850 { .mfi
851       nop.m 0
852       fma.s1    FR_XLog_Lo     = FR_XLog_Lo, f1, FR_GL // bl = bl + gl
853       nop.i 0
856 { .mfi
857       nop.m 0
858 (p11) fma.s1    FR_Q2         = FR_Q2, FR_Neg_One, f0 // Negate Q2
859       nop.i 0
862 { .mfi
863       nop.m 0
864 (p11) fma.s1    FR_Q3         = FR_Q3, FR_Neg_One, f0 // Negate Q3
865       nop.i 0
868 { .mfi
869       nop.m 0
870 (p11) fma.s1    FR_Q4         = FR_Q4, FR_Neg_One, f0 // Negate Q4
871       extr.u    GR_Index3     = GR_X_2, 1, 5         // Extract bits 1-5 of X_2
874 { .mfi
875       shladd    GR_ad_tbl_3   = GR_Index3, 4, GR_ad_tbl_3  // Point to G_3
876       nop.f 0
877       nop.i 0
880 { .mfi
881       ldfps     FR_G3, FR_H3  = [GR_ad_tbl_3],8   // Load G_3, H_3
882       nop.f 0
883       nop.i 0
886 { .mfi
887       ldfd      FR_h3         = [GR_ad_tbl_3]            // Load h_3
888           fcvt.xf   FR_float_N    = FR_float_N
889       nop.i 0
892 { .mfi
893       nop.m 0
894       fmpy.s1   FR_G          = FR_G, FR_G2              // G = G_1 * G_2
895       nop.i 0
897 { .mfi
898       nop.m 0
899       fadd.s1   FR_H          = FR_H, FR_H2              // H = H_1 + H_2
900       nop.i 0
903 { .mfi
904       nop.m 0
905       fadd.s1   FR_h          = FR_h, FR_h2              // h = h_1 + h_2
906       nop.i 0
908 { .mfi
909       nop.m 0
910       fma.s1    FR_S_lo       = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=S_lo*2^-N
911       nop.i 0
914 { .mfi
915       nop.m 0
916       fmpy.s1   FR_G          = FR_G, FR_G3             // G = (G_1 * G_2) * G_3
917       nop.i 0
919 { .mfi
920       nop.m 0
921       fadd.s1   FR_H          = FR_H, FR_H3             // H = (H_1 + H_2) + H_3
922       nop.i 0
925 { .mfi
926       nop.m 0
927       fadd.s1   FR_h          = FR_h, FR_h3             // h = (h_1 + h_2) + h_3
928       nop.i 0
931 { .mfi
932       nop.m 0
933       fms.s1    FR_r          = FR_G, FR_S_hi, f1           // r = G * S_hi - 1
934       nop.i 0
936 { .mfi
937       nop.m 0
938       fma.s1    FR_Y_hi       = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
939       nop.i 0
942 { .mfi
943       nop.m 0
944       fma.s1    FR_h          = FR_float_N, FR_log2_lo, FR_h  // h=N*log2_lo+h
945       nop.i 0
947 { .mfi
948       nop.m 0
949       fma.s1    FR_r          = FR_G, FR_S_lo, FR_r  // r=G*S_lo+(G*S_hi-1)
950       nop.i 0
953 { .mfi
954       nop.m 0
955       fma.s1    FR_poly_lo    = FR_r, FR_Q4, FR_Q3      // poly_lo = r * Q4 + Q3
956       nop.i 0
958 { .mfi
959       nop.m 0
960       fmpy.s1   FR_rsq        = FR_r, FR_r              // rsq = r * r
961       nop.i 0
964 { .mfi
965       nop.m 0
966       fma.s1    FR_poly_lo    = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
967       nop.i 0
969 { .mfi
970       nop.m 0
971       fma.s1    FR_rcub       = FR_rsq, FR_r, f0        // rcub = r^3
972       nop.i 0
975 .pred.rel "mutex",p12,p11
976 { .mfi
977       nop.m 0
978 (p12) fma.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
979       nop.i 0
981 { .mfi
982       nop.m 0
983 (p11) fms.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r     // poly_hi = Q1*rsq + r
984       nop.i 0
988 .pred.rel "mutex",p12,p11
989 { .mfi
990       nop.m 0
991 (p12) fma.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
992       nop.i 0
994 { .mfi
995       nop.m 0
996 (p11) fms.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
997       nop.i 0
1001 { .mfi
1002       nop.m 0
1003       fadd.s0   FR_Y_lo       = FR_poly_hi, FR_poly_lo
1004                                                              // Y_lo=poly_hi+poly_lo
1005       nop.i 0
1007 { .mfi
1008       nop.m 0
1009 (p11) fma.s0    FR_Y_hi       = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
1010       nop.i 0
1013 { .mfb
1014       nop.m 0
1015       fadd.s0   FR_Res        = FR_Y_lo,FR_Y_hi    // Result=Y_lo+Y_hi
1016       br.ret.sptk   b0                         // Common exit for 2^-7 < x < inf
1019 // * SPECIAL VERSION OF LOGL FOR HUGE ARGUMENTS *
1021 huges_logl:
1022 { .mfi
1023       getf.sig  GR_signif     = FR_XLog_Hi     // Get significand of x+1
1024       fmerge.ns FR_Neg_One    = f1, f1         // Form -1.0
1025       mov       GR_exp_2tom7  = 0x0fff8        // Exponent of 2^-7
1028 { .mfi
1029       add       GR_ad_tbl_1   = 0x040, GR_ad_z_1    // Point to Constants_G_H_h1
1030       nop.f 0
1031       add       GR_ad_q       = -0x60, GR_ad_z_1    // Point to Constants_P
1033 { .mfi
1034       add       GR_ad_z_2     = 0x140, GR_ad_z_1    // Point to Constants_Z_2
1035       nop.f 0
1036       add       GR_ad_tbl_2   = 0x180, GR_ad_z_1    // Point to Constants_G_H_h2
1039 { .mfi
1040       nop.m 0
1041       nop.f 0
1042       extr.u    GR_Index1     = GR_signif, 59, 4    // Get high 4 bits of signif
1044 { .mfi
1045       add       GR_ad_tbl_3   = 0x280, GR_ad_z_1    // Point to Constants_G_H_h3
1046       nop.f 0
1047       nop.i 0
1050 { .mfi
1051       shladd    GR_ad_z_1     = GR_Index1, 2, GR_ad_z_1  // Point to Z_1
1052       nop.f 0
1053       extr.u    GR_X_0        = GR_signif, 49, 15 // Get high 15 bits of signif.
1056 { .mfi
1057       ld4       GR_Z_1        = [GR_ad_z_1]    // Load Z_1
1058       nop.f 0
1059       mov       GR_exp_mask   = 0x1FFFF        // Create exponent mask
1061 { .mfi
1062       shladd    GR_ad_tbl_1   = GR_Index1, 4, GR_ad_tbl_1  // Point to G_1
1063       nop.f 0
1064       mov       GR_Bias       = 0x0FFFF            // Create exponent bias
1067 { .mfi
1068       ldfps     FR_G, FR_H    = [GR_ad_tbl_1],8     // Load G_1, H_1
1069       fmerge.se FR_S_hi       =  f1,FR_XLog_Hi            // Form |x+1|
1070       nop.i 0
1073 { .mmi
1074       getf.exp  GR_N          =  FR_XLog_Hi          // Get N = exponent of x+1
1075       ldfd      FR_h          = [GR_ad_tbl_1]        // Load h_1
1076       nop.i 0
1079 { .mfi
1080       ldfe      FR_log2_hi    = [GR_ad_q],16      // Load log2_hi
1081       nop.f 0
1082       pmpyshr2.u GR_X_1       = GR_X_0,GR_Z_1,15  // Get bits 30-15 of X_0 * Z_1
1085 // WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1086 // "DEAD" ZONE!
1088 { .mmi
1089       ldfe      FR_log2_lo    = [GR_ad_q],16      // Load log2_lo
1090       sub       GR_N          = GR_N, GR_Bias
1091       mov       GR_exp_2tom80 = 0x0ffaf           // Exponent of 2^-80
1094 { .mfi
1095       ldfe      FR_Q4         = [GR_ad_q],16          // Load Q4
1096       nop.f 0
1097       sub       GR_minus_N    = GR_Bias, GR_N         // Form exponent of 2^(-N)
1100 { .mmf
1101       ldfe      FR_Q3         = [GR_ad_q],16   // Load Q3
1102       setf.sig  FR_float_N    = GR_N        // Put integer N into rightmost sign
1103       nop.f 0
1106 { .mmi
1107       nop.m 0
1108       ldfe      FR_Q2         = [GR_ad_q],16      // Load Q2
1109       extr.u    GR_Index2     = GR_X_1, 6, 4      // Extract bits 6-9 of X_1
1112 { .mmi
1113       ldfe      FR_Q1         = [GR_ad_q]                // Load Q1
1114       shladd    GR_ad_z_2     = GR_Index2, 2, GR_ad_z_2  // Point to Z_2
1115       nop.i 0
1118 { .mmi
1119       ld4       GR_Z_2        = [GR_ad_z_2]                // Load Z_2
1120       shladd    GR_ad_tbl_2   = GR_Index2, 4, GR_ad_tbl_2  // Point to G_2
1121       nop.i 0
1124 { .mmi
1125       ldfps     FR_G2, FR_H2  = [GR_ad_tbl_2],8       // Load G_2, H_2
1126       nop.m 0
1127       nop.i 0
1130 { .mfi
1131       ldfd      FR_h2         = [GR_ad_tbl_2]             // Load h_2
1132       nop.f 0
1133       nop.i 0
1135 { .mfi
1136       setf.exp  FR_2_to_minus_N = GR_minus_N   // Form 2^(-N)
1137       nop.f 0
1138       nop.i 0
1141 { .mfi
1142       nop.m 0
1143       nop.f 0
1144       pmpyshr2.u GR_X_2       = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
1147 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1148 // "DEAD" ZONE!
1149 // JUST HAVE TO INSERT 3 NOP CYCLES (nothing to do here)
1151 { .mfi
1152       nop.m 0
1153       nop.f 0
1154       nop.i 0
1157 { .mfi
1158       nop.m 0
1159       nop.f 0
1160       nop.i 0
1163 { .mfi
1164       nop.m 0
1165       nop.f 0
1166       nop.i 0
1169 { .mfi
1170       nop.m 0
1171 (p11) fma.s1    FR_Q4         = FR_Q4, FR_Neg_One, f0 // Negate Q4
1172       extr.u    GR_Index3     = GR_X_2, 1, 5          // Extract bits 1-5 of X_2
1173  };;
1175 { .mfi
1176       shladd    GR_ad_tbl_3   = GR_Index3, 4, GR_ad_tbl_3  // Point to G_3
1177           fcvt.xf   FR_float_N    = FR_float_N
1178       nop.i 0
1180 { .mfi
1181       nop.m 0
1182 (p11) fma.s1    FR_Q3         = FR_Q3, FR_Neg_One, f0 // Negate Q3
1183       nop.i 0
1186 { .mfi
1187       ldfps     FR_G3, FR_H3  = [GR_ad_tbl_3],8   // Load G_3, H_3
1188 (p11) fma.s1    FR_Q2         = FR_Q2, FR_Neg_One, f0 // Negate Q2
1189       nop.i 0
1191 { .mfi
1192       nop.m 0
1193 (p11) fma.s1    FR_Q1         = FR_Q1, FR_Neg_One, f0 // Negate Q1
1194       nop.i 0
1197 { .mfi
1198       ldfd      FR_h3         = [GR_ad_tbl_3]            // Load h_3
1199       fmpy.s1   FR_G          = FR_G, FR_G2              // G = G_1 * G_2
1200       nop.i 0
1202 { .mfi
1203       nop.m 0
1204       fadd.s1   FR_H          = FR_H, FR_H2              // H = H_1 + H_2
1205       nop.i 0
1208 { .mmf
1209       nop.m 0
1210       nop.m 0
1211       fadd.s1   FR_h          = FR_h, FR_h2              // h = h_1 + h_2
1214 { .mfi
1215       nop.m 0
1216       fmpy.s1   FR_G          = FR_G, FR_G3             // G = (G_1 * G_2) * G_3
1217       nop.i 0
1219 { .mfi
1220       nop.m 0
1221       fadd.s1   FR_H          = FR_H, FR_H3             // H = (H_1 + H_2) + H_3
1222       nop.i 0
1225 { .mfi
1226       nop.m 0
1227       fadd.s1   FR_h          = FR_h, FR_h3             // h = (h_1 + h_2) + h_3
1228       nop.i 0
1231 { .mfi
1232       nop.m 0
1233       fms.s1    FR_r          = FR_G, FR_S_hi, f1           // r = G * S_hi - 1
1234       nop.i 0
1236 { .mfi
1237       nop.m 0
1238       fma.s1    FR_Y_hi       = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
1239       nop.i 0
1242 { .mfi
1243       nop.m 0
1244       fma.s1    FR_h          = FR_float_N, FR_log2_lo, FR_h  // h=N*log2_lo+h
1245       nop.i 0
1248 { .mfi
1249       nop.m 0
1250       fma.s1    FR_poly_lo    = FR_r, FR_Q4, FR_Q3      // poly_lo = r * Q4 + Q3
1251       nop.i 0
1253 { .mfi
1254       nop.m 0
1255       fmpy.s1   FR_rsq        = FR_r, FR_r              // rsq = r * r
1256       nop.i 0
1259 { .mfi
1260       nop.m 0
1261       fma.s1    FR_poly_lo    = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
1262       nop.i 0
1264 { .mfi
1265       nop.m 0
1266       fma.s1    FR_rcub       = FR_rsq, FR_r, f0        // rcub = r^3
1267       nop.i 0
1270 .pred.rel "mutex",p12,p11
1271 { .mfi
1272       nop.m 0
1273 (p12) fma.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1274       nop.i 0
1276 { .mfi
1277       nop.m 0
1278 (p11) fms.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1279       nop.i 0
1283 .pred.rel "mutex",p12,p11
1284 { .mfi
1285       nop.m 0
1286 (p12) fma.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1287       nop.i 0
1289 { .mfi
1290       nop.m 0
1291 (p11) fms.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1292       nop.i 0
1295 { .mfi
1296       nop.m 0
1297       fadd.s0   FR_Y_lo       = FR_poly_hi, FR_poly_lo  // Y_lo=poly_hi+poly_lo
1298       nop.i 0
1300 { .mfi
1301       nop.m 0
1302 (p11) fma.s0    FR_Y_hi       = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
1303       nop.i 0
1306 { .mfb
1307       nop.m 0
1308       fadd.s0   FR_Res        = FR_Y_lo,FR_Y_hi    // Result=Y_lo+Y_hi
1309       br.ret.sptk   b0                         // Common exit for 2^-7 < x < inf
1312 // NEAR ZERO POLYNOMIAL INTERVAL
1313 near_0:
1314 { .mfi
1315       nop.m 0
1316       fma.s1    FR_X4         = FR_X2, FR_X2, f0 // x^4 = x^2 * x^2
1317       nop.i 0
1320 { .mfi
1321       nop.m 0
1322       fma.s1    FR_P9         = FR_C9,FR_X2,FR_C7  // p9 = C9*x^2 + C7
1323       nop.i 0
1325 { .mfi
1326       nop.m 0
1327       fma.s1    FR_P5         = FR_C5,FR_X2,FR_C3  // p5 = C5*x^2 + C3
1328       nop.i 0
1331 { .mfi
1332       nop.m 0
1333       fma.s1    FR_P3         = FR_P9,FR_X4,FR_P5  // p3 = p9*x^4 + p5
1334       nop.i 0
1337 { .mfb
1338       nop.m 0
1339       fma.s0    FR_Res        = FR_P3,FR_X3,FR_Arg // res = p3*C3 + x
1340       br.ret.sptk   b0                          // Near 0 path return
1343 GLOBAL_LIBM_END(asinhl)
1344 libm_alias_ldouble_other (asinh, asinh)