sysdeps/ia64/fpu/s_asinhl.S

   1 .file "asinhl.s"
   2
   3
   4 // Copyright (c) 2000 - 2003, Intel Corporation
   5 // All rights reserved.
   6 //
   7 //
   8 // Redistribution and use in source and binary forms, with or without
   9 // modification, are permitted provided that the following conditions are
  10 // met:
  11 //
  12 // * Redistributions of source code must retain the above copyright
  13 // notice, this list of conditions and the following disclaimer.
  14 //
  15 // * Redistributions in binary form must reproduce the above copyright
  16 // notice, this list of conditions and the following disclaimer in the
  17 // documentation and/or other materials provided with the distribution.
  18 //
  19 // * The name of Intel Corporation may not be used to endorse or promote
  20 // products derived from this software without specific prior written
  21 // permission.
  22
  23 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  24 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  25 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  26 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  27 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  28 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  29 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  30 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  31 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  32 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  33 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  34 //
  35 // Intel Corporation is the author of this code, and requests that all
  36 // problem reports or change requests be submitted to it directly at
  37 // http://www.intel.com/software/products/opensource/libraries/num.htm.
  38 //
  39 //*********************************************************************
  40 //
  41 // History:
  42 // 09/04/01 Initial version
  43 // 09/13/01 Performance improved, symmetry problems fixed
  44 // 10/10/01 Performance improved, split issues removed
  45 // 12/11/01 Changed huges_logp to not be global
  46 // 05/20/02 Cleaned up namespace and sf0 syntax
  47 // 02/10/03 Reordered header: .section, .global, .proc, .align;
  48 //          used data8 for long double table values
  49 //
  50 //*********************************************************************
  51 //
  52 // API
  53 //==============================================================
  54 // long double asinhl(long double);
  55 //
  56 // Overview of operation
  57 //==============================================================
  58 //
  59 // There are 6 paths:
  60 // 1. x = 0, [S,Q]Nan or +/-INF
  61 //    Return asinhl(x) = x + x;
  62 //
  63 // 2. x = + denormal
  64 //    Return asinhl(x) = x - x^2;
  65 //
  66 // 3. x = - denormal
  67 //    Return asinhl(x) = x + x^2;
  68 //
  69 // 4. 'Near 0': max denormal < |x| < 1/128
  70 //    Return asinhl(x) = sign(x)*(x+x^3*(c3+x^2*(c5+x^2*(c7+x^2*(c9)))));
  71 //
  72 // 5. 'Huges': |x| > 2^63
  73 //    Return asinhl(x) = sign(x)*(logl(2*x));
  74 //
  75 // 6. 'Main path': 1/128 < |x| < 2^63
  76 //    b_hi + b_lo = x + sqrt(x^2 + 1);
  77 //    asinhl(x) = sign(x)*(log_special(b_hi, b_lo));
  78 //
  79 // Algorithm description
  80 //==============================================================
  81 //
  82 // Main path algorithm
  83 // ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! )
  84 // *************************************************************************
  85 //
  86 // There are 3 parts of x+sqrt(x^2+1) computation:
  87 //
  88 //  1) p2 = (p2_hi+p2_lo) = x^2+1 obtaining
  89 //     ------------------------------------
  90 //     p2_hi = x2_hi + 1, where x2_hi = x * x;
  91 //     p2_lo = x2_lo + p1_lo, where
  92 //                            x2_lo = FMS(x*x-x2_hi),
  93 //                            p1_lo = (1 - p2_hi) + x2_hi;
  94 //
  95 //  2) g = (g_hi+g_lo) = sqrt(p2) = sqrt(p2_hi+p2_lo)
  96 //     ----------------------------------------------
  97 //     r = invsqrt(p2_hi) (8-bit reciprocal square root approximation);
  98 //     g = p2_hi * r (first 8 bit-approximation of sqrt);
  99 //
 100 //     h = 0.5 * r;
 101 //     e = 0.5 - g * h;
 102 //     g = g * e + g (second 16 bit-approximation of sqrt);
 103 //
 104 //     h = h * e + h;
 105 //     e = 0.5 - g * h;
 106 //     g = g * e + g (third 32 bit-approximation of sqrt);
 107 //
 108 //     h = h * e + h;
 109 //     e = 0.5 - g * h;
 110 //     g_hi = g * e + g (fourth 64 bit-approximation of sqrt);
 111 //
 112 //     Remainder computation:
 113 //     h = h * e + h;
 114 //     d = (p2_hi - g_hi * g_hi) + p2_lo;
 115 //     g_lo = d * h;
 116 //
 117 //  3) b = (b_hi + b_lo) = x + g, where g = (g_hi + g_lo) = sqrt(x^2+1)
 118 //     -------------------------------------------------------------------
 119 //     b_hi = (g_hi + x) + gl;
 120 //     b_lo = (g_hi - b_hi) + x + gl;
 121 //
 122 //  Now we pass b presented as sum b_hi + b_lo to special version
 123 //  of logl function which accept a pair of arguments as
 124 //  'mutiprecision' value.
 125 //
 126 //  Special log algorithm overview
 127 //  ================================
 128 //   Here we use a table lookup method. The basic idea is that in
 129 //   order to compute logl(Arg) = logl (Arg-1) for an argument Arg in [1,2),
 130 //   we construct a value G such that G*Arg is close to 1 and that
 131 //   logl(1/G) is obtainable easily from a table of values calculated
 132 //   beforehand. Thus
 133 //
 134 //      logl(Arg) = logl(1/G) + logl((G*Arg - 1))
 135 //
 136 //   Because |G*Arg - 1| is small, the second term on the right hand
 137 //   side can be approximated by a short polynomial. We elaborate
 138 //   this method in four steps.
 139 //
 140 //   Step 0: Initialization
 141 //
 142 //   We need to calculate logl( X ). Obtain N, S_hi such that
 143 //
 144 //      X = 2^N * ( S_hi + S_lo )   exactly
 145 //
 146 //   where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
 147 //   that |S_lo| <= ulp(S_hi).
 148 //
 149 //   For the special version of logl: S_lo = b_lo
 150 //   !-----------------------------------------------!
 151 //
 152 //   Step 1: Argument Reduction
 153 //
 154 //   Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
 155 //
 156 //      G := G_1 * G_2 * G_3
 157 //      r := (G * S_hi - 1) + G * S_lo
 158 //
 159 //   These G_j's have the property that the product is exactly
 160 //   representable and that |r| < 2^(-12) as a result.
 161 //
 162 //   Step 2: Approximation
 163 //
 164 //   logl(1 + r) is approximated by a short polynomial poly(r).
 165 //
 166 //   Step 3: Reconstruction
 167 //
 168 //   Finally,
 169 //
 170 //   logl( X )   =   logl( 2^N * (S_hi + S_lo) )
 171 //                 ~=~  N*logl(2) + logl(1/G) + logl(1 + r)
 172 //                 ~=~  N*logl(2) + logl(1/G) + poly(r).
 173 //
 174 //   For detailed description see logl or log1pl function, regular path.
 175 //
 176 // Registers used
 177 //==============================================================
 178 // Floating Point registers used:
 179 // f8, input
 180 // f32 -> f101 (70 registers)
 181
 182 // General registers used:
 183 // r32 -> r57 (26 registers)
 184
 185 // Predicate registers used:
 186 // p6 -> p11
 187 // p6  for '0, NaNs, Inf' path
 188 // p7  for '+ denormals' path
 189 // p8  for 'near 0' path
 190 // p9  for 'huges' path
 191 // p10 for '- denormals' path
 192 // p11 for negative values
 193 //
 194 // Data tables
 195 //==============================================================
 196
 197 RODATA
 198 .align 64
 199
 200 // C7, C9 'near 0' polynomial coefficients
 201 LOCAL_OBJECT_START(Poly_C_near_0_79)
 202 data8 0xF8DC939BBEDD5A54, 0x00003FF9
 203 data8 0xB6DB6DAB21565AC5, 0x0000BFFA
 204 LOCAL_OBJECT_END(Poly_C_near_0_79)
 205
 206 // C3, C5 'near 0' polynomial coefficients
 207 LOCAL_OBJECT_START(Poly_C_near_0_35)
 208 data8 0x999999999991D582, 0x00003FFB
 209 data8 0xAAAAAAAAAAAAAAA9, 0x0000BFFC
 210 LOCAL_OBJECT_END(Poly_C_near_0_35)
 211
 212 // Q coeffs
 213 LOCAL_OBJECT_START(Constants_Q)
 214 data4  0x00000000,0xB1721800,0x00003FFE,0x00000000
 215 data4  0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
 216 data4  0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
 217 data4  0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
 218 data4  0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
 219 data4  0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
 220 LOCAL_OBJECT_END(Constants_Q)
 221
 222 // Z1 - 16 bit fixed
 223 LOCAL_OBJECT_START(Constants_Z_1)
 224 data4  0x00008000
 225 data4  0x00007879
 226 data4  0x000071C8
 227 data4  0x00006BCB
 228 data4  0x00006667
 229 data4  0x00006187
 230 data4  0x00005D18
 231 data4  0x0000590C
 232 data4  0x00005556
 233 data4  0x000051EC
 234 data4  0x00004EC5
 235 data4  0x00004BDB
 236 data4  0x00004925
 237 data4  0x0000469F
 238 data4  0x00004445
 239 data4  0x00004211
 240 LOCAL_OBJECT_END(Constants_Z_1)
 241
 242 // G1 and H1 - IEEE single and h1 - IEEE double
 243 LOCAL_OBJECT_START(Constants_G_H_h1)
 244 data4  0x3F800000,0x00000000
 245 data8  0x0000000000000000
 246 data4  0x3F70F0F0,0x3D785196
 247 data8  0x3DA163A6617D741C
 248 data4  0x3F638E38,0x3DF13843
 249 data8  0x3E2C55E6CBD3D5BB
 250 data4  0x3F579430,0x3E2FF9A0
 251 data8  0xBE3EB0BFD86EA5E7
 252 data4  0x3F4CCCC8,0x3E647FD6
 253 data8  0x3E2E6A8C86B12760
 254 data4  0x3F430C30,0x3E8B3AE7
 255 data8  0x3E47574C5C0739BA
 256 data4  0x3F3A2E88,0x3EA30C68
 257 data8  0x3E20E30F13E8AF2F
 258 data4  0x3F321640,0x3EB9CEC8
 259 data8  0xBE42885BF2C630BD
 260 data4  0x3F2AAAA8,0x3ECF9927
 261 data8  0x3E497F3497E577C6
 262 data4  0x3F23D708,0x3EE47FC5
 263 data8  0x3E3E6A6EA6B0A5AB
 264 data4  0x3F1D89D8,0x3EF8947D
 265 data8  0xBDF43E3CD328D9BE
 266 data4  0x3F17B420,0x3F05F3A1
 267 data8  0x3E4094C30ADB090A
 268 data4  0x3F124920,0x3F0F4303
 269 data8  0xBE28FBB2FC1FE510
 270 data4  0x3F0D3DC8,0x3F183EBF
 271 data8  0x3E3A789510FDE3FA
 272 data4  0x3F088888,0x3F20EC80
 273 data8  0x3E508CE57CC8C98F
 274 data4  0x3F042108,0x3F29516A
 275 data8  0xBE534874A223106C
 276 LOCAL_OBJECT_END(Constants_G_H_h1)
 277
 278 // Z2 - 16 bit fixed
 279 LOCAL_OBJECT_START(Constants_Z_2)
 280 data4  0x00008000
 281 data4  0x00007F81
 282 data4  0x00007F02
 283 data4  0x00007E85
 284 data4  0x00007E08
 285 data4  0x00007D8D
 286 data4  0x00007D12
 287 data4  0x00007C98
 288 data4  0x00007C20
 289 data4  0x00007BA8
 290 data4  0x00007B31
 291 data4  0x00007ABB
 292 data4  0x00007A45
 293 data4  0x000079D1
 294 data4  0x0000795D
 295 data4  0x000078EB
 296 LOCAL_OBJECT_END(Constants_Z_2)
 297
 298 // G2 and H2 - IEEE single and h2 - IEEE double
 299 LOCAL_OBJECT_START(Constants_G_H_h2)
 300 data4  0x3F800000,0x00000000
 301 data8  0x0000000000000000
 302 data4  0x3F7F00F8,0x3B7F875D
 303 data8  0x3DB5A11622C42273
 304 data4  0x3F7E03F8,0x3BFF015B
 305 data8  0x3DE620CF21F86ED3
 306 data4  0x3F7D08E0,0x3C3EE393
 307 data8  0xBDAFA07E484F34ED
 308 data4  0x3F7C0FC0,0x3C7E0586
 309 data8  0xBDFE07F03860BCF6
 310 data4  0x3F7B1880,0x3C9E75D2
 311 data8  0x3DEA370FA78093D6
 312 data4  0x3F7A2328,0x3CBDC97A
 313 data8  0x3DFF579172A753D0
 314 data4  0x3F792FB0,0x3CDCFE47
 315 data8  0x3DFEBE6CA7EF896B
 316 data4  0x3F783E08,0x3CFC15D0
 317 data8  0x3E0CF156409ECB43
 318 data4  0x3F774E38,0x3D0D874D
 319 data8  0xBE0B6F97FFEF71DF
 320 data4  0x3F766038,0x3D1CF49B
 321 data8  0xBE0804835D59EEE8
 322 data4  0x3F757400,0x3D2C531D
 323 data8  0x3E1F91E9A9192A74
 324 data4  0x3F748988,0x3D3BA322
 325 data8  0xBE139A06BF72A8CD
 326 data4  0x3F73A0D0,0x3D4AE46F
 327 data8  0x3E1D9202F8FBA6CF
 328 data4  0x3F72B9D0,0x3D5A1756
 329 data8  0xBE1DCCC4BA796223
 330 data4  0x3F71D488,0x3D693B9D
 331 data8  0xBE049391B6B7C239
 332 LOCAL_OBJECT_END(Constants_G_H_h2)
 333
 334 // G3 and H3 - IEEE single and h3 - IEEE double
 335 LOCAL_OBJECT_START(Constants_G_H_h3)
 336 data4  0x3F7FFC00,0x38800100
 337 data8  0x3D355595562224CD
 338 data4  0x3F7FF400,0x39400480
 339 data8  0x3D8200A206136FF6
 340 data4  0x3F7FEC00,0x39A00640
 341 data8  0x3DA4D68DE8DE9AF0
 342 data4  0x3F7FE400,0x39E00C41
 343 data8  0xBD8B4291B10238DC
 344 data4  0x3F7FDC00,0x3A100A21
 345 data8  0xBD89CCB83B1952CA
 346 data4  0x3F7FD400,0x3A300F22
 347 data8  0xBDB107071DC46826
 348 data4  0x3F7FCC08,0x3A4FF51C
 349 data8  0x3DB6FCB9F43307DB
 350 data4  0x3F7FC408,0x3A6FFC1D
 351 data8  0xBD9B7C4762DC7872
 352 data4  0x3F7FBC10,0x3A87F20B
 353 data8  0xBDC3725E3F89154A
 354 data4  0x3F7FB410,0x3A97F68B
 355 data8  0xBD93519D62B9D392
 356 data4  0x3F7FAC18,0x3AA7EB86
 357 data8  0x3DC184410F21BD9D
 358 data4  0x3F7FA420,0x3AB7E101
 359 data8  0xBDA64B952245E0A6
 360 data4  0x3F7F9C20,0x3AC7E701
 361 data8  0x3DB4B0ECAABB34B8
 362 data4  0x3F7F9428,0x3AD7DD7B
 363 data8  0x3D9923376DC40A7E
 364 data4  0x3F7F8C30,0x3AE7D474
 365 data8  0x3DC6E17B4F2083D3
 366 data4  0x3F7F8438,0x3AF7CBED
 367 data8  0x3DAE314B811D4394
 368 data4  0x3F7F7C40,0x3B03E1F3
 369 data8  0xBDD46F21B08F2DB1
 370 data4  0x3F7F7448,0x3B0BDE2F
 371 data8  0xBDDC30A46D34522B
 372 data4  0x3F7F6C50,0x3B13DAAA
 373 data8  0x3DCB0070B1F473DB
 374 data4  0x3F7F6458,0x3B1BD766
 375 data8  0xBDD65DDC6AD282FD
 376 data4  0x3F7F5C68,0x3B23CC5C
 377 data8  0xBDCDAB83F153761A
 378 data4  0x3F7F5470,0x3B2BC997
 379 data8  0xBDDADA40341D0F8F
 380 data4  0x3F7F4C78,0x3B33C711
 381 data8  0x3DCD1BD7EBC394E8
 382 data4  0x3F7F4488,0x3B3BBCC6
 383 data8  0xBDC3532B52E3E695
 384 data4  0x3F7F3C90,0x3B43BAC0
 385 data8  0xBDA3961EE846B3DE
 386 data4  0x3F7F34A0,0x3B4BB0F4
 387 data8  0xBDDADF06785778D4
 388 data4  0x3F7F2CA8,0x3B53AF6D
 389 data8  0x3DCC3ED1E55CE212
 390 data4  0x3F7F24B8,0x3B5BA620
 391 data8  0xBDBA31039E382C15
 392 data4  0x3F7F1CC8,0x3B639D12
 393 data8  0x3D635A0B5C5AF197
 394 data4  0x3F7F14D8,0x3B6B9444
 395 data8  0xBDDCCB1971D34EFC
 396 data4  0x3F7F0CE0,0x3B7393BC
 397 data8  0x3DC7450252CD7ADA
 398 data4  0x3F7F04F0,0x3B7B8B6D
 399 data8  0xBDB68F177D7F2A42
 400 LOCAL_OBJECT_END(Constants_G_H_h3)
 401
 402 // Assembly macros
 403 //==============================================================
 404
 405 // Floating Point Registers
 406
 407 FR_Arg          = f8
 408 FR_Res          = f8
 409 FR_AX           = f32
 410 FR_XLog_Hi      = f33
 411 FR_XLog_Lo      = f34
 412
 413     // Special logl registers
 414 FR_Y_hi         = f35
 415 FR_Y_lo         = f36
 416
 417 FR_Scale        = f37
 418 FR_X_Prime      = f38
 419 FR_S_hi         = f39
 420 FR_W            = f40
 421 FR_G            = f41
 422
 423 FR_H            = f42
 424 FR_wsq          = f43
 425 FR_w4           = f44
 426 FR_h            = f45
 427 FR_w6           = f46
 428
 429 FR_G2           = f47
 430 FR_H2           = f48
 431 FR_poly_lo      = f49
 432 FR_P8           = f50
 433 FR_poly_hi      = f51
 434
 435 FR_P7           = f52
 436 FR_h2           = f53
 437 FR_rsq          = f54
 438 FR_P6           = f55
 439 FR_r            = f56
 440
 441 FR_log2_hi      = f57
 442 FR_log2_lo      = f58
 443
 444 FR_float_N      = f59
 445 FR_Q4           = f60
 446
 447 FR_G3           = f61
 448 FR_H3           = f62
 449 FR_h3           = f63
 450
 451 FR_Q3           = f64
 452 FR_Q2           = f65
 453 FR_1LN10_hi     = f66
 454
 455 FR_Q1           = f67
 456 FR_1LN10_lo     = f68
 457 FR_P5           = f69
 458 FR_rcub         = f70
 459
 460 FR_Neg_One      = f71
 461 FR_Z            = f72
 462 FR_AA           = f73
 463 FR_BB           = f74
 464 FR_S_lo         = f75
 465 FR_2_to_minus_N = f76
 466
 467
 468     // Huge & Main path prolog registers
 469 FR_Half         = f77
 470 FR_Two          = f78
 471 FR_X2           = f79
 472 FR_P2           = f80
 473 FR_P2L          = f81
 474 FR_Rcp          = f82
 475 FR_GG           = f83
 476 FR_HH           = f84
 477 FR_EE           = f85
 478 FR_DD           = f86
 479 FR_GL           = f87
 480 FR_A            = f88
 481 FR_AL           = f89
 482 FR_B            = f90
 483 FR_BL           = f91
 484 FR_Tmp          = f92
 485
 486     // Near 0 & Huges path prolog registers
 487 FR_C3           = f93
 488 FR_C5           = f94
 489 FR_C7           = f95
 490 FR_C9           = f96
 491
 492 FR_X3           = f97
 493 FR_X4           = f98
 494 FR_P9           = f99
 495 FR_P5           = f100
 496 FR_P3           = f101
 497
 498
 499 // General Purpose Registers
 500
 501     // General prolog registers
 502 GR_PFS          = r32
 503 GR_TwoN7        = r40
 504 GR_TwoP63       = r41
 505 GR_ExpMask      = r42
 506 GR_ArgExp       = r43
 507 GR_Half         = r44
 508
 509     // Near 0 path prolog registers
 510 GR_Poly_C_35    = r45
 511 GR_Poly_C_79    = r46
 512
 513     // Special logl registers
 514 GR_Index1       = r34
 515 GR_Index2       = r35
 516 GR_signif       = r36
 517 GR_X_0          = r37
 518 GR_X_1          = r38
 519 GR_X_2          = r39
 520 GR_Z_1          = r40
 521 GR_Z_2          = r41
 522 GR_N            = r42
 523 GR_Bias         = r43
 524 GR_M            = r44
 525 GR_Index3       = r45
 526 GR_exp_2tom80   = r45
 527 GR_exp_mask     = r47
 528 GR_exp_2tom7    = r48
 529 GR_ad_ln10      = r49
 530 GR_ad_tbl_1     = r50
 531 GR_ad_tbl_2     = r51
 532 GR_ad_tbl_3     = r52
 533 GR_ad_q         = r53
 534 GR_ad_z_1       = r54
 535 GR_ad_z_2       = r55
 536 GR_ad_z_3       = r56
 537 GR_minus_N      = r57
 538
 539
 540
 541 .section .text
 542 GLOBAL_LIBM_ENTRY(asinhl)
 543
 544 { .mfi
 545       alloc     GR_PFS        = ar.pfs,0,27,0,0
 546       fma.s1    FR_P2         = FR_Arg, FR_Arg, f1  // p2 = x^2 + 1
 547       mov       GR_Half       = 0xfffe              // 0.5's exp
 548 }
 549 { .mfi
 550       addl      GR_Poly_C_79  = @ltoff(Poly_C_near_0_79), gp // C7, C9 coeffs
 551       fma.s1    FR_X2         = FR_Arg, FR_Arg, f0           // Obtain x^2
 552       addl      GR_Poly_C_35  = @ltoff(Poly_C_near_0_35), gp // C3, C5 coeffs
 553 };;
 554
 555 { .mfi
 556       getf.exp  GR_ArgExp     = FR_Arg        // get arument's exponent
 557       fabs      FR_AX         = FR_Arg        // absolute value of argument
 558       mov       GR_TwoN7      = 0xfff8        // 2^-7 exp
 559 }
 560 { .mfi
 561       ld8       GR_Poly_C_79  = [GR_Poly_C_79] // get actual coeff table address
 562       fma.s0       FR_Two        = f1, f1, f1        // construct 2.0
 563       mov       GR_ExpMask    = 0x1ffff        // mask for exp
 564 };;
 565
 566 { .mfi
 567       ld8       GR_Poly_C_35  = [GR_Poly_C_35] // get actual coeff table address
 568       fclass.m  p6,p0         = FR_Arg, 0xe7   // if arg NaN inf zero
 569       mov       GR_TwoP63     = 0x1003e        // 2^63 exp
 570 }
 571 { .mfi
 572       addl      GR_ad_z_1     = @ltoff(Constants_Z_1#),gp
 573       nop.f 0
 574       nop.i 0
 575 };;
 576
 577 { .mfi
 578       setf.exp  FR_Half       = GR_Half              // construct 0.5
 579       fclass.m  p7,p0         = FR_Arg, 0x09  //  if arg + denorm
 580       and       GR_ArgExp     = GR_ExpMask, GR_ArgExp // select exp
 581 }
 582 { .mfb
 583       ld8       GR_ad_z_1     = [GR_ad_z_1]   // Get pointer to Constants_Z_1
 584       nop.f 0
 585       nop.b 0
 586 };;
 587 { .mfi
 588       ldfe      FR_C9         = [GR_Poly_C_79],16 // load C9
 589       fclass.m  p10,p0        = FR_Arg, 0x0a    //  if arg - denorm
 590       cmp.gt    p8, p0        = GR_TwoN7,  GR_ArgExp // if arg < 2^-7 ('near 0')
 591 }
 592 { .mfb
 593       cmp.le    p9, p0        = GR_TwoP63, GR_ArgExp  // if arg > 2^63 ('huges')
 594 (p6)  fma.s0    FR_Res        = FR_Arg,f1,FR_Arg     // r = a + a
 595 (p6)  br.ret.spnt b0                            // return
 596 };;
 597 // (X^2 + 1) computation
 598 { .mfi
 599 (p8)  ldfe      FR_C5         = [GR_Poly_C_35],16        // load C5
 600       fms.s1    FR_Tmp        = f1, f1, FR_P2           // Tmp = 1 - p2
 601       add       GR_ad_tbl_1   = 0x040, GR_ad_z_1    // Point to Constants_G_H_h1
 602 }
 603 { .mfb
 604 (p8)  ldfe      FR_C7         = [GR_Poly_C_79],16        // load C7
 605 (p7)  fnma.s0   FR_Res        =  FR_Arg,FR_Arg,FR_Arg // r = a - a*a
 606 (p7)  br.ret.spnt b0                              // return
 607 };;
 608
 609 { .mfi
 610 (p8)  ldfe      FR_C3         = [GR_Poly_C_35],16     // load C3
 611       fcmp.lt.s1 p11, p12      = FR_Arg, f0     // if arg is negative
 612       add       GR_ad_q       = -0x60, GR_ad_z_1    // Point to Constants_P
 613 }
 614 { .mfb
 615       add       GR_ad_z_2     = 0x140, GR_ad_z_1    // Point to Constants_Z_2
 616 (p10) fma.s0    FR_Res        =  FR_Arg,FR_Arg,FR_Arg // r = a + a*a
 617 (p10) br.ret.spnt b0                             // return
 618 };;
 619
 620 { .mfi
 621       add       GR_ad_tbl_2   = 0x180, GR_ad_z_1    // Point to Constants_G_H_h2
 622       frsqrta.s1 FR_Rcp, p0   = FR_P2           // Rcp = 1/p2 reciprocal appr.
 623       add       GR_ad_tbl_3   = 0x280, GR_ad_z_1    // Point to Constants_G_H_h3
 624 }
 625 { .mfi
 626       nop.m 0
 627       fms.s1    FR_P2L        = FR_AX, FR_AX, FR_X2 //low part of p2=fma(X*X-p2)
 628       mov       GR_Bias       = 0x0FFFF            // Create exponent bias
 629 };;
 630
 631 { .mfb
 632       nop.m 0
 633 (p9)  fms.s1    FR_XLog_Hi    = FR_Two, FR_AX, f0  // Hi  of log1p arg = 2*X - 1
 634 (p9)  br.cond.spnt huges_logl                      // special version of log1p
 635 };;
 636
 637 { .mfb
 638       ldfe      FR_log2_hi    = [GR_ad_q],16      // Load log2_hi
 639 (p8)  fma.s1    FR_X3         = FR_X2, FR_Arg, f0        // x^3 = x^2 * x
 640 (p8)  br.cond.spnt near_0                                // Go to near 0 branch
 641 };;
 642
 643 { .mfi
 644       ldfe      FR_log2_lo    = [GR_ad_q],16      // Load log2_lo
 645       nop.f 0
 646       nop.i 0
 647 };;
 648
 649 { .mfi
 650       ldfe      FR_Q4         = [GR_ad_q],16          // Load Q4
 651       fma.s1    FR_Tmp        = FR_Tmp, f1, FR_X2       // Tmp = Tmp + x^2
 652       mov       GR_exp_mask   = 0x1FFFF        // Create exponent mask
 653 };;
 654
 655 { .mfi
 656       ldfe      FR_Q3         = [GR_ad_q],16   // Load Q3
 657       fma.s1    FR_GG         = FR_Rcp, FR_P2, f0        // g = Rcp * p2
 658                                                // 8 bit Newton Raphson iteration
 659       nop.i 0
 660 }
 661 { .mfi
 662       nop.m 0
 663       fma.s1    FR_HH         = FR_Half, FR_Rcp, f0      // h = 0.5 * Rcp
 664       nop.i 0
 665 };;
 666 { .mfi
 667       ldfe      FR_Q2         = [GR_ad_q],16      // Load Q2
 668       fnma.s1   FR_EE         = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h
 669       nop.i 0
 670 }
 671 { .mfi
 672       nop.m 0
 673       fma.s1    FR_P2L        = FR_Tmp, f1, FR_P2L // low part of p2 = Tmp + p2l
 674       nop.i 0
 675 };;
 676
 677 { .mfi
 678       ldfe      FR_Q1         = [GR_ad_q]                // Load Q1
 679       fma.s1    FR_GG         = FR_GG, FR_EE, FR_GG     // g = g * e + g
 680                                               // 16 bit Newton Raphson iteration
 681       nop.i 0
 682 }
 683 { .mfi
 684       nop.m 0
 685       fma.s1    FR_HH         = FR_HH, FR_EE, FR_HH     // h = h * e + h
 686       nop.i 0
 687 };;
 688
 689 { .mfi
 690       nop.m 0
 691       fnma.s1   FR_EE         = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h
 692       nop.i 0
 693 };;
 694
 695 { .mfi
 696       nop.m 0
 697       fma.s1    FR_GG         = FR_GG, FR_EE, FR_GG     // g = g * e + g
 698                                               // 32 bit Newton Raphson iteration
 699       nop.i 0
 700 }
 701 { .mfi
 702       nop.m 0
 703       fma.s1    FR_HH         = FR_HH, FR_EE, FR_HH     // h = h * e + h
 704       nop.i 0
 705 };;
 706
 707 { .mfi
 708       nop.m 0
 709       fnma.s1   FR_EE         = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h
 710       nop.i 0
 711 };;
 712
 713 { .mfi
 714       nop.m 0
 715       fma.s1    FR_GG         = FR_GG, FR_EE, FR_GG     // g = g * e + g
 716                                               // 64 bit Newton Raphson iteration
 717       nop.i 0
 718 }
 719 { .mfi
 720       nop.m 0
 721       fma.s1    FR_HH         = FR_HH, FR_EE, FR_HH     // h = h * e + h
 722       nop.i 0
 723 };;
 724
 725 { .mfi
 726       nop.m 0
 727       fnma.s1   FR_DD         = FR_GG, FR_GG, FR_P2  // Remainder d = g * g - p2
 728       nop.i 0
 729 }
 730 { .mfi
 731       nop.m 0
 732       fma.s1    FR_XLog_Hi     = FR_AX, f1, FR_GG // bh = z + gh
 733       nop.i 0
 734 };;
 735
 736 { .mfi
 737       nop.m 0
 738       fma.s1    FR_DD         = FR_DD, f1, FR_P2L       // add p2l: d = d + p2l
 739       nop.i 0
 740 };;
 741
 742
 743 { .mfi
 744       getf.sig  GR_signif     = FR_XLog_Hi     // Get significand of x+1
 745       fmerge.ns FR_Neg_One    = f1, f1         // Form -1.0
 746       mov       GR_exp_2tom7  = 0x0fff8        // Exponent of 2^-7
 747 };;
 748
 749 { .mfi
 750       nop.m 0
 751       fma.s1    FR_GL         = FR_DD, FR_HH, f0        // gl = d * h
 752       extr.u    GR_Index1     = GR_signif, 59, 4    // Get high 4 bits of signif
 753 }
 754 { .mfi
 755       nop.m 0
 756       fma.s1    FR_XLog_Hi     = FR_DD,  FR_HH, FR_XLog_Hi // bh = bh + gl
 757       nop.i 0
 758 };;
 759
 760 { .mmi
 761       shladd    GR_ad_z_1     = GR_Index1, 2, GR_ad_z_1  // Point to Z_1
 762       shladd    GR_ad_tbl_1   = GR_Index1, 4, GR_ad_tbl_1  // Point to G_1
 763       extr.u    GR_X_0        = GR_signif, 49, 15 // Get high 15 bits of signif.
 764 };;
 765
 766 { .mmi
 767       ld4       GR_Z_1        = [GR_ad_z_1]    // Load Z_1
 768       nop.m 0
 769       nop.i 0
 770 };;
 771
 772 { .mmi
 773       ldfps     FR_G, FR_H    = [GR_ad_tbl_1],8     // Load G_1, H_1
 774       nop.m 0
 775       nop.i 0
 776 };;
 777
 778 { .mfi
 779       nop.m 0
 780       fms.s1    FR_XLog_Lo     = FR_GG,  f1,   FR_XLog_Hi // bl = gh - bh
 781       pmpyshr2.u GR_X_1       = GR_X_0,GR_Z_1,15  // Get bits 30-15 of X_0 * Z_1
 782 };;
 783
 784 // WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
 785 // "DEAD" ZONE!
 786
 787 { .mfi
 788       nop.m 0
 789       nop.f 0
 790       nop.i 0
 791 };;
 792
 793 { .mfi
 794       nop.m 0
 795       fmerge.se FR_S_hi       =  f1,FR_XLog_Hi            // Form |x+1|
 796       nop.i 0
 797 };;
 798
 799 { .mmi
 800       getf.exp  GR_N          =  FR_XLog_Hi    // Get N = exponent of x+1
 801       ldfd      FR_h          = [GR_ad_tbl_1]        // Load h_1
 802       nop.i 0
 803 };;
 804
 805 { .mfi
 806       nop.m 0
 807       nop.f 0
 808       extr.u    GR_Index2     = GR_X_1, 6, 4      // Extract bits 6-9 of X_1
 809 };;
 810
 811
 812 { .mfi
 813       shladd    GR_ad_tbl_2   = GR_Index2, 4, GR_ad_tbl_2  // Point to G_2
 814       fma.s1    FR_XLog_Lo    = FR_XLog_Lo, f1, FR_AX // bl = bl + x
 815       mov       GR_exp_2tom80 = 0x0ffaf           // Exponent of 2^-80
 816 }
 817 { .mfi
 818       shladd    GR_ad_z_2     = GR_Index2, 2, GR_ad_z_2  // Point to Z_2
 819       nop.f 0
 820       sub       GR_N          = GR_N, GR_Bias // sub bias from exp
 821 };;
 822
 823 { .mmi
 824       ldfps     FR_G2, FR_H2  = [GR_ad_tbl_2],8       // Load G_2, H_2
 825       ld4       GR_Z_2        = [GR_ad_z_2]                // Load Z_2
 826       sub       GR_minus_N    = GR_Bias, GR_N         // Form exponent of 2^(-N)
 827 };;
 828
 829 { .mmi
 830       ldfd      FR_h2         = [GR_ad_tbl_2]             // Load h_2
 831       nop.m 0
 832       nop.i 0
 833 };;
 834
 835 { .mmi
 836       setf.sig  FR_float_N    = GR_N        // Put integer N into rightmost sign
 837       setf.exp  FR_2_to_minus_N = GR_minus_N   // Form 2^(-N)
 838       pmpyshr2.u GR_X_2       = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
 839 };;
 840
 841 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
 842 // BECAUSE OF POSSIBLE 10 CLOCKS STALL!
 843 // So we can negate Q coefficients there for negative values
 844
 845 { .mfi
 846       nop.m 0
 847 (p11) fma.s1    FR_Q1         = FR_Q1, FR_Neg_One, f0 // Negate Q1
 848       nop.i 0
 849 }
 850 { .mfi
 851       nop.m 0
 852       fma.s1    FR_XLog_Lo     = FR_XLog_Lo, f1, FR_GL // bl = bl + gl
 853       nop.i 0
 854 };;
 855
 856 { .mfi
 857       nop.m 0
 858 (p11) fma.s1    FR_Q2         = FR_Q2, FR_Neg_One, f0 // Negate Q2
 859       nop.i 0
 860 };;
 861
 862 { .mfi
 863       nop.m 0
 864 (p11) fma.s1    FR_Q3         = FR_Q3, FR_Neg_One, f0 // Negate Q3
 865       nop.i 0
 866 };;
 867
 868 { .mfi
 869       nop.m 0
 870 (p11) fma.s1    FR_Q4         = FR_Q4, FR_Neg_One, f0 // Negate Q4
 871       extr.u    GR_Index3     = GR_X_2, 1, 5         // Extract bits 1-5 of X_2
 872 };;
 873
 874 { .mfi
 875       shladd    GR_ad_tbl_3   = GR_Index3, 4, GR_ad_tbl_3  // Point to G_3
 876       nop.f 0
 877       nop.i 0
 878 };;
 879
 880 { .mfi
 881       ldfps     FR_G3, FR_H3  = [GR_ad_tbl_3],8   // Load G_3, H_3
 882       nop.f 0
 883       nop.i 0
 884 };;
 885
 886 { .mfi
 887       ldfd      FR_h3         = [GR_ad_tbl_3]            // Load h_3
 888           fcvt.xf   FR_float_N    = FR_float_N
 889       nop.i 0
 890 };;
 891
 892 { .mfi
 893       nop.m 0
 894       fmpy.s1   FR_G          = FR_G, FR_G2              // G = G_1 * G_2
 895       nop.i 0
 896 }
 897 { .mfi
 898       nop.m 0
 899       fadd.s1   FR_H          = FR_H, FR_H2              // H = H_1 + H_2
 900       nop.i 0
 901 };;
 902
 903 { .mfi
 904       nop.m 0
 905       fadd.s1   FR_h          = FR_h, FR_h2              // h = h_1 + h_2
 906       nop.i 0
 907 }
 908 { .mfi
 909       nop.m 0
 910       fma.s1    FR_S_lo       = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=S_lo*2^-N
 911       nop.i 0
 912 };;
 913
 914 { .mfi
 915       nop.m 0
 916       fmpy.s1   FR_G          = FR_G, FR_G3             // G = (G_1 * G_2) * G_3
 917       nop.i 0
 918 }
 919 { .mfi
 920       nop.m 0
 921       fadd.s1   FR_H          = FR_H, FR_H3             // H = (H_1 + H_2) + H_3
 922       nop.i 0
 923 };;
 924
 925 { .mfi
 926       nop.m 0
 927       fadd.s1   FR_h          = FR_h, FR_h3             // h = (h_1 + h_2) + h_3
 928       nop.i 0
 929 };;
 930
 931 { .mfi
 932       nop.m 0
 933       fms.s1    FR_r          = FR_G, FR_S_hi, f1           // r = G * S_hi - 1
 934       nop.i 0
 935 }
 936 { .mfi
 937       nop.m 0
 938       fma.s1    FR_Y_hi       = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
 939       nop.i 0
 940 };;
 941
 942 { .mfi
 943       nop.m 0
 944       fma.s1    FR_h          = FR_float_N, FR_log2_lo, FR_h  // h=N*log2_lo+h
 945       nop.i 0
 946 }
 947 { .mfi
 948       nop.m 0
 949       fma.s1    FR_r          = FR_G, FR_S_lo, FR_r  // r=G*S_lo+(G*S_hi-1)
 950       nop.i 0
 951 };;
 952
 953 { .mfi
 954       nop.m 0
 955       fma.s1    FR_poly_lo    = FR_r, FR_Q4, FR_Q3      // poly_lo = r * Q4 + Q3
 956       nop.i 0
 957 }
 958 { .mfi
 959       nop.m 0
 960       fmpy.s1   FR_rsq        = FR_r, FR_r              // rsq = r * r
 961       nop.i 0
 962 };;
 963
 964 { .mfi
 965       nop.m 0
 966       fma.s1    FR_poly_lo    = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
 967       nop.i 0
 968 }
 969 { .mfi
 970       nop.m 0
 971       fma.s1    FR_rcub       = FR_rsq, FR_r, f0        // rcub = r^3
 972       nop.i 0
 973 };;
 974
 975 .pred.rel "mutex",p12,p11
 976 { .mfi
 977       nop.m 0
 978 (p12) fma.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
 979       nop.i 0
 980 }
 981 { .mfi
 982       nop.m 0
 983 (p11) fms.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r     // poly_hi = Q1*rsq + r
 984       nop.i 0
 985 };;
 986
 987
 988 .pred.rel "mutex",p12,p11
 989 { .mfi
 990       nop.m 0
 991 (p12) fma.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
 992       nop.i 0
 993 }
 994 { .mfi
 995       nop.m 0
 996 (p11) fms.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
 997       nop.i 0
 998 }
 999 ;;
1000
1001 { .mfi
1002       nop.m 0
1003       fadd.s0   FR_Y_lo       = FR_poly_hi, FR_poly_lo
1004                                                              // Y_lo=poly_hi+poly_lo
1005       nop.i 0
1006 }
1007 { .mfi
1008       nop.m 0
1009 (p11) fma.s0    FR_Y_hi       = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
1010       nop.i 0
1011 };;
1012
1013 { .mfb
1014       nop.m 0
1015       fadd.s0   FR_Res        = FR_Y_lo,FR_Y_hi    // Result=Y_lo+Y_hi
1016       br.ret.sptk   b0                         // Common exit for 2^-7 < x < inf
1017 };;
1018
1019 // * SPECIAL VERSION OF LOGL FOR HUGE ARGUMENTS *
1020
1021 huges_logl:
1022 { .mfi
1023       getf.sig  GR_signif     = FR_XLog_Hi     // Get significand of x+1
1024       fmerge.ns FR_Neg_One    = f1, f1         // Form -1.0
1025       mov       GR_exp_2tom7  = 0x0fff8        // Exponent of 2^-7
1026 };;
1027
1028 { .mfi
1029       add       GR_ad_tbl_1   = 0x040, GR_ad_z_1    // Point to Constants_G_H_h1
1030       nop.f 0
1031       add       GR_ad_q       = -0x60, GR_ad_z_1    // Point to Constants_P
1032 }
1033 { .mfi
1034       add       GR_ad_z_2     = 0x140, GR_ad_z_1    // Point to Constants_Z_2
1035       nop.f 0
1036       add       GR_ad_tbl_2   = 0x180, GR_ad_z_1    // Point to Constants_G_H_h2
1037 };;
1038
1039 { .mfi
1040       nop.m 0
1041       nop.f 0
1042       extr.u    GR_Index1     = GR_signif, 59, 4    // Get high 4 bits of signif
1043 }
1044 { .mfi
1045       add       GR_ad_tbl_3   = 0x280, GR_ad_z_1    // Point to Constants_G_H_h3
1046       nop.f 0
1047       nop.i 0
1048 };;
1049
1050 { .mfi
1051       shladd    GR_ad_z_1     = GR_Index1, 2, GR_ad_z_1  // Point to Z_1
1052       nop.f 0
1053       extr.u    GR_X_0        = GR_signif, 49, 15 // Get high 15 bits of signif.
1054 };;
1055
1056 { .mfi
1057       ld4       GR_Z_1        = [GR_ad_z_1]    // Load Z_1
1058       nop.f 0
1059       mov       GR_exp_mask   = 0x1FFFF        // Create exponent mask
1060 }
1061 { .mfi
1062       shladd    GR_ad_tbl_1   = GR_Index1, 4, GR_ad_tbl_1  // Point to G_1
1063       nop.f 0
1064       mov       GR_Bias       = 0x0FFFF            // Create exponent bias
1065 };;
1066
1067 { .mfi
1068       ldfps     FR_G, FR_H    = [GR_ad_tbl_1],8     // Load G_1, H_1
1069       fmerge.se FR_S_hi       =  f1,FR_XLog_Hi            // Form |x+1|
1070       nop.i 0
1071 };;
1072
1073 { .mmi
1074       getf.exp  GR_N          =  FR_XLog_Hi          // Get N = exponent of x+1
1075       ldfd      FR_h          = [GR_ad_tbl_1]        // Load h_1
1076       nop.i 0
1077 };;
1078
1079 { .mfi
1080       ldfe      FR_log2_hi    = [GR_ad_q],16      // Load log2_hi
1081       nop.f 0
1082       pmpyshr2.u GR_X_1       = GR_X_0,GR_Z_1,15  // Get bits 30-15 of X_0 * Z_1
1083 };;
1084
1085 // WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1086 // "DEAD" ZONE!
1087
1088 { .mmi
1089       ldfe      FR_log2_lo    = [GR_ad_q],16      // Load log2_lo
1090       sub       GR_N          = GR_N, GR_Bias
1091       mov       GR_exp_2tom80 = 0x0ffaf           // Exponent of 2^-80
1092 };;
1093
1094 { .mfi
1095       ldfe      FR_Q4         = [GR_ad_q],16          // Load Q4
1096       nop.f 0
1097       sub       GR_minus_N    = GR_Bias, GR_N         // Form exponent of 2^(-N)
1098 };;
1099
1100 { .mmf
1101       ldfe      FR_Q3         = [GR_ad_q],16   // Load Q3
1102       setf.sig  FR_float_N    = GR_N        // Put integer N into rightmost sign
1103       nop.f 0
1104 };;
1105
1106 { .mmi
1107       nop.m 0
1108       ldfe      FR_Q2         = [GR_ad_q],16      // Load Q2
1109       extr.u    GR_Index2     = GR_X_1, 6, 4      // Extract bits 6-9 of X_1
1110 };;
1111
1112 { .mmi
1113       ldfe      FR_Q1         = [GR_ad_q]                // Load Q1
1114       shladd    GR_ad_z_2     = GR_Index2, 2, GR_ad_z_2  // Point to Z_2
1115       nop.i 0
1116 };;
1117
1118 { .mmi
1119       ld4       GR_Z_2        = [GR_ad_z_2]                // Load Z_2
1120       shladd    GR_ad_tbl_2   = GR_Index2, 4, GR_ad_tbl_2  // Point to G_2
1121       nop.i 0
1122 };;
1123
1124 { .mmi
1125       ldfps     FR_G2, FR_H2  = [GR_ad_tbl_2],8       // Load G_2, H_2
1126       nop.m 0
1127       nop.i 0
1128 };;
1129
1130 { .mfi
1131       ldfd      FR_h2         = [GR_ad_tbl_2]             // Load h_2
1132       nop.f 0
1133       nop.i 0
1134 }
1135 { .mfi
1136       setf.exp  FR_2_to_minus_N = GR_minus_N   // Form 2^(-N)
1137       nop.f 0
1138       nop.i 0
1139 };;
1140
1141 { .mfi
1142       nop.m 0
1143       nop.f 0
1144       pmpyshr2.u GR_X_2       = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
1145 };;
1146
1147 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1148 // "DEAD" ZONE!
1149 // JUST HAVE TO INSERT 3 NOP CYCLES (nothing to do here)
1150
1151 { .mfi
1152       nop.m 0
1153       nop.f 0
1154       nop.i 0
1155 };;
1156
1157 { .mfi
1158       nop.m 0
1159       nop.f 0
1160       nop.i 0
1161 };;
1162
1163 { .mfi
1164       nop.m 0
1165       nop.f 0
1166       nop.i 0
1167 };;
1168
1169 { .mfi
1170       nop.m 0
1171 (p11) fma.s1    FR_Q4         = FR_Q4, FR_Neg_One, f0 // Negate Q4
1172       extr.u    GR_Index3     = GR_X_2, 1, 5          // Extract bits 1-5 of X_2
1173  };;
1174
1175 { .mfi
1176       shladd    GR_ad_tbl_3   = GR_Index3, 4, GR_ad_tbl_3  // Point to G_3
1177           fcvt.xf   FR_float_N    = FR_float_N
1178       nop.i 0
1179 }
1180 { .mfi
1181       nop.m 0
1182 (p11) fma.s1    FR_Q3         = FR_Q3, FR_Neg_One, f0 // Negate Q3
1183       nop.i 0
1184 };;
1185
1186 { .mfi
1187       ldfps     FR_G3, FR_H3  = [GR_ad_tbl_3],8   // Load G_3, H_3
1188 (p11) fma.s1    FR_Q2         = FR_Q2, FR_Neg_One, f0 // Negate Q2
1189       nop.i 0
1190 }
1191 { .mfi
1192       nop.m 0
1193 (p11) fma.s1    FR_Q1         = FR_Q1, FR_Neg_One, f0 // Negate Q1
1194       nop.i 0
1195 };;
1196
1197 { .mfi
1198       ldfd      FR_h3         = [GR_ad_tbl_3]            // Load h_3
1199       fmpy.s1   FR_G          = FR_G, FR_G2              // G = G_1 * G_2
1200       nop.i 0
1201 }
1202 { .mfi
1203       nop.m 0
1204       fadd.s1   FR_H          = FR_H, FR_H2              // H = H_1 + H_2
1205       nop.i 0
1206 };;
1207
1208 { .mmf
1209       nop.m 0
1210       nop.m 0
1211       fadd.s1   FR_h          = FR_h, FR_h2              // h = h_1 + h_2
1212 };;
1213
1214 { .mfi
1215       nop.m 0
1216       fmpy.s1   FR_G          = FR_G, FR_G3             // G = (G_1 * G_2) * G_3
1217       nop.i 0
1218 }
1219 { .mfi
1220       nop.m 0
1221       fadd.s1   FR_H          = FR_H, FR_H3             // H = (H_1 + H_2) + H_3
1222       nop.i 0
1223 };;
1224
1225 { .mfi
1226       nop.m 0
1227       fadd.s1   FR_h          = FR_h, FR_h3             // h = (h_1 + h_2) + h_3
1228       nop.i 0
1229 };;
1230
1231 { .mfi
1232       nop.m 0
1233       fms.s1    FR_r          = FR_G, FR_S_hi, f1           // r = G * S_hi - 1
1234       nop.i 0
1235 }
1236 { .mfi
1237       nop.m 0
1238       fma.s1    FR_Y_hi       = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
1239       nop.i 0
1240 };;
1241
1242 { .mfi
1243       nop.m 0
1244       fma.s1    FR_h          = FR_float_N, FR_log2_lo, FR_h  // h=N*log2_lo+h
1245       nop.i 0
1246 };;
1247
1248 { .mfi
1249       nop.m 0
1250       fma.s1    FR_poly_lo    = FR_r, FR_Q4, FR_Q3      // poly_lo = r * Q4 + Q3
1251       nop.i 0
1252 }
1253 { .mfi
1254       nop.m 0
1255       fmpy.s1   FR_rsq        = FR_r, FR_r              // rsq = r * r
1256       nop.i 0
1257 };;
1258
1259 { .mfi
1260       nop.m 0
1261       fma.s1    FR_poly_lo    = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
1262       nop.i 0
1263 }
1264 { .mfi
1265       nop.m 0
1266       fma.s1    FR_rcub       = FR_rsq, FR_r, f0        // rcub = r^3
1267       nop.i 0
1268 };;
1269
1270 .pred.rel "mutex",p12,p11
1271 { .mfi
1272       nop.m 0
1273 (p12) fma.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1274       nop.i 0
1275 }
1276 { .mfi
1277       nop.m 0
1278 (p11) fms.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1279       nop.i 0
1280 };;
1281
1282
1283 .pred.rel "mutex",p12,p11
1284 { .mfi
1285       nop.m 0
1286 (p12) fma.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1287       nop.i 0
1288 }
1289 { .mfi
1290       nop.m 0
1291 (p11) fms.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1292       nop.i 0
1293 };;
1294
1295 { .mfi
1296       nop.m 0
1297       fadd.s0   FR_Y_lo       = FR_poly_hi, FR_poly_lo  // Y_lo=poly_hi+poly_lo
1298       nop.i 0
1299 }
1300 { .mfi
1301       nop.m 0
1302 (p11) fma.s0    FR_Y_hi       = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
1303       nop.i 0
1304 };;
1305
1306 { .mfb
1307       nop.m 0
1308       fadd.s0   FR_Res        = FR_Y_lo,FR_Y_hi    // Result=Y_lo+Y_hi
1309       br.ret.sptk   b0                         // Common exit for 2^-7 < x < inf
1310 };;
1311
1312 // NEAR ZERO POLYNOMIAL INTERVAL
1313 near_0:
1314 { .mfi
1315       nop.m 0
1316       fma.s1    FR_X4         = FR_X2, FR_X2, f0 // x^4 = x^2 * x^2
1317       nop.i 0
1318 };;
1319
1320 { .mfi
1321       nop.m 0
1322       fma.s1    FR_P9         = FR_C9,FR_X2,FR_C7  // p9 = C9*x^2 + C7
1323       nop.i 0
1324 }
1325 { .mfi
1326       nop.m 0
1327       fma.s1    FR_P5         = FR_C5,FR_X2,FR_C3  // p5 = C5*x^2 + C3
1328       nop.i 0
1329 };;
1330
1331 { .mfi
1332       nop.m 0
1333       fma.s1    FR_P3         = FR_P9,FR_X4,FR_P5  // p3 = p9*x^4 + p5
1334       nop.i 0
1335 };;
1336
1337 { .mfb
1338       nop.m 0
1339       fma.s0    FR_Res        = FR_P3,FR_X3,FR_Arg // res = p3*C3 + x
1340       br.ret.sptk   b0                          // Near 0 path return
1341 };;
1342
1343 GLOBAL_LIBM_END(asinhl)
1344 libm_alias_ldouble_other (asinh, asinh)