sysdeps/ia64/fpu/s_asinhl.S

   1 .file "asinhl.s"
   2
   3
   4 // Copyright (c) 2000 - 2003, Intel Corporation
   5 // All rights reserved.
   6 //
   7 // Contributed 2000 by the Intel Numerics Group, Intel Corporation
   8 //
   9 // Redistribution and use in source and binary forms, with or without
  10 // modification, are permitted provided that the following conditions are
  11 // met:
  12 //
  13 // * Redistributions of source code must retain the above copyright
  14 // notice, this list of conditions and the following disclaimer.
  15 //
  16 // * Redistributions in binary form must reproduce the above copyright
  17 // notice, this list of conditions and the following disclaimer in the
  18 // documentation and/or other materials provided with the distribution.
  19 //
  20 // * The name of Intel Corporation may not be used to endorse or promote
  21 // products derived from this software without specific prior written
  22 // permission.
  23
  24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35 //
  36 // Intel Corporation is the author of this code, and requests that all
  37 // problem reports or change requests be submitted to it directly at
  38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
  39 //
  40 //*********************************************************************
  41 //
  42 // History:
  43 // 09/04/01 Initial version
  44 // 09/13/01 Performance improved, symmetry problems fixed
  45 // 10/10/01 Performance improved, split issues removed
  46 // 12/11/01 Changed huges_logp to not be global
  47 // 05/20/02 Cleaned up namespace and sf0 syntax
  48 // 02/10/03 Reordered header: .section, .global, .proc, .align;
  49 //          used data8 for long double table values
  50 //
  51 //*********************************************************************
  52 //
  53 // API
  54 //==============================================================
  55 // long double asinhl(long double);
  56 //
  57 // Overview of operation
  58 //==============================================================
  59 //
  60 // There are 6 paths:
  61 // 1. x = 0, [S,Q]Nan or +/-INF
  62 //    Return asinhl(x) = x + x;
  63 //
  64 // 2. x = + denormal
  65 //    Return asinhl(x) = x - x^2;
  66 //
  67 // 3. x = - denormal
  68 //    Return asinhl(x) = x + x^2;
  69 //
  70 // 4. 'Near 0': max denormal < |x| < 1/128
  71 //    Return asinhl(x) = sign(x)*(x+x^3*(c3+x^2*(c5+x^2*(c7+x^2*(c9)))));
  72 //
  73 // 5. 'Huges': |x| > 2^63
  74 //    Return asinhl(x) = sign(x)*(logl(2*x));
  75 //
  76 // 6. 'Main path': 1/128 < |x| < 2^63
  77 //    b_hi + b_lo = x + sqrt(x^2 + 1);
  78 //    asinhl(x) = sign(x)*(log_special(b_hi, b_lo));
  79 //
  80 // Algorithm description
  81 //==============================================================
  82 //
  83 // Main path algorithm
  84 // ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! )
  85 // *************************************************************************
  86 //
  87 // There are 3 parts of x+sqrt(x^2+1) computation:
  88 //
  89 //  1) p2 = (p2_hi+p2_lo) = x^2+1 obtaining
  90 //     ------------------------------------
  91 //     p2_hi = x2_hi + 1, where x2_hi = x * x;
  92 //     p2_lo = x2_lo + p1_lo, where
  93 //                            x2_lo = FMS(x*x-x2_hi),
  94 //                            p1_lo = (1 - p2_hi) + x2_hi;
  95 //
  96 //  2) g = (g_hi+g_lo) = sqrt(p2) = sqrt(p2_hi+p2_lo)
  97 //     ----------------------------------------------
  98 //     r = invsqrt(p2_hi) (8-bit reciprocal square root approximation);
  99 //     g = p2_hi * r (first 8 bit-approximation of sqrt);
 100 //
 101 //     h = 0.5 * r;
 102 //     e = 0.5 - g * h;
 103 //     g = g * e + g (second 16 bit-approximation of sqrt);
 104 //
 105 //     h = h * e + h;
 106 //     e = 0.5 - g * h;
 107 //     g = g * e + g (third 32 bit-approximation of sqrt);
 108 //
 109 //     h = h * e + h;
 110 //     e = 0.5 - g * h;
 111 //     g_hi = g * e + g (fourth 64 bit-approximation of sqrt);
 112 //
 113 //     Remainder computation:
 114 //     h = h * e + h;
 115 //     d = (p2_hi - g_hi * g_hi) + p2_lo;
 116 //     g_lo = d * h;
 117 //
 118 //  3) b = (b_hi + b_lo) = x + g, where g = (g_hi + g_lo) = sqrt(x^2+1)
 119 //     -------------------------------------------------------------------
 120 //     b_hi = (g_hi + x) + gl;
 121 //     b_lo = (g_hi - b_hi) + x + gl;
 122 //
 123 //  Now we pass b presented as sum b_hi + b_lo to special version
 124 //  of logl function which accept a pair of arguments as
 125 //  'mutiprecision' value.
 126 //
 127 //  Special log algorithm overview
 128 //  ================================
 129 //   Here we use a table lookup method. The basic idea is that in
 130 //   order to compute logl(Arg) = logl (Arg-1) for an argument Arg in [1,2),
 131 //   we construct a value G such that G*Arg is close to 1 and that
 132 //   logl(1/G) is obtainable easily from a table of values calculated
 133 //   beforehand. Thus
 134 //
 135 //      logl(Arg) = logl(1/G) + logl((G*Arg - 1))
 136 //
 137 //   Because |G*Arg - 1| is small, the second term on the right hand
 138 //   side can be approximated by a short polynomial. We elaborate
 139 //   this method in four steps.
 140 //
 141 //   Step 0: Initialization
 142 //
 143 //   We need to calculate logl( X ). Obtain N, S_hi such that
 144 //
 145 //      X = 2^N * ( S_hi + S_lo )   exactly
 146 //
 147 //   where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
 148 //   that |S_lo| <= ulp(S_hi).
 149 //
 150 //   For the special version of logl: S_lo = b_lo
 151 //   !-----------------------------------------------!
 152 //
 153 //   Step 1: Argument Reduction
 154 //
 155 //   Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
 156 //
 157 //      G := G_1 * G_2 * G_3
 158 //      r := (G * S_hi - 1) + G * S_lo
 159 //
 160 //   These G_j's have the property that the product is exactly
 161 //   representable and that |r| < 2^(-12) as a result.
 162 //
 163 //   Step 2: Approximation
 164 //
 165 //   logl(1 + r) is approximated by a short polynomial poly(r).
 166 //
 167 //   Step 3: Reconstruction
 168 //
 169 //   Finally,
 170 //
 171 //   logl( X )   =   logl( 2^N * (S_hi + S_lo) )
 172 //                 ~=~  N*logl(2) + logl(1/G) + logl(1 + r)
 173 //                 ~=~  N*logl(2) + logl(1/G) + poly(r).
 174 //
 175 //   For detailed description see logl or log1pl function, regular path.
 176 //
 177 // Registers used
 178 //==============================================================
 179 // Floating Point registers used:
 180 // f8, input
 181 // f32 -> f101 (70 registers)
 182
 183 // General registers used:
 184 // r32 -> r57 (26 registers)
 185
 186 // Predicate registers used:
 187 // p6 -> p11
 188 // p6  for '0, NaNs, Inf' path
 189 // p7  for '+ denormals' path
 190 // p8  for 'near 0' path
 191 // p9  for 'huges' path
 192 // p10 for '- denormals' path
 193 // p11 for negative values
 194 //
 195 // Data tables
 196 //==============================================================
 197
 198 RODATA
 199 .align 64
 200
 201 // C7, C9 'near 0' polynomial coefficients
 202 LOCAL_OBJECT_START(Poly_C_near_0_79)
 203 data8 0xF8DC939BBEDD5A54, 0x00003FF9
 204 data8 0xB6DB6DAB21565AC5, 0x0000BFFA
 205 LOCAL_OBJECT_END(Poly_C_near_0_79)
 206
 207 // C3, C5 'near 0' polynomial coefficients
 208 LOCAL_OBJECT_START(Poly_C_near_0_35)
 209 data8 0x999999999991D582, 0x00003FFB
 210 data8 0xAAAAAAAAAAAAAAA9, 0x0000BFFC
 211 LOCAL_OBJECT_END(Poly_C_near_0_35)
 212
 213 // Q coeffs
 214 LOCAL_OBJECT_START(Constants_Q)
 215 data4  0x00000000,0xB1721800,0x00003FFE,0x00000000
 216 data4  0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
 217 data4  0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
 218 data4  0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
 219 data4  0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
 220 data4  0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
 221 LOCAL_OBJECT_END(Constants_Q)
 222
 223 // Z1 - 16 bit fixed
 224 LOCAL_OBJECT_START(Constants_Z_1)
 225 data4  0x00008000
 226 data4  0x00007879
 227 data4  0x000071C8
 228 data4  0x00006BCB
 229 data4  0x00006667
 230 data4  0x00006187
 231 data4  0x00005D18
 232 data4  0x0000590C
 233 data4  0x00005556
 234 data4  0x000051EC
 235 data4  0x00004EC5
 236 data4  0x00004BDB
 237 data4  0x00004925
 238 data4  0x0000469F
 239 data4  0x00004445
 240 data4  0x00004211
 241 LOCAL_OBJECT_END(Constants_Z_1)
 242
 243 // G1 and H1 - IEEE single and h1 - IEEE double
 244 LOCAL_OBJECT_START(Constants_G_H_h1)
 245 data4  0x3F800000,0x00000000
 246 data8  0x0000000000000000
 247 data4  0x3F70F0F0,0x3D785196
 248 data8  0x3DA163A6617D741C
 249 data4  0x3F638E38,0x3DF13843
 250 data8  0x3E2C55E6CBD3D5BB
 251 data4  0x3F579430,0x3E2FF9A0
 252 data8  0xBE3EB0BFD86EA5E7
 253 data4  0x3F4CCCC8,0x3E647FD6
 254 data8  0x3E2E6A8C86B12760
 255 data4  0x3F430C30,0x3E8B3AE7
 256 data8  0x3E47574C5C0739BA
 257 data4  0x3F3A2E88,0x3EA30C68
 258 data8  0x3E20E30F13E8AF2F
 259 data4  0x3F321640,0x3EB9CEC8
 260 data8  0xBE42885BF2C630BD
 261 data4  0x3F2AAAA8,0x3ECF9927
 262 data8  0x3E497F3497E577C6
 263 data4  0x3F23D708,0x3EE47FC5
 264 data8  0x3E3E6A6EA6B0A5AB
 265 data4  0x3F1D89D8,0x3EF8947D
 266 data8  0xBDF43E3CD328D9BE
 267 data4  0x3F17B420,0x3F05F3A1
 268 data8  0x3E4094C30ADB090A
 269 data4  0x3F124920,0x3F0F4303
 270 data8  0xBE28FBB2FC1FE510
 271 data4  0x3F0D3DC8,0x3F183EBF
 272 data8  0x3E3A789510FDE3FA
 273 data4  0x3F088888,0x3F20EC80
 274 data8  0x3E508CE57CC8C98F
 275 data4  0x3F042108,0x3F29516A
 276 data8  0xBE534874A223106C
 277 LOCAL_OBJECT_END(Constants_G_H_h1)
 278
 279 // Z2 - 16 bit fixed
 280 LOCAL_OBJECT_START(Constants_Z_2)
 281 data4  0x00008000
 282 data4  0x00007F81
 283 data4  0x00007F02
 284 data4  0x00007E85
 285 data4  0x00007E08
 286 data4  0x00007D8D
 287 data4  0x00007D12
 288 data4  0x00007C98
 289 data4  0x00007C20
 290 data4  0x00007BA8
 291 data4  0x00007B31
 292 data4  0x00007ABB
 293 data4  0x00007A45
 294 data4  0x000079D1
 295 data4  0x0000795D
 296 data4  0x000078EB
 297 LOCAL_OBJECT_END(Constants_Z_2)
 298
 299 // G2 and H2 - IEEE single and h2 - IEEE double
 300 LOCAL_OBJECT_START(Constants_G_H_h2)
 301 data4  0x3F800000,0x00000000
 302 data8  0x0000000000000000
 303 data4  0x3F7F00F8,0x3B7F875D
 304 data8  0x3DB5A11622C42273
 305 data4  0x3F7E03F8,0x3BFF015B
 306 data8  0x3DE620CF21F86ED3
 307 data4  0x3F7D08E0,0x3C3EE393
 308 data8  0xBDAFA07E484F34ED
 309 data4  0x3F7C0FC0,0x3C7E0586
 310 data8  0xBDFE07F03860BCF6
 311 data4  0x3F7B1880,0x3C9E75D2
 312 data8  0x3DEA370FA78093D6
 313 data4  0x3F7A2328,0x3CBDC97A
 314 data8  0x3DFF579172A753D0
 315 data4  0x3F792FB0,0x3CDCFE47
 316 data8  0x3DFEBE6CA7EF896B
 317 data4  0x3F783E08,0x3CFC15D0
 318 data8  0x3E0CF156409ECB43
 319 data4  0x3F774E38,0x3D0D874D
 320 data8  0xBE0B6F97FFEF71DF
 321 data4  0x3F766038,0x3D1CF49B
 322 data8  0xBE0804835D59EEE8
 323 data4  0x3F757400,0x3D2C531D
 324 data8  0x3E1F91E9A9192A74
 325 data4  0x3F748988,0x3D3BA322
 326 data8  0xBE139A06BF72A8CD
 327 data4  0x3F73A0D0,0x3D4AE46F
 328 data8  0x3E1D9202F8FBA6CF
 329 data4  0x3F72B9D0,0x3D5A1756
 330 data8  0xBE1DCCC4BA796223
 331 data4  0x3F71D488,0x3D693B9D
 332 data8  0xBE049391B6B7C239
 333 LOCAL_OBJECT_END(Constants_G_H_h2)
 334
 335 // G3 and H3 - IEEE single and h3 - IEEE double
 336 LOCAL_OBJECT_START(Constants_G_H_h3)
 337 data4  0x3F7FFC00,0x38800100
 338 data8  0x3D355595562224CD
 339 data4  0x3F7FF400,0x39400480
 340 data8  0x3D8200A206136FF6
 341 data4  0x3F7FEC00,0x39A00640
 342 data8  0x3DA4D68DE8DE9AF0
 343 data4  0x3F7FE400,0x39E00C41
 344 data8  0xBD8B4291B10238DC
 345 data4  0x3F7FDC00,0x3A100A21
 346 data8  0xBD89CCB83B1952CA
 347 data4  0x3F7FD400,0x3A300F22
 348 data8  0xBDB107071DC46826
 349 data4  0x3F7FCC08,0x3A4FF51C
 350 data8  0x3DB6FCB9F43307DB
 351 data4  0x3F7FC408,0x3A6FFC1D
 352 data8  0xBD9B7C4762DC7872
 353 data4  0x3F7FBC10,0x3A87F20B
 354 data8  0xBDC3725E3F89154A
 355 data4  0x3F7FB410,0x3A97F68B
 356 data8  0xBD93519D62B9D392
 357 data4  0x3F7FAC18,0x3AA7EB86
 358 data8  0x3DC184410F21BD9D
 359 data4  0x3F7FA420,0x3AB7E101
 360 data8  0xBDA64B952245E0A6
 361 data4  0x3F7F9C20,0x3AC7E701
 362 data8  0x3DB4B0ECAABB34B8
 363 data4  0x3F7F9428,0x3AD7DD7B
 364 data8  0x3D9923376DC40A7E
 365 data4  0x3F7F8C30,0x3AE7D474
 366 data8  0x3DC6E17B4F2083D3
 367 data4  0x3F7F8438,0x3AF7CBED
 368 data8  0x3DAE314B811D4394
 369 data4  0x3F7F7C40,0x3B03E1F3
 370 data8  0xBDD46F21B08F2DB1
 371 data4  0x3F7F7448,0x3B0BDE2F
 372 data8  0xBDDC30A46D34522B
 373 data4  0x3F7F6C50,0x3B13DAAA
 374 data8  0x3DCB0070B1F473DB
 375 data4  0x3F7F6458,0x3B1BD766
 376 data8  0xBDD65DDC6AD282FD
 377 data4  0x3F7F5C68,0x3B23CC5C
 378 data8  0xBDCDAB83F153761A
 379 data4  0x3F7F5470,0x3B2BC997
 380 data8  0xBDDADA40341D0F8F
 381 data4  0x3F7F4C78,0x3B33C711
 382 data8  0x3DCD1BD7EBC394E8
 383 data4  0x3F7F4488,0x3B3BBCC6
 384 data8  0xBDC3532B52E3E695
 385 data4  0x3F7F3C90,0x3B43BAC0
 386 data8  0xBDA3961EE846B3DE
 387 data4  0x3F7F34A0,0x3B4BB0F4
 388 data8  0xBDDADF06785778D4
 389 data4  0x3F7F2CA8,0x3B53AF6D
 390 data8  0x3DCC3ED1E55CE212
 391 data4  0x3F7F24B8,0x3B5BA620
 392 data8  0xBDBA31039E382C15
 393 data4  0x3F7F1CC8,0x3B639D12
 394 data8  0x3D635A0B5C5AF197
 395 data4  0x3F7F14D8,0x3B6B9444
 396 data8  0xBDDCCB1971D34EFC
 397 data4  0x3F7F0CE0,0x3B7393BC
 398 data8  0x3DC7450252CD7ADA
 399 data4  0x3F7F04F0,0x3B7B8B6D
 400 data8  0xBDB68F177D7F2A42
 401 LOCAL_OBJECT_END(Constants_G_H_h3)
 402
 403 // Assembly macros
 404 //==============================================================
 405
 406 // Floating Point Registers
 407
 408 FR_Arg          = f8
 409 FR_Res          = f8
 410 FR_AX           = f32
 411 FR_XLog_Hi      = f33
 412 FR_XLog_Lo      = f34
 413
 414     // Special logl registers
 415 FR_Y_hi         = f35
 416 FR_Y_lo         = f36
 417
 418 FR_Scale        = f37
 419 FR_X_Prime      = f38
 420 FR_S_hi         = f39
 421 FR_W            = f40
 422 FR_G            = f41
 423
 424 FR_H            = f42
 425 FR_wsq          = f43
 426 FR_w4           = f44
 427 FR_h            = f45
 428 FR_w6           = f46
 429
 430 FR_G2           = f47
 431 FR_H2           = f48
 432 FR_poly_lo      = f49
 433 FR_P8           = f50
 434 FR_poly_hi      = f51
 435
 436 FR_P7           = f52
 437 FR_h2           = f53
 438 FR_rsq          = f54
 439 FR_P6           = f55
 440 FR_r            = f56
 441
 442 FR_log2_hi      = f57
 443 FR_log2_lo      = f58
 444
 445 FR_float_N      = f59
 446 FR_Q4           = f60
 447
 448 FR_G3           = f61
 449 FR_H3           = f62
 450 FR_h3           = f63
 451
 452 FR_Q3           = f64
 453 FR_Q2           = f65
 454 FR_1LN10_hi     = f66
 455
 456 FR_Q1           = f67
 457 FR_1LN10_lo     = f68
 458 FR_P5           = f69
 459 FR_rcub         = f70
 460
 461 FR_Neg_One      = f71
 462 FR_Z            = f72
 463 FR_AA           = f73
 464 FR_BB           = f74
 465 FR_S_lo         = f75
 466 FR_2_to_minus_N = f76
 467
 468
 469     // Huge & Main path prolog registers
 470 FR_Half         = f77
 471 FR_Two          = f78
 472 FR_X2           = f79
 473 FR_P2           = f80
 474 FR_P2L          = f81
 475 FR_Rcp          = f82
 476 FR_GG           = f83
 477 FR_HH           = f84
 478 FR_EE           = f85
 479 FR_DD           = f86
 480 FR_GL           = f87
 481 FR_A            = f88
 482 FR_AL           = f89
 483 FR_B            = f90
 484 FR_BL           = f91
 485 FR_Tmp          = f92
 486
 487     // Near 0 & Huges path prolog registers
 488 FR_C3           = f93
 489 FR_C5           = f94
 490 FR_C7           = f95
 491 FR_C9           = f96
 492
 493 FR_X3           = f97
 494 FR_X4           = f98
 495 FR_P9           = f99
 496 FR_P5           = f100
 497 FR_P3           = f101
 498
 499
 500 // General Purpose Registers
 501
 502     // General prolog registers
 503 GR_PFS          = r32
 504 GR_TwoN7        = r40
 505 GR_TwoP63       = r41
 506 GR_ExpMask      = r42
 507 GR_ArgExp       = r43
 508 GR_Half         = r44
 509
 510     // Near 0 path prolog registers
 511 GR_Poly_C_35    = r45
 512 GR_Poly_C_79    = r46
 513
 514     // Special logl registers
 515 GR_Index1       = r34
 516 GR_Index2       = r35
 517 GR_signif       = r36
 518 GR_X_0          = r37
 519 GR_X_1          = r38
 520 GR_X_2          = r39
 521 GR_Z_1          = r40
 522 GR_Z_2          = r41
 523 GR_N            = r42
 524 GR_Bias         = r43
 525 GR_M            = r44
 526 GR_Index3       = r45
 527 GR_exp_2tom80   = r45
 528 GR_exp_mask     = r47
 529 GR_exp_2tom7    = r48
 530 GR_ad_ln10      = r49
 531 GR_ad_tbl_1     = r50
 532 GR_ad_tbl_2     = r51
 533 GR_ad_tbl_3     = r52
 534 GR_ad_q         = r53
 535 GR_ad_z_1       = r54
 536 GR_ad_z_2       = r55
 537 GR_ad_z_3       = r56
 538 GR_minus_N      = r57
 539
 540
 541
 542 .section .text
 543 GLOBAL_LIBM_ENTRY(asinhl)
 544
 545 { .mfi
 546       alloc     GR_PFS        = ar.pfs,0,27,0,0
 547       fma.s1    FR_P2         = FR_Arg, FR_Arg, f1  // p2 = x^2 + 1
 548       mov       GR_Half       = 0xfffe              // 0.5's exp
 549 }
 550 { .mfi
 551       addl      GR_Poly_C_79  = @ltoff(Poly_C_near_0_79), gp // C7, C9 coeffs
 552       fma.s1    FR_X2         = FR_Arg, FR_Arg, f0           // Obtain x^2
 553       addl      GR_Poly_C_35  = @ltoff(Poly_C_near_0_35), gp // C3, C5 coeffs
 554 };;
 555
 556 { .mfi
 557       getf.exp  GR_ArgExp     = FR_Arg        // get arument's exponent
 558       fabs      FR_AX         = FR_Arg        // absolute value of argument
 559       mov       GR_TwoN7      = 0xfff8        // 2^-7 exp
 560 }
 561 { .mfi
 562       ld8       GR_Poly_C_79  = [GR_Poly_C_79] // get actual coeff table address
 563       fma.s0       FR_Two        = f1, f1, f1        // construct 2.0
 564       mov       GR_ExpMask    = 0x1ffff        // mask for exp
 565 };;
 566
 567 { .mfi
 568       ld8       GR_Poly_C_35  = [GR_Poly_C_35] // get actual coeff table address
 569       fclass.m  p6,p0         = FR_Arg, 0xe7   // if arg NaN inf zero
 570       mov       GR_TwoP63     = 0x1003e        // 2^63 exp
 571 }
 572 { .mfi
 573       addl      GR_ad_z_1     = @ltoff(Constants_Z_1#),gp
 574       nop.f 0
 575       nop.i 0
 576 };;
 577
 578 { .mfi
 579       setf.exp  FR_Half       = GR_Half              // construct 0.5
 580       fclass.m  p7,p0         = FR_Arg, 0x09  //  if arg + denorm
 581       and       GR_ArgExp     = GR_ExpMask, GR_ArgExp // select exp
 582 }
 583 { .mfb
 584       ld8       GR_ad_z_1     = [GR_ad_z_1]   // Get pointer to Constants_Z_1
 585       nop.f 0
 586       nop.b 0
 587 };;
 588 { .mfi
 589       ldfe      FR_C9         = [GR_Poly_C_79],16 // load C9
 590       fclass.m  p10,p0        = FR_Arg, 0x0a    //  if arg - denorm
 591       cmp.gt    p8, p0        = GR_TwoN7,  GR_ArgExp // if arg < 2^-7 ('near 0')
 592 }
 593 { .mfb
 594       cmp.le    p9, p0        = GR_TwoP63, GR_ArgExp  // if arg > 2^63 ('huges')
 595 (p6)  fma.s0    FR_Res        = FR_Arg,f1,FR_Arg     // r = a + a
 596 (p6)  br.ret.spnt b0                            // return
 597 };;
 598 // (X^2 + 1) computation
 599 { .mfi
 600 (p8)  ldfe      FR_C5         = [GR_Poly_C_35],16        // load C5
 601       fms.s1    FR_Tmp        = f1, f1, FR_P2           // Tmp = 1 - p2
 602       add       GR_ad_tbl_1   = 0x040, GR_ad_z_1    // Point to Constants_G_H_h1
 603 }
 604 { .mfb
 605 (p8)  ldfe      FR_C7         = [GR_Poly_C_79],16        // load C7
 606 (p7)  fnma.s0   FR_Res        =  FR_Arg,FR_Arg,FR_Arg // r = a - a*a
 607 (p7)  br.ret.spnt b0                              // return
 608 };;
 609
 610 { .mfi
 611 (p8)  ldfe      FR_C3         = [GR_Poly_C_35],16     // load C3
 612       fcmp.lt.s1 p11, p12      = FR_Arg, f0     // if arg is negative
 613       add       GR_ad_q       = -0x60, GR_ad_z_1    // Point to Constants_P
 614 }
 615 { .mfb
 616       add       GR_ad_z_2     = 0x140, GR_ad_z_1    // Point to Constants_Z_2
 617 (p10) fma.s0    FR_Res        =  FR_Arg,FR_Arg,FR_Arg // r = a + a*a
 618 (p10) br.ret.spnt b0                             // return
 619 };;
 620
 621 { .mfi
 622       add       GR_ad_tbl_2   = 0x180, GR_ad_z_1    // Point to Constants_G_H_h2
 623       frsqrta.s1 FR_Rcp, p0   = FR_P2           // Rcp = 1/p2 reciprocal appr.
 624       add       GR_ad_tbl_3   = 0x280, GR_ad_z_1    // Point to Constants_G_H_h3
 625 }
 626 { .mfi
 627       nop.m 0
 628       fms.s1    FR_P2L        = FR_AX, FR_AX, FR_X2 //low part of p2=fma(X*X-p2)
 629       mov       GR_Bias       = 0x0FFFF            // Create exponent bias
 630 };;
 631
 632 { .mfb
 633       nop.m 0
 634 (p9)  fms.s1    FR_XLog_Hi    = FR_Two, FR_AX, f0  // Hi  of log1p arg = 2*X - 1
 635 (p9)  br.cond.spnt huges_logl                      // special version of log1p
 636 };;
 637
 638 { .mfb
 639       ldfe      FR_log2_hi    = [GR_ad_q],16      // Load log2_hi
 640 (p8)  fma.s1    FR_X3         = FR_X2, FR_Arg, f0        // x^3 = x^2 * x
 641 (p8)  br.cond.spnt near_0                                // Go to near 0 branch
 642 };;
 643
 644 { .mfi
 645       ldfe      FR_log2_lo    = [GR_ad_q],16      // Load log2_lo
 646       nop.f 0
 647       nop.i 0
 648 };;
 649
 650 { .mfi
 651       ldfe      FR_Q4         = [GR_ad_q],16          // Load Q4
 652       fma.s1    FR_Tmp        = FR_Tmp, f1, FR_X2       // Tmp = Tmp + x^2
 653       mov       GR_exp_mask   = 0x1FFFF        // Create exponent mask
 654 };;
 655
 656 { .mfi
 657       ldfe      FR_Q3         = [GR_ad_q],16   // Load Q3
 658       fma.s1    FR_GG         = FR_Rcp, FR_P2, f0        // g = Rcp * p2
 659                                                // 8 bit Newton Raphson iteration
 660       nop.i 0
 661 }
 662 { .mfi
 663       nop.m 0
 664       fma.s1    FR_HH         = FR_Half, FR_Rcp, f0      // h = 0.5 * Rcp
 665       nop.i 0
 666 };;
 667 { .mfi
 668       ldfe      FR_Q2         = [GR_ad_q],16      // Load Q2
 669       fnma.s1   FR_EE         = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h
 670       nop.i 0
 671 }
 672 { .mfi
 673       nop.m 0
 674       fma.s1    FR_P2L        = FR_Tmp, f1, FR_P2L // low part of p2 = Tmp + p2l
 675       nop.i 0
 676 };;
 677
 678 { .mfi
 679       ldfe      FR_Q1         = [GR_ad_q]                // Load Q1
 680       fma.s1    FR_GG         = FR_GG, FR_EE, FR_GG     // g = g * e + g
 681                                               // 16 bit Newton Raphson iteration
 682       nop.i 0
 683 }
 684 { .mfi
 685       nop.m 0
 686       fma.s1    FR_HH         = FR_HH, FR_EE, FR_HH     // h = h * e + h
 687       nop.i 0
 688 };;
 689
 690 { .mfi
 691       nop.m 0
 692       fnma.s1   FR_EE         = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h
 693       nop.i 0
 694 };;
 695
 696 { .mfi
 697       nop.m 0
 698       fma.s1    FR_GG         = FR_GG, FR_EE, FR_GG     // g = g * e + g
 699                                               // 32 bit Newton Raphson iteration
 700       nop.i 0
 701 }
 702 { .mfi
 703       nop.m 0
 704       fma.s1    FR_HH         = FR_HH, FR_EE, FR_HH     // h = h * e + h
 705       nop.i 0
 706 };;
 707
 708 { .mfi
 709       nop.m 0
 710       fnma.s1   FR_EE         = FR_GG, FR_HH, FR_Half   // e = 0.5 - g * h
 711       nop.i 0
 712 };;
 713
 714 { .mfi
 715       nop.m 0
 716       fma.s1    FR_GG         = FR_GG, FR_EE, FR_GG     // g = g * e + g
 717                                               // 64 bit Newton Raphson iteration
 718       nop.i 0
 719 }
 720 { .mfi
 721       nop.m 0
 722       fma.s1    FR_HH         = FR_HH, FR_EE, FR_HH     // h = h * e + h
 723       nop.i 0
 724 };;
 725
 726 { .mfi
 727       nop.m 0
 728       fnma.s1   FR_DD         = FR_GG, FR_GG, FR_P2  // Remainder d = g * g - p2
 729       nop.i 0
 730 }
 731 { .mfi
 732       nop.m 0
 733       fma.s1    FR_XLog_Hi     = FR_AX, f1, FR_GG // bh = z + gh
 734       nop.i 0
 735 };;
 736
 737 { .mfi
 738       nop.m 0
 739       fma.s1    FR_DD         = FR_DD, f1, FR_P2L       // add p2l: d = d + p2l
 740       nop.i 0
 741 };;
 742
 743
 744 { .mfi
 745       getf.sig  GR_signif     = FR_XLog_Hi     // Get significand of x+1
 746       fmerge.ns FR_Neg_One    = f1, f1         // Form -1.0
 747       mov       GR_exp_2tom7  = 0x0fff8        // Exponent of 2^-7
 748 };;
 749
 750 { .mfi
 751       nop.m 0
 752       fma.s1    FR_GL         = FR_DD, FR_HH, f0        // gl = d * h
 753       extr.u    GR_Index1     = GR_signif, 59, 4    // Get high 4 bits of signif
 754 }
 755 { .mfi
 756       nop.m 0
 757       fma.s1    FR_XLog_Hi     = FR_DD,  FR_HH, FR_XLog_Hi // bh = bh + gl
 758       nop.i 0
 759 };;
 760
 761 { .mmi
 762       shladd    GR_ad_z_1     = GR_Index1, 2, GR_ad_z_1  // Point to Z_1
 763       shladd    GR_ad_tbl_1   = GR_Index1, 4, GR_ad_tbl_1  // Point to G_1
 764       extr.u    GR_X_0        = GR_signif, 49, 15 // Get high 15 bits of signif.
 765 };;
 766
 767 { .mmi
 768       ld4       GR_Z_1        = [GR_ad_z_1]    // Load Z_1
 769       nop.m 0
 770       nop.i 0
 771 };;
 772
 773 { .mmi
 774       ldfps     FR_G, FR_H    = [GR_ad_tbl_1],8     // Load G_1, H_1
 775       nop.m 0
 776       nop.i 0
 777 };;
 778
 779 { .mfi
 780       nop.m 0
 781       fms.s1    FR_XLog_Lo     = FR_GG,  f1,   FR_XLog_Hi // bl = gh - bh
 782       pmpyshr2.u GR_X_1       = GR_X_0,GR_Z_1,15  // Get bits 30-15 of X_0 * Z_1
 783 };;
 784
 785 // WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
 786 // "DEAD" ZONE!
 787
 788 { .mfi
 789       nop.m 0
 790       nop.f 0
 791       nop.i 0
 792 };;
 793
 794 { .mfi
 795       nop.m 0
 796       fmerge.se FR_S_hi       =  f1,FR_XLog_Hi            // Form |x+1|
 797       nop.i 0
 798 };;
 799
 800 { .mmi
 801       getf.exp  GR_N          =  FR_XLog_Hi    // Get N = exponent of x+1
 802       ldfd      FR_h          = [GR_ad_tbl_1]        // Load h_1
 803       nop.i 0
 804 };;
 805
 806 { .mfi
 807       nop.m 0
 808       nop.f 0
 809       extr.u    GR_Index2     = GR_X_1, 6, 4      // Extract bits 6-9 of X_1
 810 };;
 811
 812
 813 { .mfi
 814       shladd    GR_ad_tbl_2   = GR_Index2, 4, GR_ad_tbl_2  // Point to G_2
 815       fma.s1    FR_XLog_Lo    = FR_XLog_Lo, f1, FR_AX // bl = bl + x
 816       mov       GR_exp_2tom80 = 0x0ffaf           // Exponent of 2^-80
 817 }
 818 { .mfi
 819       shladd    GR_ad_z_2     = GR_Index2, 2, GR_ad_z_2  // Point to Z_2
 820       nop.f 0
 821       sub       GR_N          = GR_N, GR_Bias // sub bias from exp
 822 };;
 823
 824 { .mmi
 825       ldfps     FR_G2, FR_H2  = [GR_ad_tbl_2],8       // Load G_2, H_2
 826       ld4       GR_Z_2        = [GR_ad_z_2]                // Load Z_2
 827       sub       GR_minus_N    = GR_Bias, GR_N         // Form exponent of 2^(-N)
 828 };;
 829
 830 { .mmi
 831       ldfd      FR_h2         = [GR_ad_tbl_2]             // Load h_2
 832       nop.m 0
 833       nop.i 0
 834 };;
 835
 836 { .mmi
 837       setf.sig  FR_float_N    = GR_N        // Put integer N into rightmost sign
 838       setf.exp  FR_2_to_minus_N = GR_minus_N   // Form 2^(-N)
 839       pmpyshr2.u GR_X_2       = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
 840 };;
 841
 842 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
 843 // BECAUSE OF POSSIBLE 10 CLOCKS STALL!
 844 // So we can negate Q coefficients there for negative values
 845
 846 { .mfi
 847       nop.m 0
 848 (p11) fma.s1    FR_Q1         = FR_Q1, FR_Neg_One, f0 // Negate Q1
 849       nop.i 0
 850 }
 851 { .mfi
 852       nop.m 0
 853       fma.s1    FR_XLog_Lo     = FR_XLog_Lo, f1, FR_GL // bl = bl + gl
 854       nop.i 0
 855 };;
 856
 857 { .mfi
 858       nop.m 0
 859 (p11) fma.s1    FR_Q2         = FR_Q2, FR_Neg_One, f0 // Negate Q2
 860       nop.i 0
 861 };;
 862
 863 { .mfi
 864       nop.m 0
 865 (p11) fma.s1    FR_Q3         = FR_Q3, FR_Neg_One, f0 // Negate Q3
 866       nop.i 0
 867 };;
 868
 869 { .mfi
 870       nop.m 0
 871 (p11) fma.s1    FR_Q4         = FR_Q4, FR_Neg_One, f0 // Negate Q4
 872       extr.u    GR_Index3     = GR_X_2, 1, 5         // Extract bits 1-5 of X_2
 873 };;
 874
 875 { .mfi
 876       shladd    GR_ad_tbl_3   = GR_Index3, 4, GR_ad_tbl_3  // Point to G_3
 877       nop.f 0
 878       nop.i 0
 879 };;
 880
 881 { .mfi
 882       ldfps     FR_G3, FR_H3  = [GR_ad_tbl_3],8   // Load G_3, H_3
 883       nop.f 0
 884       nop.i 0
 885 };;
 886
 887 { .mfi
 888       ldfd      FR_h3         = [GR_ad_tbl_3]            // Load h_3
 889           fcvt.xf   FR_float_N    = FR_float_N
 890       nop.i 0
 891 };;
 892
 893 { .mfi
 894       nop.m 0
 895       fmpy.s1   FR_G          = FR_G, FR_G2              // G = G_1 * G_2
 896       nop.i 0
 897 }
 898 { .mfi
 899       nop.m 0
 900       fadd.s1   FR_H          = FR_H, FR_H2              // H = H_1 + H_2
 901       nop.i 0
 902 };;
 903
 904 { .mfi
 905       nop.m 0
 906       fadd.s1   FR_h          = FR_h, FR_h2              // h = h_1 + h_2
 907       nop.i 0
 908 }
 909 { .mfi
 910       nop.m 0
 911       fma.s1    FR_S_lo       = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=S_lo*2^-N
 912       nop.i 0
 913 };;
 914
 915 { .mfi
 916       nop.m 0
 917       fmpy.s1   FR_G          = FR_G, FR_G3             // G = (G_1 * G_2) * G_3
 918       nop.i 0
 919 }
 920 { .mfi
 921       nop.m 0
 922       fadd.s1   FR_H          = FR_H, FR_H3             // H = (H_1 + H_2) + H_3
 923       nop.i 0
 924 };;
 925
 926 { .mfi
 927       nop.m 0
 928       fadd.s1   FR_h          = FR_h, FR_h3             // h = (h_1 + h_2) + h_3
 929       nop.i 0
 930 };;
 931
 932 { .mfi
 933       nop.m 0
 934       fms.s1    FR_r          = FR_G, FR_S_hi, f1           // r = G * S_hi - 1
 935       nop.i 0
 936 }
 937 { .mfi
 938       nop.m 0
 939       fma.s1    FR_Y_hi       = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
 940       nop.i 0
 941 };;
 942
 943 { .mfi
 944       nop.m 0
 945       fma.s1    FR_h          = FR_float_N, FR_log2_lo, FR_h  // h=N*log2_lo+h
 946       nop.i 0
 947 }
 948 { .mfi
 949       nop.m 0
 950       fma.s1    FR_r          = FR_G, FR_S_lo, FR_r  // r=G*S_lo+(G*S_hi-1)
 951       nop.i 0
 952 };;
 953
 954 { .mfi
 955       nop.m 0
 956       fma.s1    FR_poly_lo    = FR_r, FR_Q4, FR_Q3      // poly_lo = r * Q4 + Q3
 957       nop.i 0
 958 }
 959 { .mfi
 960       nop.m 0
 961       fmpy.s1   FR_rsq        = FR_r, FR_r              // rsq = r * r
 962       nop.i 0
 963 };;
 964
 965 { .mfi
 966       nop.m 0
 967       fma.s1    FR_poly_lo    = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
 968       nop.i 0
 969 }
 970 { .mfi
 971       nop.m 0
 972       fma.s1    FR_rcub       = FR_rsq, FR_r, f0        // rcub = r^3
 973       nop.i 0
 974 };;
 975
 976 .pred.rel "mutex",p12,p11
 977 { .mfi
 978       nop.m 0
 979 (p12) fma.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
 980       nop.i 0
 981 }
 982 { .mfi
 983       nop.m 0
 984 (p11) fms.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r     // poly_hi = Q1*rsq + r
 985       nop.i 0
 986 };;
 987
 988
 989 .pred.rel "mutex",p12,p11
 990 { .mfi
 991       nop.m 0
 992 (p12) fma.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
 993       nop.i 0
 994 }
 995 { .mfi
 996       nop.m 0
 997 (p11) fms.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
 998       nop.i 0
 999 }
1000 ;;
1001
1002 { .mfi
1003       nop.m 0
1004       fadd.s0   FR_Y_lo       = FR_poly_hi, FR_poly_lo
1005                                                              // Y_lo=poly_hi+poly_lo
1006       nop.i 0
1007 }
1008 { .mfi
1009       nop.m 0
1010 (p11) fma.s0    FR_Y_hi       = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
1011       nop.i 0
1012 };;
1013
1014 { .mfb
1015       nop.m 0
1016       fadd.s0   FR_Res        = FR_Y_lo,FR_Y_hi    // Result=Y_lo+Y_hi
1017       br.ret.sptk   b0                         // Common exit for 2^-7 < x < inf
1018 };;
1019
1020 // * SPECIAL VERSION OF LOGL FOR HUGE ARGUMENTS *
1021
1022 huges_logl:
1023 { .mfi
1024       getf.sig  GR_signif     = FR_XLog_Hi     // Get significand of x+1
1025       fmerge.ns FR_Neg_One    = f1, f1         // Form -1.0
1026       mov       GR_exp_2tom7  = 0x0fff8        // Exponent of 2^-7
1027 };;
1028
1029 { .mfi
1030       add       GR_ad_tbl_1   = 0x040, GR_ad_z_1    // Point to Constants_G_H_h1
1031       nop.f 0
1032       add       GR_ad_q       = -0x60, GR_ad_z_1    // Point to Constants_P
1033 }
1034 { .mfi
1035       add       GR_ad_z_2     = 0x140, GR_ad_z_1    // Point to Constants_Z_2
1036       nop.f 0
1037       add       GR_ad_tbl_2   = 0x180, GR_ad_z_1    // Point to Constants_G_H_h2
1038 };;
1039
1040 { .mfi
1041       nop.m 0
1042       nop.f 0
1043       extr.u    GR_Index1     = GR_signif, 59, 4    // Get high 4 bits of signif
1044 }
1045 { .mfi
1046       add       GR_ad_tbl_3   = 0x280, GR_ad_z_1    // Point to Constants_G_H_h3
1047       nop.f 0
1048       nop.i 0
1049 };;
1050
1051 { .mfi
1052       shladd    GR_ad_z_1     = GR_Index1, 2, GR_ad_z_1  // Point to Z_1
1053       nop.f 0
1054       extr.u    GR_X_0        = GR_signif, 49, 15 // Get high 15 bits of signif.
1055 };;
1056
1057 { .mfi
1058       ld4       GR_Z_1        = [GR_ad_z_1]    // Load Z_1
1059       nop.f 0
1060       mov       GR_exp_mask   = 0x1FFFF        // Create exponent mask
1061 }
1062 { .mfi
1063       shladd    GR_ad_tbl_1   = GR_Index1, 4, GR_ad_tbl_1  // Point to G_1
1064       nop.f 0
1065       mov       GR_Bias       = 0x0FFFF            // Create exponent bias
1066 };;
1067
1068 { .mfi
1069       ldfps     FR_G, FR_H    = [GR_ad_tbl_1],8     // Load G_1, H_1
1070       fmerge.se FR_S_hi       =  f1,FR_XLog_Hi            // Form |x+1|
1071       nop.i 0
1072 };;
1073
1074 { .mmi
1075       getf.exp  GR_N          =  FR_XLog_Hi          // Get N = exponent of x+1
1076       ldfd      FR_h          = [GR_ad_tbl_1]        // Load h_1
1077       nop.i 0
1078 };;
1079
1080 { .mfi
1081       ldfe      FR_log2_hi    = [GR_ad_q],16      // Load log2_hi
1082       nop.f 0
1083       pmpyshr2.u GR_X_1       = GR_X_0,GR_Z_1,15  // Get bits 30-15 of X_0 * Z_1
1084 };;
1085
1086 // WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1087 // "DEAD" ZONE!
1088
1089 { .mmi
1090       ldfe      FR_log2_lo    = [GR_ad_q],16      // Load log2_lo
1091       sub       GR_N          = GR_N, GR_Bias
1092       mov       GR_exp_2tom80 = 0x0ffaf           // Exponent of 2^-80
1093 };;
1094
1095 { .mfi
1096       ldfe      FR_Q4         = [GR_ad_q],16          // Load Q4
1097       nop.f 0
1098       sub       GR_minus_N    = GR_Bias, GR_N         // Form exponent of 2^(-N)
1099 };;
1100
1101 { .mmf
1102       ldfe      FR_Q3         = [GR_ad_q],16   // Load Q3
1103       setf.sig  FR_float_N    = GR_N        // Put integer N into rightmost sign
1104       nop.f 0
1105 };;
1106
1107 { .mmi
1108       nop.m 0
1109       ldfe      FR_Q2         = [GR_ad_q],16      // Load Q2
1110       extr.u    GR_Index2     = GR_X_1, 6, 4      // Extract bits 6-9 of X_1
1111 };;
1112
1113 { .mmi
1114       ldfe      FR_Q1         = [GR_ad_q]                // Load Q1
1115       shladd    GR_ad_z_2     = GR_Index2, 2, GR_ad_z_2  // Point to Z_2
1116       nop.i 0
1117 };;
1118
1119 { .mmi
1120       ld4       GR_Z_2        = [GR_ad_z_2]                // Load Z_2
1121       shladd    GR_ad_tbl_2   = GR_Index2, 4, GR_ad_tbl_2  // Point to G_2
1122       nop.i 0
1123 };;
1124
1125 { .mmi
1126       ldfps     FR_G2, FR_H2  = [GR_ad_tbl_2],8       // Load G_2, H_2
1127       nop.m 0
1128       nop.i 0
1129 };;
1130
1131 { .mfi
1132       ldfd      FR_h2         = [GR_ad_tbl_2]             // Load h_2
1133       nop.f 0
1134       nop.i 0
1135 }
1136 { .mfi
1137       setf.exp  FR_2_to_minus_N = GR_minus_N   // Form 2^(-N)
1138       nop.f 0
1139       nop.i 0
1140 };;
1141
1142 { .mfi
1143       nop.m 0
1144       nop.f 0
1145       pmpyshr2.u GR_X_2       = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
1146 };;
1147
1148 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1149 // "DEAD" ZONE!
1150 // JUST HAVE TO INSERT 3 NOP CYCLES (nothing to do here)
1151
1152 { .mfi
1153       nop.m 0
1154       nop.f 0
1155       nop.i 0
1156 };;
1157
1158 { .mfi
1159       nop.m 0
1160       nop.f 0
1161       nop.i 0
1162 };;
1163
1164 { .mfi
1165       nop.m 0
1166       nop.f 0
1167       nop.i 0
1168 };;
1169
1170 { .mfi
1171       nop.m 0
1172 (p11) fma.s1    FR_Q4         = FR_Q4, FR_Neg_One, f0 // Negate Q4
1173       extr.u    GR_Index3     = GR_X_2, 1, 5          // Extract bits 1-5 of X_2
1174  };;
1175
1176 { .mfi
1177       shladd    GR_ad_tbl_3   = GR_Index3, 4, GR_ad_tbl_3  // Point to G_3
1178           fcvt.xf   FR_float_N    = FR_float_N
1179       nop.i 0
1180 }
1181 { .mfi
1182       nop.m 0
1183 (p11) fma.s1    FR_Q3         = FR_Q3, FR_Neg_One, f0 // Negate Q3
1184       nop.i 0
1185 };;
1186
1187 { .mfi
1188       ldfps     FR_G3, FR_H3  = [GR_ad_tbl_3],8   // Load G_3, H_3
1189 (p11) fma.s1    FR_Q2         = FR_Q2, FR_Neg_One, f0 // Negate Q2
1190       nop.i 0
1191 }
1192 { .mfi
1193       nop.m 0
1194 (p11) fma.s1    FR_Q1         = FR_Q1, FR_Neg_One, f0 // Negate Q1
1195       nop.i 0
1196 };;
1197
1198 { .mfi
1199       ldfd      FR_h3         = [GR_ad_tbl_3]            // Load h_3
1200       fmpy.s1   FR_G          = FR_G, FR_G2              // G = G_1 * G_2
1201       nop.i 0
1202 }
1203 { .mfi
1204       nop.m 0
1205       fadd.s1   FR_H          = FR_H, FR_H2              // H = H_1 + H_2
1206       nop.i 0
1207 };;
1208
1209 { .mmf
1210       nop.m 0
1211       nop.m 0
1212       fadd.s1   FR_h          = FR_h, FR_h2              // h = h_1 + h_2
1213 };;
1214
1215 { .mfi
1216       nop.m 0
1217       fmpy.s1   FR_G          = FR_G, FR_G3             // G = (G_1 * G_2) * G_3
1218       nop.i 0
1219 }
1220 { .mfi
1221       nop.m 0
1222       fadd.s1   FR_H          = FR_H, FR_H3             // H = (H_1 + H_2) + H_3
1223       nop.i 0
1224 };;
1225
1226 { .mfi
1227       nop.m 0
1228       fadd.s1   FR_h          = FR_h, FR_h3             // h = (h_1 + h_2) + h_3
1229       nop.i 0
1230 };;
1231
1232 { .mfi
1233       nop.m 0
1234       fms.s1    FR_r          = FR_G, FR_S_hi, f1           // r = G * S_hi - 1
1235       nop.i 0
1236 }
1237 { .mfi
1238       nop.m 0
1239       fma.s1    FR_Y_hi       = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
1240       nop.i 0
1241 };;
1242
1243 { .mfi
1244       nop.m 0
1245       fma.s1    FR_h          = FR_float_N, FR_log2_lo, FR_h  // h=N*log2_lo+h
1246       nop.i 0
1247 };;
1248
1249 { .mfi
1250       nop.m 0
1251       fma.s1    FR_poly_lo    = FR_r, FR_Q4, FR_Q3      // poly_lo = r * Q4 + Q3
1252       nop.i 0
1253 }
1254 { .mfi
1255       nop.m 0
1256       fmpy.s1   FR_rsq        = FR_r, FR_r              // rsq = r * r
1257       nop.i 0
1258 };;
1259
1260 { .mfi
1261       nop.m 0
1262       fma.s1    FR_poly_lo    = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
1263       nop.i 0
1264 }
1265 { .mfi
1266       nop.m 0
1267       fma.s1    FR_rcub       = FR_rsq, FR_r, f0        // rcub = r^3
1268       nop.i 0
1269 };;
1270
1271 .pred.rel "mutex",p12,p11
1272 { .mfi
1273       nop.m 0
1274 (p12) fma.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1275       nop.i 0
1276 }
1277 { .mfi
1278       nop.m 0
1279 (p11) fms.s1    FR_poly_hi    = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1280       nop.i 0
1281 };;
1282
1283
1284 .pred.rel "mutex",p12,p11
1285 { .mfi
1286       nop.m 0
1287 (p12) fma.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1288       nop.i 0
1289 }
1290 { .mfi
1291       nop.m 0
1292 (p11) fms.s1    FR_poly_lo    = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1293       nop.i 0
1294 };;
1295
1296 { .mfi
1297       nop.m 0
1298       fadd.s0   FR_Y_lo       = FR_poly_hi, FR_poly_lo  // Y_lo=poly_hi+poly_lo
1299       nop.i 0
1300 }
1301 { .mfi
1302       nop.m 0
1303 (p11) fma.s0    FR_Y_hi       = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
1304       nop.i 0
1305 };;
1306
1307 { .mfb
1308       nop.m 0
1309       fadd.s0   FR_Res        = FR_Y_lo,FR_Y_hi    // Result=Y_lo+Y_hi
1310       br.ret.sptk   b0                         // Common exit for 2^-7 < x < inf
1311 };;
1312
1313 // NEAR ZERO POLYNOMIAL INTERVAL
1314 near_0:
1315 { .mfi
1316       nop.m 0
1317       fma.s1    FR_X4         = FR_X2, FR_X2, f0 // x^4 = x^2 * x^2
1318       nop.i 0
1319 };;
1320
1321 { .mfi
1322       nop.m 0
1323       fma.s1    FR_P9         = FR_C9,FR_X2,FR_C7  // p9 = C9*x^2 + C7
1324       nop.i 0
1325 }
1326 { .mfi
1327       nop.m 0
1328       fma.s1    FR_P5         = FR_C5,FR_X2,FR_C3  // p5 = C5*x^2 + C3
1329       nop.i 0
1330 };;
1331
1332 { .mfi
1333       nop.m 0
1334       fma.s1    FR_P3         = FR_P9,FR_X4,FR_P5  // p3 = p9*x^4 + p5
1335       nop.i 0
1336 };;
1337
1338 { .mfb
1339       nop.m 0
1340       fma.s0    FR_Res        = FR_P3,FR_X3,FR_Arg // res = p3*C3 + x
1341       br.ret.sptk   b0                          // Near 0 path return
1342 };;
1343
1344 GLOBAL_LIBM_END(asinhl)
1345
1346
1347