ports/sysdeps/ia64/fpu/e_powl.S

   1 .file "powl.s"
   2
   3
   4 // Copyright (c) 2000 - 2003, Intel Corporation
   5 // All rights reserved.
   6 //
   7 // Contributed 2000 by the Intel Numerics Group, Intel Corporation
   8 //
   9 // Redistribution and use in source and binary forms, with or without
  10 // modification, are permitted provided that the following conditions are
  11 // met:
  12 //
  13 // * Redistributions of source code must retain the above copyright
  14 // notice, this list of conditions and the following disclaimer.
  15 //
  16 // * Redistributions in binary form must reproduce the above copyright
  17 // notice, this list of conditions and the following disclaimer in the
  18 // documentation and/or other materials provided with the distribution.
  19 //
  20 // * The name of Intel Corporation may not be used to endorse or promote
  21 // products derived from this software without specific prior written
  22 // permission.
  23
  24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35 //
  36 // Intel Corporation is the author of this code, and requests that all
  37 // problem reports or change requests be submitted to it directly at
  38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
  39 //
  40 //*********************************************************************
  41 //
  42 // Function:   powl(x,y), where
  43 //                          y
  44 //             powl(x,y) = x , for double extended precision x and y values
  45 //
  46 //*********************************************************************
  47 //
  48 // History:
  49 // 02/02/00 (Hand Optimized)
  50 // 04/04/00 Unwind support added
  51 // 08/15/00 Bundle added after call to __libm_error_support to properly
  52 //          set [the previously overwritten] GR_Parameter_RESULT.
  53 // 01/22/01 Corrected results for powl(1,inf), powl(1,nan), and
  54 //          powl(snan,0) to be 1 per C99, not nan.  Fixed many flag settings.
  55 // 02/06/01 Call __libm_error support if over/underflow when y=2.
  56 // 04/17/01 Support added for y close to 1 and x a non-special value.
  57 //          Shared software under/overflow detection for all paths
  58 // 02/07/02 Corrected sf3 setting to disable traps
  59 // 05/13/02 Improved performance of all paths
  60 // 02/10/03 Reordered header: .section, .global, .proc, .align;
  61 //          used data8 for long double table values
  62 // 04/17/03 Added missing mutex directive
  63 // 10/13/03 Corrected .endp names to match .proc names
  64 //
  65 //*********************************************************************
  66 //
  67 // Resources Used:
  68 //
  69 //    Floating-Point Registers:
  70 //                        f8  (Input x and Return Value)
  71 //                        f9  (Input y)
  72 //                        f10-f15,f32-f79
  73 //
  74 //    General Purpose Registers:
  75 //                        Locals r14-24,r32-r65
  76 //                        Parameters to __libm_error_support r62,r63,r64,r65
  77 //
  78 //    Predicate Registers: p6-p15
  79 //
  80 //*********************************************************************
  81 //
  82 //  Special Cases and IEEE special conditions:
  83 //
  84 //    Denormal fault raised on denormal inputs
  85 //    Overflow exceptions raised when appropriate for pow
  86 //    Underflow exceptions raised when appropriate for pow
  87 //    (Error Handling Routine called for overflow and Underflow)
  88 //    Inexact raised when appropriate by algorithm
  89 //
  90 //  1.  (anything) ** NatVal or (NatVal) ** anything  is NatVal
  91 //  2.  X or Y unsupported or sNaN                    is qNaN/Invalid
  92 //  3.  (anything) ** 0  is 1
  93 //  4.  (anything) ** 1  is itself
  94 //  5.  (anything except 1) ** qNAN is qNAN
  95 //  6.  qNAN ** (anything except 0) is qNAN
  96 //  7.  +-(|x| > 1) **  +INF is +INF
  97 //  8.  +-(|x| > 1) **  -INF is +0
  98 //  9.  +-(|x| < 1) **  +INF is +0
  99 //  10. +-(|x| < 1) **  -INF is +INF
 100 //  11. +-1         ** +-INF is +1
 101 //  12. +0 ** (+anything except 0, NAN)               is +0
 102 //  13. -0 ** (+anything except 0, NAN, odd integer)  is +0
 103 //  14. +0 ** (-anything except 0, NAN)               is +INF/div_0
 104 //  15. -0 ** (-anything except 0, NAN, odd integer)  is +INF/div_0
 105 //  16. -0 ** (odd integer) = -( +0 ** (odd integer) )
 106 //  17. +INF ** (+anything except 0,NAN) is +INF
 107 //  18. +INF ** (-anything except 0,NAN) is +0
 108 //  19. -INF ** (anything except NAN)  = -0 ** (-anything)
 109 //  20. (-anything) ** (integer) is (-1)**(integer)*(+anything**integer)
 110 //  21. (-anything except 0 and inf) ** (non-integer) is qNAN/Invalid
 111 //  22. X or Y denorm/unorm and denorm/unorm operand trap is enabled,
 112 //      generate denorm/unorm fault except if invalid or div_0 raised.
 113 //
 114 //*********************************************************************
 115 //
 116 //  Algorithm
 117 //  =========
 118 //
 119 //  Special Cases
 120 //
 121 //    If Y = 2,    return X*X.
 122 //    If Y = 0.5,  return sqrt(X).
 123 //
 124 //  Compute log(X) to extra precision.
 125 //
 126 //  ker_log_80( X, logX_hi, logX_lo, Safe );
 127 //
 128 //   ...logX_hi + logX_lo approximates log(X) to roughly 80
 129 //   ...significant bits of accuracy.
 130 //
 131 //  Compute Y*log(X) to extra precision.
 132 //
 133 //    P_hi := Y * logX_hi
 134 //    P_lo := Y * logX_hi - P_hi       ...using FMA
 135 //    P_lo := Y * logX_lo + P_lo       ...using FMA
 136 //
 137 //  Compute exp(P_hi + P_lo)
 138 //
 139 //    Flag := 2;
 140 //    Expo_Range := 2; (assuming double-extended power function)
 141 //    ker_exp_64( P_hi, P_lo, Flag, Expo_Range,
 142 //                Z_hi, Z_lo, scale, Safe )
 143 //
 144 //    scale := sgn * scale
 145 //
 146 //    If (Safe) then ...result will not over/underflow
 147 //       return scale*Z_hi + (scale*Z_lo)
 148 //       quickly
 149 //    Else
 150 //       take necessary precaution in computing
 151 //       scale*Z_hi + (scale*Z_lo)
 152 //       to set possible exceptions correctly.
 153 //    End If
 154 //
 155 //  Case_Y_Special
 156 //
 157 //   ...Follow the order of the case checks
 158 //
 159 //   If Y is +-0, return +1 without raising any exception.
 160 //   If Y is +1,  return X  without raising any exception.
 161 //   If Y is qNaN, return Y without exception.
 162 //   If X is qNaN, return X without exception.
 163 //
 164 //   At this point, X is real and Y is +-inf.
 165 //   Thus |X| can only be 1, strictly bigger than 1, or
 166 //   strictly less than 1.
 167 //
 168 //   If |X| < 1, then
 169 //   return ( Y == +inf?  +0 : +inf )
 170 //   elseif |X| > 1, then
 171 //   return ( Y == +inf? +0 : +inf )
 172 //   else
 173 //   goto Case_Invalid
 174 //
 175 //  Case_X_Special
 176 //
 177 //   ...Follow the order of the case checks
 178 //   ...Note that Y is real, finite, non-zero, and not +1.
 179 //
 180 //   If X is qNaN, return X without exception.
 181 //
 182 //   If X is +-0,
 183 //   return ( Y > 0 ? +0 : +inf )
 184 //
 185 //   If X is +inf
 186 //   return ( Y > 0 ? +inf : +0 )
 187 //
 188 //   If X is -inf
 189 //   return -0 ** -Y
 190 //   return ( Y > 0 ? +inf : +0 )
 191 //
 192 //  Case_Invalid
 193 //
 194 //   Return 0 * inf to generate a quiet NaN together
 195 //   with an invalid exception.
 196 //
 197 //  Implementation
 198 //  ==============
 199 //
 200 //   We describe the quick branch since this part is important
 201 //   in reaching the normal case efficiently.
 202 //
 203 //  STAGE 1
 204 //  -------
 205 //   This stage contains two threads.
 206 //
 207 //   Stage1.Thread1
 208 //
 209 //     fclass.m   X_excep,  X_ok   = X, (NatVal or s/qNaN) or
 210 //                              +-0, +-infinity
 211 //
 212 //     fclass.nm  X_unsupp, X_supp = X, (NatVal or s/qNaN) or
 213 //                              +-(0, unnorm, norm, infinity)
 214 //
 215 //     X_norm := fnorm( X ) with traps disabled
 216 //
 217 //     If (X_excep)  goto Filtering (Step 2)
 218 //     If (X_unsupp) goto Filtering (Step 2)
 219 //
 220 //     Stage1.Thread2
 221 //     ..............
 222 //
 223 //     fclass.m   Y_excep,  Y_ok   = Y, (NatVal or s/qNaN) or
 224 //                              +-0, +-infinity
 225 //
 226 //     fclass.nm  Y_unsupp, Y_supp = Y, (NatVal or s/qNaN) or
 227 //                              +-(0, unnorm, norm, infinity)
 228 //
 229 //     Y_norm := fnorm( Y ) with traps disabled
 230 //
 231 //     If (Y_excep)  goto Filtering (Step 2)
 232 //     If (Y_unsupp) goto Filtering (Step 2)
 233 //
 234 //
 235 //  STAGE 2
 236 //  -------
 237 //  This stage contains two threads.
 238 //
 239 //     Stage2.Thread1
 240 //     ..............
 241 //
 242 //     Set X_lt_0 if X < 0 (using fcmp)
 243 //     sgn := +1.0
 244 //     If (X_lt_0) goto Filtering (Step 2)
 245 //
 246 //     Stage2.Thread2
 247 //     ..............
 248 //
 249 //     Set Y_is_1 if Y = +1 (using fcmp)
 250 //     If (Y_is_1) goto Filtering (Step 2)
 251 //
 252 //   STAGE 3
 253 //   -------
 254 //   This stage contains two threads.
 255 //
 256 //
 257 //   Stage3.Thread1
 258 //   ..............
 259 //
 260 //     X := fnorm(X) in prevailing traps
 261 //
 262 //
 263 //     Stage3.Thread2
 264 //     ..............
 265 //
 266 //     Y := fnorm(Y) in prevailing traps
 267 //
 268 //   STAGE 4
 269 //   -------
 270 //
 271 //   Go to Case_Normal.
 272 //
 273
 274
 275 // ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
 276
 277 // double-extended 1/ln(2)
 278 // 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
 279 // 3fff b8aa 3b29 5c17 f0bc
 280 // For speed the significand will be loaded directly with a movl and setf.sig
 281 //   and the exponent will be bias+63 instead of bias+0.  Thus subsequent
 282 //   computations need to scale appropriately.
 283 // The constant 2^12/ln(2) is needed for the computation of N.  This is also
 284 //   obtained by scaling the computations.
 285 //
 286 // Two shifting constants are loaded directly with movl and setf.d.
 287 //   1. RSHF_2TO51 = 1.1000..00 * 2^(63-12)
 288 //        This constant is added to x*1/ln2 to shift the integer part of
 289 //        x*2^12/ln2 into the rightmost bits of the significand.
 290 //        The result of this fma is N_signif.
 291 //   2. RSHF       = 1.1000..00 * 2^(63)
 292 //        This constant is subtracted from N_signif * 2^(-51) to give
 293 //        the integer part of N, N_fix, as a floating-point number.
 294 //        The result of this fms is float_N.
 295 RODATA
 296
 297 .align 16
 298 // L_hi, L_lo
 299 LOCAL_OBJECT_START(Constants_exp_64_Arg)
 300 data8 0xB17217F400000000,0x00003FF2 // L_hi = hi part log(2)/2^12
 301 data8 0xF473DE6AF278ECE6,0x00003FD4 // L_lo = lo part log(2)/2^12
 302 LOCAL_OBJECT_END(Constants_exp_64_Arg)
 303
 304 LOCAL_OBJECT_START(Constants_exp_64_A)
 305 // Reversed
 306 data8 0xAAAAAAABB1B736A0,0x00003FFA
 307 data8 0xAAAAAAAB90CD6327,0x00003FFC
 308 data8 0xFFFFFFFFFFFFFFFF,0x00003FFD
 309 LOCAL_OBJECT_END(Constants_exp_64_A)
 310
 311 LOCAL_OBJECT_START(Constants_exp_64_P)
 312 // Reversed
 313 data8 0xD00D6C8143914A8A,0x00003FF2
 314 data8 0xB60BC4AC30304B30,0x00003FF5
 315 data8 0x888888887474C518,0x00003FF8
 316 data8 0xAAAAAAAA8DAE729D,0x00003FFA
 317 data8 0xAAAAAAAAAAAAAF61,0x00003FFC
 318 data8 0x80000000000004C7,0x00003FFE
 319 LOCAL_OBJECT_END(Constants_exp_64_P)
 320
 321 LOCAL_OBJECT_START(Constants_exp_64_T1)
 322 data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
 323 data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
 324 data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
 325 data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
 326 data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
 327 data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516
 328 data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A
 329 data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4
 330 data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B
 331 data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD
 332 data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15
 333 data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B
 334 data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5
 335 data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A
 336 data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177
 337 data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C
 338 LOCAL_OBJECT_END(Constants_exp_64_T1)
 339
 340 LOCAL_OBJECT_START(Constants_exp_64_T2)
 341 data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
 342 data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
 343 data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
 344 data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
 345 data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
 346 data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
 347 data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
 348 data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
 349 data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
 350 data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
 351 data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
 352 data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
 353 data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
 354 data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
 355 data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
 356 data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
 357 LOCAL_OBJECT_END(Constants_exp_64_T2)
 358
 359 LOCAL_OBJECT_START(Constants_exp_64_W1)
 360 data8 0x0000000000000000, 0xBE384454171EC4B4
 361 data8 0xBE6947414AA72766, 0xBE5D32B6D42518F8
 362 data8 0x3E68D96D3A319149, 0xBE68F4DA62415F36
 363 data8 0xBE6DDA2FC9C86A3B, 0x3E6B2E50F49228FE
 364 data8 0xBE49C0C21188B886, 0x3E64BFC21A4C2F1F
 365 data8 0xBE6A2FBB2CB98B54, 0x3E5DC5DE9A55D329
 366 data8 0x3E69649039A7AACE, 0x3E54728B5C66DBA5
 367 data8 0xBE62B0DBBA1C7D7D, 0x3E576E0409F1AF5F
 368 data8 0x3E6125001A0DD6A1, 0xBE66A419795FBDEF
 369 data8 0xBE5CDE8CE1BD41FC, 0xBE621376EA54964F
 370 data8 0x3E6370BE476E76EE, 0x3E390D1A3427EB92
 371 data8 0x3E1336DE2BF82BF8, 0xBE5FF1CBD0F7BD9E
 372 data8 0xBE60A3550CEB09DD, 0xBE5CA37E0980F30D
 373 data8 0xBE5C541B4C082D25, 0xBE5BBECA3B467D29
 374 data8 0xBE400D8AB9D946C5, 0xBE5E2A0807ED374A
 375 data8 0xBE66CB28365C8B0A, 0x3E3AAD5BD3403BCA
 376 data8 0x3E526055C7EA21E0, 0xBE442C75E72880D6
 377 data8 0x3E58B2BB85222A43, 0xBE5AAB79522C42BF
 378 data8 0xBE605CB4469DC2BC, 0xBE589FA7A48C40DC
 379 data8 0xBE51C2141AA42614, 0xBE48D087C37293F4
 380 data8 0x3E367A1CA2D673E0, 0xBE51BEBB114F7A38
 381 data8 0xBE6348E5661A4B48, 0xBDF526431D3B9962
 382 data8 0x3E3A3B5E35A78A53, 0xBE46C46C1CECD788
 383 data8 0xBE60B7EC7857D689, 0xBE594D3DD14F1AD7
 384 data8 0xBE4F9C304C9A8F60, 0xBE52187302DFF9D2
 385 data8 0xBE5E4C8855E6D68F, 0xBE62140F667F3DC4
 386 data8 0xBE36961B3BF88747, 0x3E602861C96EC6AA
 387 data8 0xBE3B5151D57FD718, 0x3E561CD0FC4A627B
 388 data8 0xBE3A5217CA913FEA, 0x3E40A3CC9A5D193A
 389 data8 0xBE5AB71310A9C312, 0x3E4FDADBC5F57719
 390 data8 0x3E361428DBDF59D5, 0x3E5DB5DB61B4180D
 391 data8 0xBE42AD5F7408D856, 0x3E2A314831B2B707
 392 LOCAL_OBJECT_END(Constants_exp_64_W1)
 393
 394 LOCAL_OBJECT_START(Constants_exp_64_W2)
 395 data8 0x0000000000000000, 0xBE641F2537A3D7A2
 396 data8 0xBE68DD57AD028C40, 0xBE5C77D8F212B1B6
 397 data8 0x3E57878F1BA5B070, 0xBE55A36A2ECAE6FE
 398 data8 0xBE620608569DFA3B, 0xBE53B50EA6D300A3
 399 data8 0x3E5B5EF2223F8F2C, 0xBE56A0D9D6DE0DF4
 400 data8 0xBE64EEF3EAE28F51, 0xBE5E5AE2367EA80B
 401 data8 0x3E47CB1A5FCBC02D, 0xBE656BA09BDAFEB7
 402 data8 0x3E6E70C6805AFEE7, 0xBE6E0509A3415EBA
 403 data8 0xBE56856B49BFF529, 0x3E66DD3300508651
 404 data8 0x3E51165FC114BC13, 0x3E53333DC453290F
 405 data8 0x3E6A072B05539FDA, 0xBE47CD877C0A7696
 406 data8 0xBE668BF4EB05C6D9, 0xBE67C3E36AE86C93
 407 data8 0xBE533904D0B3E84B, 0x3E63E8D9556B53CE
 408 data8 0x3E212C8963A98DC8, 0xBE33138F032A7A22
 409 data8 0x3E530FA9BC584008, 0xBE6ADF82CCB93C97
 410 data8 0x3E5F91138370EA39, 0x3E5443A4FB6A05D8
 411 data8 0x3E63DACD181FEE7A, 0xBE62B29DF0F67DEC
 412 data8 0x3E65C4833DDE6307, 0x3E5BF030D40A24C1
 413 data8 0x3E658B8F14E437BE, 0xBE631C29ED98B6C7
 414 data8 0x3E6335D204CF7C71, 0x3E529EEDE954A79D
 415 data8 0x3E5D9257F64A2FB8, 0xBE6BED1B854ED06C
 416 data8 0x3E5096F6D71405CB, 0xBE3D4893ACB9FDF5
 417 data8 0xBDFEB15801B68349, 0x3E628D35C6A463B9
 418 data8 0xBE559725ADE45917, 0xBE68C29C042FC476
 419 data8 0xBE67593B01E511FA, 0xBE4A4313398801ED
 420 data8 0x3E699571DA7C3300, 0x3E5349BE08062A9E
 421 data8 0x3E5229C4755BB28E, 0x3E67E42677A1F80D
 422 data8 0xBE52B33F6B69C352, 0xBE6B3550084DA57F
 423 data8 0xBE6DB03FD1D09A20, 0xBE60CBC42161B2C1
 424 data8 0x3E56ED9C78A2B771, 0xBE508E319D0FA795
 425 data8 0xBE59482AFD1A54E9, 0xBE2A17CEB07FD23E
 426 data8 0x3E68BF5C17365712, 0x3E3956F9B3785569
 427 LOCAL_OBJECT_END(Constants_exp_64_W2)
 428
 429 LOCAL_OBJECT_START(Constants_log_80_P)
 430 // P_8, P_7, ..., P_1
 431 data8 0xCCCE8B883B1042BC, 0x0000BFFB // P_8
 432 data8 0xE38997B7CADC2149, 0x00003FFB // P_7
 433 data8 0xFFFFFFFEB1ACB090, 0x0000BFFB // P_6
 434 data8 0x9249249806481C81, 0x00003FFC // P_5
 435 data8 0x0000000000000000, 0x00000000 // Pad for bank conflicts
 436 data8 0xAAAAAAAAAAAAB0EF, 0x0000BFFC // P_4
 437 data8 0xCCCCCCCCCCC91416, 0x00003FFC // P_3
 438 data8 0x8000000000000000, 0x0000BFFD // P_2
 439 data8 0xAAAAAAAAAAAAAAAB, 0x00003FFD // P_1
 440 LOCAL_OBJECT_END(Constants_log_80_P)
 441
 442 LOCAL_OBJECT_START(Constants_log_80_Q)
 443 // log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1
 444 data8 0xB172180000000000,0x00003FFE
 445 data8 0x82E308654361C4C6,0x0000BFE2
 446 data8 0x92492453A51BE0AF,0x00003FFC
 447 data8 0xAAAAAB73A0CFD29F,0x0000BFFC
 448 data8 0xCCCCCCCCCCCE3872,0x00003FFC
 449 data8 0xFFFFFFFFFFFFB4FB,0x0000BFFC
 450 data8 0xAAAAAAAAAAAAAAAB,0x00003FFD
 451 data8 0x8000000000000000,0x0000BFFE
 452 LOCAL_OBJECT_END(Constants_log_80_Q)
 453
 454 LOCAL_OBJECT_START(Constants_log_80_Z_G_H_h1)
 455 // Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 IEEE double
 456 data4 0x00008000,0x3F800000,0x00000000,0x00000000
 457 data4 0x00000000,0x00000000,0x00000000,0x00000000
 458 data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000
 459 data4 0xEBA0E0D1,0x8B1D330B,0x00003FDA,0x00000000
 460 data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000
 461 data4 0x9EADD553,0xE2AF365E,0x00003FE2,0x00000000
 462 data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000
 463 data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000
 464 data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000
 465 data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000
 466 data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000
 467 data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000
 468 data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000
 469 data4 0x457978A1,0x8718789F,0x00003FE2,0x00000000
 470 data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000
 471 data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000
 472 data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000
 473 data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000
 474 data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000
 475 data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000
 476 data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000
 477 data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000
 478 data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000
 479 data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000
 480 data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000
 481 data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000
 482 data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000
 483 data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000
 484 data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000
 485 data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000
 486 data4 0x00004211,0x3F042108,0x3F29516A,0x00000000
 487 data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000
 488 LOCAL_OBJECT_END(Constants_log_80_Z_G_H_h1)
 489
 490 LOCAL_OBJECT_START(Constants_log_80_Z_G_H_h2)
 491 // Z2 - 16 bit fixed, G2 and H2 IEEE single, h2 IEEE double
 492 data4 0x00008000,0x3F800000,0x00000000,0x00000000
 493 data4 0x00000000,0x00000000,0x00000000,0x00000000
 494 data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000
 495 data4 0x211398BF,0xAD08B116,0x00003FDB,0x00000000
 496 data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000
 497 data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000
 498 data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000
 499 data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000
 500 data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000
 501 data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000
 502 data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000
 503 data4 0x049EB22F,0xD1B87D3C,0x00003FDE,0x00000000
 504 data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000
 505 data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000
 506 data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000
 507 data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000
 508 data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000
 509 data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000
 510 data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000
 511 data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000
 512 data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000
 513 data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000
 514 data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000
 515 data4 0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000
 516 data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000
 517 data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000
 518 data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000
 519 data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000
 520 data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000
 521 data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000
 522 data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000
 523 data4 0xBE11C424,0xA49C8DB5,0x0000BFE0,0x00000000
 524 LOCAL_OBJECT_END(Constants_log_80_Z_G_H_h2)
 525
 526 LOCAL_OBJECT_START(Constants_log_80_h3_G_H)
 527 // h3 IEEE double extended, H3 and G3 IEEE single
 528 data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00
 529 data4 0x9B7FAD21,0x90051030,0x00003FD8,0x3F7FF400
 530 data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00
 531 data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400
 532 data4 0xCA964D95,0xCE65C1D8,0x0000BFD8,0x3F7FDC00
 533 data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400
 534 data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08
 535 data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408
 536 data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10
 537 data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410
 538 data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18
 539 data4 0x2F053150,0xB25CA912,0x0000BFDA,0x3F7FA420
 540 data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20
 541 data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428
 542 data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30
 543 data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438
 544 data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40
 545 data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448
 546 data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50
 547 data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458
 548 data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68
 549 data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470
 550 data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78
 551 data4 0x1F34A7EB,0x9A995A97,0x0000BFDC,0x3F7F4488
 552 data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90
 553 data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0
 554 data4 0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8
 555 data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8
 556 data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8
 557 data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8
 558 data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0
 559 data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0
 560 data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here
 561 data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D
 562 data4 0x3A87F20B,0x3A97F68B,0x3AA7EB86,0x3AB7E101
 563 data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED
 564 data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766
 565 data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6
 566 data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620
 567 data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D
 568 LOCAL_OBJECT_END(Constants_log_80_h3_G_H)
 569
 570 GR_sig_inv_ln2      = r14
 571 GR_rshf_2to51       = r15
 572 GR_exp_2tom51       = r16
 573 GR_rshf             = r17
 574 GR_exp_half         = r18
 575 GR_sign_mask        = r19
 576 GR_exp_square_oflow = r20
 577 GR_exp_square_uflow = r21
 578 GR_exp_ynear1_oflow = r22
 579 GR_exp_ynear1_uflow = r23
 580 GR_signif_Z         = r24
 581
 582 GR_signexp_x        = r32
 583
 584 GR_exp_x            = r33
 585
 586 GR_Table_Ptr        = r34
 587
 588 GR_Table_Ptr1       = r35
 589
 590 GR_Index1           = r36
 591
 592 GR_Index2           = r37
 593 GR_Expo_X           = r37
 594
 595 GR_M                = r38
 596
 597 GR_X_0              = r39
 598 GR_Mask             = r39
 599
 600 GR_X_1              = r40
 601 GR_W1_ptr           = r40
 602
 603 GR_W2_ptr           = r41
 604 GR_X_2              = r41
 605
 606 GR_Z_1              = r42
 607 GR_M2               = r42
 608
 609 GR_M1               = r43
 610 GR_Z_2              = r43
 611
 612 GR_N                = r44
 613 GR_k                = r44
 614
 615 GR_Big_Pos_Exp      = r45
 616
 617 GR_exp_pos_max      = r46
 618
 619 GR_exp_bias_p_k     = r47
 620
 621 GR_Index3           = r48
 622 GR_temp             = r48
 623
 624 GR_vsm_expo         = r49
 625
 626 GR_T1_ptr           = r50
 627 GR_P_ptr1           = r50
 628 GR_T2_ptr           = r51
 629 GR_P_ptr2           = r51
 630 GR_N_fix            = r52
 631 GR_exp_y            = r53
 632 GR_signif_y         = r54
 633 GR_signexp_y        = r55
 634 GR_fraction_y       = r55
 635 GR_low_order_bit    = r56
 636 GR_exp_mask         = r57
 637 GR_exp_bias         = r58
 638 GR_y_sign           = r59
 639 GR_table_base       = r60
 640 GR_ptr_exp_Arg      = r61
 641 GR_Delta_Exp        = r62
 642 GR_Special_Exp      = r63
 643 GR_exp_neg_max      = r64
 644 GR_Big_Neg_Exp      = r65
 645
 646 //** Registers for unwind support
 647
 648 GR_SAVE_PFS         = r59
 649 GR_SAVE_B0          = r60
 650 GR_SAVE_GP          = r61
 651 GR_Parameter_X      = r62
 652 GR_Parameter_Y      = r63
 653 GR_Parameter_RESULT = r64
 654 GR_Parameter_TAG    = r65
 655
 656 //**
 657
 658 FR_Input_X          = f8
 659 FR_Result           = f8
 660 FR_Input_Y          = f9
 661
 662 FR_Neg              = f10
 663 FR_P_hi             = f10
 664 FR_X                = f10
 665
 666 FR_Half             = f11
 667 FR_h_3              = f11
 668 FR_poly_hi          = f11
 669
 670 FR_Sgn              = f12
 671
 672 FR_half_W           = f13
 673
 674 FR_X_cor            = f14
 675 FR_P_lo             = f14
 676
 677 FR_W                = f15
 678
 679 FR_X_lo             = f32
 680
 681 FR_S                = f33
 682 FR_W3               = f33
 683
 684 FR_Y_hi             = f34
 685 FR_logx_hi          = f34
 686
 687 FR_Z                = f35
 688 FR_logx_lo          = f35
 689 FR_GS_hi            = f35
 690 FR_Y_lo             = f35
 691
 692 FR_r_cor            = f36
 693 FR_Scale            = f36
 694
 695 FR_G_1              = f37
 696 FR_G                = f37
 697 FR_Wsq              = f37
 698 FR_temp             = f37
 699
 700 FR_H_1              = f38
 701 FR_H                = f38
 702 FR_W4               = f38
 703
 704 FR_h                = f39
 705 FR_h_1              = f39
 706 FR_N                = f39
 707 FR_P_7              = f39
 708
 709 FR_G_2              = f40
 710 FR_P_8              = f40
 711 FR_L_hi             = f40
 712
 713 FR_H_2              = f41
 714 FR_L_lo             = f41
 715 FR_A_1              = f41
 716
 717 FR_h_2              = f42
 718
 719 FR_W1               = f43
 720
 721 FR_G_3              = f44
 722 FR_P_8              = f44
 723 FR_T1               = f44
 724
 725 FR_log2_hi          = f45
 726 FR_W2               = f45
 727
 728 FR_GS_lo            = f46
 729 FR_T2               = f46
 730
 731 FR_W_1_p1           = f47
 732 FR_H_3              = f47
 733
 734 FR_float_N          = f48
 735
 736 FR_A_2              = f49
 737
 738 FR_Q_4              = f50
 739 FR_r4               = f50
 740
 741 FR_Q_3              = f51
 742 FR_A_3              = f51
 743
 744 FR_Q_2              = f52
 745 FR_P_2              = f52
 746
 747 FR_Q_1              = f53
 748 FR_P_1              = f53
 749 FR_T                = f53
 750
 751 FR_Wp1              = f54
 752 FR_Q_5              = f54
 753 FR_P_3              = f54
 754
 755 FR_Q_6              = f55
 756
 757 FR_log2_lo          = f56
 758 FR_Two              = f56
 759
 760 FR_Big              = f57
 761
 762 FR_neg_2_mK         = f58
 763
 764 FR_r                = f59
 765
 766 FR_poly_lo          = f60
 767
 768 FR_poly             = f61
 769
 770 FR_P_5              = f62
 771 FR_Result_small     = f62
 772
 773 FR_rsq              = f63
 774
 775 FR_Delta            = f64
 776
 777 FR_save_Input_X     = f65
 778 FR_norm_X           = f66
 779 FR_norm_Y           = f67
 780 FR_Y_lo_2           = f68
 781
 782 FR_P_6              = f69
 783 FR_Result_big       = f69
 784
 785 FR_RSHF_2TO51       = f70
 786 FR_INV_LN2_2TO63    = f71
 787 FR_2TOM51           = f72
 788 FR_RSHF             = f73
 789 FR_TMP1             = f74
 790 FR_TMP2             = f75
 791 FR_TMP3             = f76
 792 FR_Tscale           = f77
 793 FR_P_4              = f78
 794 FR_NBig             = f79
 795
 796
 797 .section .text
 798 GLOBAL_LIBM_ENTRY(powl)
 799 //
 800 //     Get significand of x.  It is the critical path.
 801 //
 802 { .mfi
 803       getf.sig GR_signif_Z = FR_Input_X    // Get significand of x
 804       fclass.m p11, p12 = FR_Input_X, 0x0b // Test x unorm
 805       nop.i 999
 806 }
 807 { .mfi
 808       nop.m 999
 809       fnorm.s1 FR_norm_X = FR_Input_X      // Normalize x
 810       mov GR_exp_half = 0xffff - 1         // Exponent for 0.5
 811 }
 812 ;;
 813
 814 { .mfi
 815       alloc  r32 = ar.pfs,0,30,4,0
 816       fclass.m p7, p0 =  FR_Input_Y, 0x1E7 // Test y natval, nan, inf, zero
 817       mov GR_exp_pos_max = 0x13fff         // Max exponent for pos oflow test
 818 }
 819 { .mfi
 820       addl GR_table_base = @ltoff(Constants_exp_64_Arg#), gp // Ptr to tables
 821       fnorm.s1 FR_norm_Y = FR_Input_Y      // Normalize y
 822       mov GR_exp_neg_max = 0x33fff         // Max exponent for neg oflow test
 823 }
 824 ;;
 825
 826 { .mfi
 827       getf.exp GR_signexp_y = FR_Input_Y   // Get sign and exp of y
 828 (p12) fclass.m p11, p0 =  FR_Input_Y, 0x0b // Test y unorm
 829       mov GR_sign_mask = 0x20000           // Sign mask
 830 }
 831 { .mfi
 832       ld8 GR_table_base = [GR_table_base]  // Get base address for tables
 833       fadd.s1 FR_Two = f1, f1              // Form 2.0 for square test
 834       mov GR_exp_mask = 0x1FFFF            // Exponent mask
 835 }
 836 ;;
 837
 838 { .mfi
 839       getf.sig GR_signif_y = FR_Input_Y    // Get significand of y
 840       fclass.m p6, p0 =  FR_Input_X, 0x1E7 // Test x natval, nan, inf, zero
 841       nop.i 999
 842 }
 843 ;;
 844
 845 { .mfi
 846       getf.exp GR_signexp_x = FR_Input_X   // Get signexp of x
 847       fmerge.s FR_save_Input_X = FR_Input_X, FR_Input_X
 848       extr.u GR_Index1 = GR_signif_Z, 59, 4  // Extract upper 4 signif bits of x
 849 }
 850 { .mfb
 851       setf.exp FR_Half = GR_exp_half       // Load half
 852       nop.f 999
 853 (p11) br.cond.spnt  POWL_DENORM            // Branch if x or y denorm/unorm
 854 }
 855 ;;
 856
 857 // Return here from POWL_DENORM
 858 POWL_COMMON:
 859 { .mfi
 860       setf.exp FR_Big = GR_exp_pos_max     // Form big pos value for oflow test
 861       fclass.nm p11, p0 = FR_Input_Y, 0x1FF // Test Y unsupported
 862       shl GR_Index1 = GR_Index1,5          // Adjust index1 pointer x 32
 863 }
 864 { .mfi
 865       add GR_Table_Ptr = 0x7c0, GR_table_base // Constants_log_80_Z_G_H_h1
 866       fma.s1 FR_Sgn = f1,f1,f0             // Assume result positive
 867       mov GR_exp_bias = 0xFFFF             // Form exponent bias
 868 }
 869 ;;
 870
 871 //
 872 //     Identify NatVals, NaNs, Infs, and Zeros.
 873 //
 874 //
 875 //     Remove sign bit from exponent of y.
 876 //     Check for x = 1
 877 //     Branch on Infs, Nans, Zeros, and Natvals
 878 //     Check to see that exponent < 0
 879 //
 880 { .mfi
 881       setf.exp FR_NBig = GR_exp_neg_max    // Form big neg value for oflow test
 882       fclass.nm p8, p0 =  FR_Input_X, 0x1FF  // Test X unsupported
 883       and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent of y
 884 }
 885 { .mfb
 886       add GR_Index1 = GR_Index1,GR_Table_Ptr
 887       nop.f 999
 888 (p6)  br.cond.spnt POWL_64_SPECIAL         // Branch if x natval, nan, inf, zero
 889 }
 890 ;;
 891
 892 //     load Z_1 from Index1
 893
 894 // There is logic starting here to determine if y is an integer when x < 0.
 895 // If 0 < |y| < 1 then clearly y is not an integer.
 896 // If |y| > 1, then the significand of y is shifted left by the size of
 897 //    the exponent of y.  This preserves the lsb of the integer part + the
 898 //    fractional bits.  The lsb of the integer can be tested to determine if
 899 //    the integer is even or odd.  The fractional bits can be tested.  If zero,
 900 //    then y is an integer.
 901 //
 902 { .mfi
 903       ld2 GR_Z_1 =[GR_Index1],4            // Load Z_1
 904       fmerge.s FR_Z = f0, FR_norm_X        // Z = |x|
 905       extr.u GR_X_0 = GR_signif_Z, 49, 15  // Extract X_0 from significand
 906 }
 907 { .mfb
 908       cmp.lt p9, p0 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1
 909       nop.f 999
 910 (p7)  br.cond.spnt POWL_64_SPECIAL         // Branch if y natval, nan, inf, zero
 911 }
 912 ;;
 913
 914 { .mfb
 915       ldfs  FR_G_1 = [GR_Index1],4         // Load G_1
 916       fcmp.eq.s1 p10, p0 =  FR_Input_Y, f1 // Test Y = +1.0
 917 (p8)  br.cond.spnt POWL_64_UNSUPPORT       // Branch if x unsupported
 918 }
 919 ;;
 920
 921 //
 922 //     X_0  = High order 15 bit of Z
 923 //
 924 { .mfb
 925       ldfs  FR_H_1 = [GR_Index1],8             // Load H_1
 926 (p9)  fcmp.lt.unc.s1 p9, p0 = FR_Input_X, f0   // Test x<0, 0 <|y|<1
 927 (p11) br.cond.spnt POWL_64_UNSUPPORT           // Branch if y unsupported
 928 }
 929 ;;
 930
 931 { .mfi
 932       ldfe FR_h_1 = [GR_Index1]                // Load h_1
 933       fcmp.eq.s1 p7, p0 =  FR_Input_Y, FR_Two  // Test y = 2.0
 934       pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15     // X_1 = X_0 * Z_1 (bits 15-30)
 935                                                // Wait 4 cycles to use result
 936 }
 937 { .mfi
 938       add GR_Table_Ptr = 0x9c0, GR_table_base  // Constants_log_80_Z_G_H_h2
 939       nop.f 999
 940       sub GR_exp_y = GR_exp_y,GR_exp_bias      // Get true exponent of y
 941 }
 942 ;;
 943
 944 //
 945 //      Branch for (x < 0) and Y not an integer.
 946 //
 947 { .mfb
 948       nop.m 999
 949       fcmp.lt.s1 p6, p0  =  FR_Input_X, f0     // Test x < 0
 950 (p9)  br.cond.spnt POWL_64_XNEG                // Branch if x < 0, 0 < |y| < 1
 951 }
 952 ;;
 953
 954 { .mfi
 955       nop.m 999
 956       fcmp.eq.s1 p12, p0 =  FR_Input_X, f1     // Test x=+1.0
 957       nop.i 999
 958 }
 959 { .mfb
 960       nop.m 999
 961       fsub.s1 FR_W = FR_Z, f1                  // W = Z - 1
 962 (p7)  br.cond.spnt POWL_64_SQUARE              // Branch if y=2
 963 }
 964 ;;
 965
 966 { .mfi
 967       nop.m 999
 968 (p10) fmpy.s0 FR_Result = FR_Input_X, f1       // If y=+1.0, result=x
 969 (p6)  shl GR_fraction_y=  GR_signif_y,GR_exp_y // Get lsb of int + fraction
 970                                                // Wait 4 cycles to use result
 971 }
 972 ;;
 973
 974 { .mfi
 975       nop.m 999
 976 (p12) fma.s0 FR_Result = FR_Input_Y, f0, f1    // If x=1.0, result=1, chk denorm
 977       extr.u GR_Index2 = GR_X_1, 6, 4          // Extract index2
 978 }
 979 ;;
 980
 981 //
 982 //     N = exponent of Z
 983 //
 984 { .mib
 985       getf.exp GR_N =  FR_Z                    // Get exponent of Z (also x)
 986       shl GR_Index2=GR_Index2,5                // Index2  x 32 bytes
 987 (p10) br.ret.spnt  b0                          // Exit if y=+1.0
 988 }
 989 ;;
 990
 991 { .mib
 992       add GR_Index2 = GR_Index2, GR_Table_Ptr  // Pointer to table 2
 993       nop.i 999
 994 (p12) br.ret.spnt  b0                          // Exit if x=+1.0
 995 }
 996 ;;
 997
 998 { .mmi
 999       ld2 GR_Z_2 =[GR_Index2],4                // Load Z_2
1000 ;;
1001       ldfs  FR_G_2 = [GR_Index2],4             // Load G_2
1002       nop.i 999
1003 }
1004 ;;
1005
1006 { .mii
1007       ldfs  FR_H_2 = [GR_Index2],8             // Load H_2
1008 (p6)  tbit.nz.unc p9, p0 = GR_fraction_y, 63   // Test x<0 and y odd integer
1009       add GR_Table_Ptr = 0xbcc, GR_table_base  // Constants_log_80_h3_G_H, G_3
1010 }
1011 ;;
1012
1013 //
1014 //      For x < 0 and y odd integer,, set sign = -1.
1015 //
1016 { .mfi
1017       getf.exp GR_M = FR_W                      // Get signexp of W
1018       nop.f 999
1019       pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15      // X_2 = X_1 * Z_2 (bits 15-30)
1020 }
1021 { .mfi
1022       ldfe FR_h_2 = [GR_Index2]                // Load h_2
1023 (p9)  fnma.s1 FR_Sgn = f1, f1, f0          // If x<0, y odd int, result negative
1024       sub GR_N = GR_N, GR_exp_bias             // Get true exponent of x = N
1025 }
1026 ;;
1027
1028 { .mfi
1029       add GR_Table_Ptr1 = 0xdc0, GR_table_base // Ptr to H_3
1030       fcmp.eq.s0 p11, p0 = FR_Input_Y, FR_Half // Test y=0.5, also set denorm
1031 (p6)  shl GR_fraction_y=  GR_fraction_y, 1     // Shift left 1 to get fraction
1032 }
1033 ;;
1034
1035 { .mmb
1036       setf.sig FR_float_N = GR_N
1037 (p6)  cmp.ne.unc p8, p0 = GR_fraction_y, r0    // Test x<0 and y not integer
1038 (p8)  br.cond.spnt POWL_64_XNEG                // Branch if x<0 and y not int
1039 }
1040 ;;
1041
1042 //
1043 //      Raise possible denormal operand exception for both X and Y.
1044 //      Set pointers in case |x| near 1
1045 //      Branch to embedded sqrt(x) if y=0.5
1046 //
1047 { .mfi
1048       add GR_P_ptr1 = 0x6b0, GR_table_base // Constants_log_80_P, P8, NEAR path
1049       fcmp.eq.s0 p12, p0 =  FR_Input_X, FR_Input_Y // Dummy to set denormal
1050       add GR_P_ptr2 = 0x700, GR_table_base // Constants_log_80_P, P4, NEAR path
1051 }
1052 { .mfb
1053       cmp.eq p15, p14 =  r0, r0            // Assume result safe (no over/under)
1054       fsub.s1  FR_Delta = FR_Input_Y,f1    // Delta = y - 1.0
1055 (p11) br.cond.spnt POWL_64_SQRT            // Branch if y=0.5
1056 }
1057 ;;
1058
1059 //
1060 //     Computes ln( x ) to extra precision
1061 //     Input  FR 1: FR_X
1062 //     Output FR 2: FR_Y_hi
1063 //     Output FR 3: FR_Y_lo
1064 //     Output PR 1: PR_Safe
1065 //
1066 { .mfi
1067       and GR_M = GR_exp_mask, GR_M            // Mask to get exponent of W
1068       nop.f 999
1069       extr.u GR_Index3 = GR_X_2, 1, 5         // Get index3
1070 }
1071 ;;
1072
1073 { .mmi
1074       shladd GR_Table_Ptr1 = GR_Index3,2,GR_Table_Ptr1 // Ptr to H_3
1075       shladd GR_Index3 = GR_Index3,4,GR_Table_Ptr      // Ptr to G_3
1076       sub GR_M = GR_M, GR_exp_bias            // Get true exponent of W
1077 }
1078 ;;
1079
1080 { .mib
1081       ldfs FR_G_3 = [GR_Index3],-12           // Load G_3
1082       cmp.gt  p7, p14 =  -8, GR_M             // Test if |x-1| < 2^-8
1083 (p7)  br.cond.spnt LOGL80_NEAR                // Branch if |x-1| < 2^-8
1084 }
1085 ;;
1086
1087 // Here if |x-1| >= 2^-8
1088 { .mmf
1089       ldfs FR_H_3 = [GR_Table_Ptr1]           // Load H_3
1090       nop.m 999
1091       nop.f 999
1092 }
1093 ;;
1094
1095 { .mfi
1096       ldfe FR_h_3 = [GR_Index3]               // Load h_3
1097       fmerge.se FR_S =  f1,FR_Z               // S = merge of 1.0 and signif(Z)
1098       nop.i 999
1099 }
1100 { .mfi
1101       add GR_Table_Ptr = 0x740, GR_table_base // Constants_log_80_Q
1102       fmpy.s1 FR_G = FR_G_1, FR_G_2           // G = G_1 * G_2
1103       nop.i 999
1104 }
1105 ;;
1106
1107 //
1108 //     Begin Loading Q's -  load log2_hi part
1109 //
1110 { .mfi
1111       ldfe FR_log2_hi = [GR_Table_Ptr],16     // Load log2_hi
1112       fadd.s1 FR_H = FR_H_1, FR_H_2           // H = H_1 + H_2
1113       nop.i 999
1114 };;
1115
1116 //
1117 //     h = h_1 + h_2
1118 //
1119 { .mfi
1120       ldfe FR_log2_lo = [GR_Table_Ptr],16     // Load log2_lo
1121       fadd.s1 FR_h = FR_h_1, FR_h_2           // h = h_1 + h_2
1122       nop.i 999
1123 }
1124 ;;
1125
1126 { .mfi
1127       ldfe FR_Q_6 = [GR_Table_Ptr],16         // Load Q_6
1128       fcvt.xf FR_float_N = FR_float_N
1129       nop.i 999
1130 }
1131 ;;
1132
1133 { .mfi
1134       ldfe FR_Q_5 = [GR_Table_Ptr],16         // Load Q_5
1135       nop.f 999
1136       nop.i 999
1137 }
1138 ;;
1139
1140 //
1141 //     G = G_1 * G_2 * G_3
1142 //
1143 { .mfi
1144       ldfe FR_Q_4 = [GR_Table_Ptr],16         // Load Q_4
1145       fmpy.s1 FR_G = FR_G, FR_G_3
1146       nop.i 999
1147 }
1148 ;;
1149
1150 //
1151 //     H = H_1 + H_2 + H_3
1152 //
1153 { .mfi
1154       ldfe FR_Q_3 = [GR_Table_Ptr],16         // Load Q_3
1155       fadd.s1 FR_H = FR_H, FR_H_3
1156       nop.i 999
1157 }
1158 ;;
1159
1160 //
1161 //     Y_lo = poly + Y_lo
1162 //
1163 //     h = h_1 + h_2 + h_3
1164 //
1165 { .mfi
1166       ldfe FR_Q_2 = [GR_Table_Ptr],16         // Load Q_2
1167       fadd.s1 FR_h = FR_h, FR_h_3
1168       nop.i 999
1169 }
1170 ;;
1171
1172 //
1173 //     GS_hi = G*S
1174 //     r = G*S -1
1175 //
1176 { .mfi
1177       ldfe FR_Q_1 = [GR_Table_Ptr],16         // Load Q_1
1178       fmpy.s1 FR_GS_hi = FR_G, FR_S
1179       nop.i 999
1180 }
1181 { .mfi
1182       nop.m 999
1183       fms.s1 FR_r = FR_G, FR_S, f1
1184       nop.i 999
1185 }
1186 ;;
1187
1188 //
1189 //     poly_lo = Q_5 + r * Q_6
1190 //
1191 { .mfi
1192       getf.exp GR_Delta_Exp =  FR_Delta     // Get signexp of y-1 for exp calc
1193       fma.s1 FR_poly_lo = FR_r, FR_Q_6, FR_Q_5
1194       nop.i 999
1195 }
1196 //
1197 //     r_cor = GS_hi -1
1198 //
1199 { .mfi
1200       nop.m 999
1201       fsub.s1 FR_r_cor = FR_GS_hi, f1
1202       nop.i 999
1203 }
1204 ;;
1205
1206 //
1207 //     GS_lo  = G*S - GS_hi
1208 //
1209 { .mfi
1210       nop.m 999
1211       fms.s1 FR_GS_lo = FR_G, FR_S, FR_GS_hi
1212       nop.i 999
1213 }
1214 ;;
1215
1216 //
1217 //     rsq = r * r
1218 //
1219 { .mfi
1220       nop.m 999
1221       fmpy.s1 FR_rsq = FR_r, FR_r
1222       nop.i 999
1223 }
1224 //
1225 //     G = float_N*log2_hi + H
1226 //
1227 { .mfi
1228       nop.m 999
1229       fma.s1 FR_G = FR_float_N, FR_log2_hi, FR_H
1230       nop.i 999
1231 }
1232 ;;
1233
1234 //
1235 //     Y_lo = float_N*log2_lo + h
1236 //
1237 { .mfi
1238       nop.m 999
1239       fma.s1 FR_Y_lo = FR_float_N, FR_log2_lo, FR_h
1240       nop.i 999
1241 }
1242 ;;
1243
1244 //
1245 //      poly_lo = Q_4 + r * poly_lo
1246 //      r_cor = r_cor - r
1247 //
1248 { .mfi
1249       nop.m 999
1250       fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_4
1251       nop.i 999
1252 }
1253 { .mfi
1254       nop.m 999
1255       fsub.s1 FR_r_cor = FR_r_cor, FR_r
1256       nop.i 999
1257 }
1258 ;;
1259
1260 //
1261 //      poly_hi = r * Q_2 + Q_1
1262 //      Y_hi = G + r
1263 //
1264 { .mfi
1265       nop.m 999
1266       fma.s1 FR_poly = FR_r, FR_Q_2, FR_Q_1
1267       nop.i 999
1268 }
1269 { .mfi
1270       nop.m 999
1271       fadd.s1 FR_Y_hi = FR_G, FR_r
1272       nop.i 999
1273 }
1274 ;;
1275
1276 //
1277 //      poly_lo = Q_3 + r * poly_lo
1278 //      r_cor = r_cor + GS_lo
1279 //
1280 { .mfi
1281       nop.m 999
1282       fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_3
1283       nop.i 999
1284 }
1285 { .mfi
1286       nop.m 999
1287       fadd.s1 FR_r_cor = FR_r_cor, FR_GS_lo
1288       nop.i 999
1289 }
1290 ;;
1291
1292 //
1293 //      Y_lo = G - Y_hi
1294 //
1295 { .mfi
1296       nop.m 999
1297       fsub.s1 FR_Y_lo_2 = FR_G, FR_Y_hi
1298       nop.i 999
1299 }
1300 ;;
1301
1302 //
1303 //      r_cor = r_cor + Y_lo
1304 //      poly = poly_hi + rsq * poly_lo
1305 //
1306 { .mfi
1307       add  GR_Table_Ptr   = 0x0, GR_table_base   // Constants_exp_64_Arg
1308       fadd.s1 FR_r_cor = FR_r_cor, FR_Y_lo
1309       nop.i 999
1310 }
1311 { .mfi
1312       nop.m 999
1313       fma.s1 FR_poly = FR_rsq, FR_poly_lo, FR_poly
1314       nop.i 999
1315 }
1316 ;;
1317
1318 //
1319 //      Load L_hi
1320 //      Load L_lo
1321 //      all long before they are needed.
1322 //      They are used in LOGL_RETURN PATH
1323 //
1324 //      Y_lo =  Y_lo + r
1325 //      poly = rsq * poly + r_cor
1326 //
1327 { .mfi
1328       ldfe FR_L_hi = [GR_Table_Ptr],16           // Load L_hi
1329       fadd.s1 FR_Y_lo = FR_Y_lo_2, FR_r
1330       nop.i 999
1331 }
1332 { .mfi
1333       nop.m 999
1334       fma.s1 FR_poly = FR_rsq, FR_poly, FR_r_cor
1335       nop.i 999
1336 }
1337 ;;
1338
1339 { .mfb
1340       ldfe FR_L_lo = [GR_Table_Ptr],16           // Load L_lo
1341       fadd.s1 FR_Y_lo = FR_Y_lo, FR_poly
1342       br.cond.sptk LOGL_RETURN                   // Branch to common code
1343 }
1344 ;;
1345
1346
1347 LOGL80_NEAR:
1348 // Here if |x-1| < 2^-8
1349 //
1350 //     Branch LOGL80_NEAR
1351 //
1352
1353 { .mmf
1354       ldfe FR_P_8 = [GR_P_ptr1],16           // Load P_8
1355       ldfe FR_P_4 = [GR_P_ptr2],16           // Load P_4
1356       fmpy.s1 FR_Wsq = FR_W, FR_W
1357 }
1358 ;;
1359
1360 { .mmi
1361       ldfe FR_P_7 = [GR_P_ptr1],16           // Load P_7
1362       ldfe FR_P_3 = [GR_P_ptr2],16           // Load P_3
1363       nop.i 999
1364 }
1365 ;;
1366
1367 { .mmi
1368       ldfe FR_P_6 = [GR_P_ptr1],16           // Load P_6
1369       ldfe FR_P_2 = [GR_P_ptr2],16           // Load P_2
1370       nop.i 999
1371 }
1372 ;;
1373
1374 { .mmi
1375       ldfe FR_P_5 = [GR_P_ptr1],16           // Load P_5
1376       ldfe FR_P_1 = [GR_P_ptr2],16           // Load P_1
1377       nop.i 999
1378 }
1379 ;;
1380
1381 { .mfi
1382       getf.exp GR_Delta_Exp =  FR_Delta      // Get signexp of y-1 for exp calc
1383       fmpy.s1 FR_W4 = FR_Wsq, FR_Wsq
1384       nop.i 999
1385 }
1386 { .mfi
1387       add  GR_Table_Ptr = 0x0, GR_table_base // Constants_exp_64_Arg
1388       fmpy.s1 FR_W3 = FR_Wsq, FR_W
1389       nop.i 999
1390 }
1391 ;;
1392
1393 { .mfi
1394       nop.m 999
1395       fmpy.s1 FR_half_W = FR_Half, FR_W
1396       nop.i 999
1397 }
1398 ;;
1399
1400 { .mfi
1401       ldfe FR_L_hi = [GR_Table_Ptr],16
1402       fma.s1 FR_poly_lo = FR_W, FR_P_8,FR_P_7
1403       nop.i 999
1404 }
1405 { .mfi
1406       nop.m 999
1407       fma.s1 FR_poly = FR_W, FR_P_4, FR_P_3
1408       nop.i 999
1409 }
1410 ;;
1411
1412 { .mfi
1413       ldfe FR_L_lo = [GR_Table_Ptr],16
1414       fnma.s1 FR_Y_hi = FR_W, FR_half_W, FR_W
1415       nop.i 999
1416 }
1417 ;;
1418
1419 { .mfi
1420       nop.m 999
1421       fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_6
1422       nop.i 999
1423 }
1424 { .mfi
1425       nop.m 999
1426       fma.s1 FR_poly = FR_W, FR_poly, FR_P_2
1427       nop.i 999
1428 }
1429 ;;
1430
1431 { .mfi
1432       nop.m 999
1433       fsub.s1 FR_Y_lo = FR_W, FR_Y_hi
1434       nop.i 999
1435 }
1436 ;;
1437
1438 { .mfi
1439       nop.m 999
1440       fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_5
1441       nop.i 999
1442 }
1443 { .mfi
1444       nop.m 999
1445       fma.s1 FR_poly = FR_W, FR_poly, FR_P_1
1446       nop.i 999
1447 }
1448 ;;
1449
1450 { .mfi
1451       nop.m 999
1452       fnma.s1 FR_Y_lo = FR_W, FR_half_W, FR_Y_lo
1453       nop.i 999
1454 }
1455 ;;
1456
1457 { .mfi
1458       nop.m 999
1459       fma.s1 FR_poly = FR_poly_lo, FR_W4, FR_poly
1460       nop.i 999
1461 }
1462 ;;
1463
1464 { .mfi
1465       nop.m 999
1466       fma.s1 FR_Y_lo = FR_poly, FR_W3, FR_Y_lo
1467       nop.i 999
1468 }
1469 ;;
1470
1471
1472 LOGL_RETURN:
1473 // Common code for completion of both logx paths
1474
1475 //
1476 //     L_hi, L_lo already loaded.
1477 //
1478 //
1479 //     kernel_log_80 computed ln(X)
1480 //     and return logX_hi and logX_lo as results.
1481 //     PR_pow_Safe set as well.
1482 //
1483 //
1484 //     Compute Y * (logX_hi + logX_lo)
1485 //     P_hi -> X
1486 //     P_lo -> X_cor
1487 //     (Manipulate names so that inputs are in
1488 //     the place kernel_exp expects them)
1489 //
1490 //     This function computes exp( x  + x_cor)
1491 //     Input  FR 1: FR_X
1492 //     Input  FR 2: FR_X_cor
1493 //     Output FR 3: FR_Y_hi
1494 //     Output FR 4: FR_Y_lo
1495 //     Output FR 5: FR_Scale
1496 //     Output PR 1: PR_Safe
1497 //
1498 //     P15 is True
1499 //
1500 // Load constants used in computing N using right-shift technique
1501 { .mlx
1502       mov GR_exp_2tom51 = 0xffff-51
1503       movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc  // significand of 1/ln2
1504 }
1505 { .mlx
1506       add  GR_Special_Exp = -50,GR_exp_bias
1507       movl GR_rshf_2to51 = 0x4718000000000000   // 1.10000 2^(63+51)
1508 }
1509 ;;
1510
1511 //
1512 //     Point to Table of W1s
1513 //     Point to Table of W2s
1514 //
1515 { .mmi
1516       add GR_W1_ptr   = 0x2b0, GR_table_base    // Constants_exp_64_W1
1517       add GR_W2_ptr   = 0x4b0, GR_table_base    // Constants_exp_64_W2
1518       cmp.le p6,p0= GR_Delta_Exp,GR_Special_Exp
1519 };;
1520
1521 // Form two constants we need
1522 //  1/ln2 * 2^63  to compute  w = x * 1/ln2 * 128
1523 //  1.1000..000 * 2^(63+63-12) to right shift int(N) into the significand
1524
1525 { .mfi
1526       setf.sig  FR_INV_LN2_2TO63 = GR_sig_inv_ln2 // form 1/ln2 * 2^63
1527       nop.f 999
1528       and GR_Delta_Exp=GR_Delta_Exp,GR_exp_mask  // Get exponent of y-1
1529 }
1530 { .mlx
1531       setf.d  FR_RSHF_2TO51 = GR_rshf_2to51    // Form const 1.1000 * 2^(63+51)
1532       movl GR_rshf = 0x43e8000000000000        // 1.10000 2^63 for right shift
1533 }
1534 ;;
1535
1536 { .mfi
1537       nop.m 999
1538       fmpy.s1 FR_X_lo = FR_Input_Y, FR_logx_lo // logx_lo is Y_lo
1539       cmp.eq  p15, p0=  r0, r0                 // Set p15, assume safe
1540 };;
1541
1542 { .mmi
1543       setf.exp FR_2TOM51 = GR_exp_2tom51 // Form 2^-51 for scaling float_N
1544       setf.d  FR_RSHF = GR_rshf          // Form right shift const 1.1000 * 2^63
1545       add GR_Table_Ptr1   = 0x50, GR_table_base // Constants_exp_64_P for
1546                                                 // EXPL_SMALL path
1547 }
1548 ;;
1549
1550 { .mmi
1551       ldfe FR_P_6 = [GR_Table_Ptr1],16          // Load P_6 for EXPL_SMALL path
1552 ;;
1553       ldfe FR_P_5 = [GR_Table_Ptr1],16          // Load P_5 for EXPL_SMALL path
1554       nop.i 999
1555 }
1556 ;;
1557
1558 { .mfi
1559       ldfe FR_P_4 = [GR_Table_Ptr1],16          // Load P_4 for EXPL_SMALL path
1560       fma.s1 FR_P_hi = FR_Input_Y, FR_logx_hi,FR_X_lo  // logx_hi ix Y_hi
1561       nop.i 999
1562 }
1563 ;;
1564
1565 { .mmi
1566       ldfe FR_P_3 = [GR_Table_Ptr1],16          // Load P_3 for EXPL_SMALL path
1567 ;;
1568       ldfe FR_P_2 = [GR_Table_Ptr1],16          // Load P_2 for EXPL_SMALL path
1569       nop.i 999
1570 }
1571 ;;
1572
1573 // N = X * Inv_log2_by_2^12
1574 // By adding 1.10...0*2^63 we shift and get round_int(N_signif) in significand.
1575 // We actually add 1.10...0*2^51 to X * Inv_log2 to do the same thing.
1576 { .mfi
1577       ldfe FR_P_1 = [GR_Table_Ptr1]             // Load P_1 for EXPL_SMALL path
1578       fma.s1 FR_N = FR_X, FR_INV_LN2_2TO63, FR_RSHF_2TO51
1579       nop.i 999
1580 }
1581 { .mfb
1582       nop.m 999
1583       fms.s1 FR_P_lo= FR_Input_Y, FR_logx_hi, FR_P_hi  // P_hi is X
1584 (p6)  br.cond.spnt POWL_Y_ALMOST_1              // Branch if |y-1| < 2^-50
1585 }
1586 ;;
1587
1588 { .mmi
1589       getf.exp GR_Expo_X = FR_X
1590       add GR_T1_ptr   = 0x0b0, GR_table_base    // Constants_exp_64_T1
1591       add GR_T2_ptr   = 0x1b0, GR_table_base    // Constants_exp_64_T2
1592 }
1593 ;;
1594
1595 // float_N = round_int(N)
1596 // The signficand of N contains the rounded integer part of X * 2^12/ln2,
1597 // as a twos complement number in the lower bits (that is, it may be negative).
1598 // That twos complement number (called N) is put into GR_N_fix.
1599
1600 // Since N is scaled by 2^51, it must be multiplied by 2^-51
1601 // before the shift constant 1.10000 * 2^63 is subtracted to yield float_N.
1602 // Thus, float_N contains the floating point version of N
1603
1604
1605 { .mfi
1606       add  GR_Table_Ptr   = 0x20, GR_table_base    // Constants_exp_64_A
1607       fms.s1 FR_float_N = FR_N, FR_2TOM51, FR_RSHF // Form float_N
1608       nop.i 999
1609 }
1610 //     Create low part of Y(ln(x)_hi + ln(x)_lo) as P_lo
1611 { .mfi
1612       mov GR_Big_Pos_Exp = 0x3ffe               // 16382, largest safe exponent
1613       fadd.s1 FR_P_lo = FR_P_lo, FR_X_lo
1614       mov GR_Big_Neg_Exp = -0x3ffd              // -16381 smallest safe exponent
1615 };;
1616
1617 { .mfi
1618       nop.m 999
1619       fmpy.s1 FR_rsq = FR_X, FR_X               // rsq = X*X for EXPL_SMALL path
1620       mov GR_vsm_expo = -70                     // Exponent for very small path
1621 }
1622 { .mfi
1623       nop.m 999
1624       fma.s1 FR_poly_lo = FR_P_6, FR_X, FR_P_5  // poly_lo for EXPL_SMALL path
1625       add GR_temp = 0x1,r0                      // For tiny signif if small path
1626 }
1627 ;;
1628
1629 //
1630 //      If expo_X < -6 goto exp_small
1631 //
1632 { .mmi
1633       getf.sig GR_N_fix = FR_N
1634       ldfe FR_A_3 = [GR_Table_Ptr],16         // Load A_3
1635       and GR_Expo_X = GR_Expo_X, GR_exp_mask  // Get exponent of X
1636 }
1637 ;;
1638
1639 { .mfi
1640       ldfe FR_A_2 = [GR_Table_Ptr],16         // Load A_2
1641       nop.f 999
1642       sub GR_Expo_X = GR_Expo_X, GR_exp_bias  // Get true exponent of X
1643 }
1644 ;;
1645
1646 //
1647 //     If -6 > Expo_X, set P9 and branch
1648 //
1649 { .mfb
1650       cmp.gt  p9, p0  =  -6, GR_Expo_X
1651       fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_X // r = X - L_hi * float_N
1652 (p9)  br.cond.spnt EXPL_SMALL                  // Branch if |X| < 2^-6
1653 }
1654 ;;
1655
1656 //
1657 //     If 14 <= Expo_X, set P10
1658 //
1659 { .mib
1660       cmp.le  p10, p0 =  14, GR_Expo_X
1661       nop.i 999
1662 (p10) br.cond.spnt EXPL_HUGE                   // Branch if |X| >= 2^14
1663 }
1664 ;;
1665
1666 //
1667 //      Load single T1
1668 //      Load single T2
1669 //      W_1_p1 = W_1 + 1
1670 //
1671 { .mmi
1672       nop.m 999
1673       nop.m 999
1674       extr.u GR_M1 = GR_N_fix, 6, 6            // Extract index M_1
1675 }
1676 ;;
1677
1678 //
1679 //      k = extr.u(N_fix,0,6)
1680 //
1681 { .mmi
1682       shladd GR_W1_ptr = GR_M1,3,GR_W1_ptr     // Point to W1
1683       shladd GR_T1_ptr = GR_M1,2,GR_T1_ptr     // Point to T1
1684       extr.u GR_M2 = GR_N_fix, 0, 6            // Extract index M_2
1685 }
1686 ;;
1687
1688 // N_fix is only correct up to 50 bits because of our right shift technique.
1689 // Actually in the normal path we will have restricted K to about 14 bits.
1690 // Somewhat arbitrarily we extract 32 bits.
1691 { .mmi
1692       ldfd  FR_W1 = [GR_W1_ptr]
1693       shladd GR_W2_ptr = GR_M2,3,GR_W2_ptr     // Point to W2
1694       extr GR_k = GR_N_fix, 12, 32             // Extract k
1695 }
1696 ;;
1697
1698 { .mfi
1699       ldfs  FR_T1 = [GR_T1_ptr]
1700       fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r
1701       shladd GR_T2_ptr = GR_M2,2,GR_T2_ptr     // Point to T2
1702 }
1703 { .mfi
1704       add GR_exp_bias_p_k = GR_exp_bias, GR_k
1705       nop.f 999
1706       cmp.gt  p14,p15 = GR_k,GR_Big_Pos_Exp
1707 }
1708 ;;
1709
1710 //
1711 //      if k < big_neg_exp, set p14 and Safe=False
1712 //
1713 { .mmi
1714       ldfs  FR_T2 = [GR_T2_ptr]
1715 (p15) cmp.lt p14,p15 = GR_k,GR_Big_Neg_Exp
1716       nop.i 999
1717 }
1718 ;;
1719
1720 { .mmi
1721       setf.exp FR_Scale = GR_exp_bias_p_k
1722       ldfd  FR_W2 = [GR_W2_ptr]
1723       nop.i 999
1724 }
1725 ;;
1726
1727 { .mfi
1728       ldfe FR_A_1 = [GR_Table_Ptr],16
1729       fadd.s1 FR_r = FR_r, FR_X_cor
1730       nop.i 999
1731 }
1732 ;;
1733
1734 { .mfi
1735       nop.m 999
1736       fadd.s1 FR_W_1_p1 = FR_W1, f1
1737       nop.i 999
1738 }
1739 ;;
1740
1741 { .mfi
1742       nop.m 999
1743       fma.s1 FR_poly = FR_r, FR_A_3, FR_A_2
1744       nop.i 999
1745 }
1746 { .mfi
1747       nop.m 999
1748       fmpy.s1 FR_rsq = FR_r, FR_r
1749       nop.i 999
1750 }
1751 ;;
1752
1753 { .mfi
1754       nop.m 999
1755       fmpy.s1 FR_T = FR_T1, FR_T2
1756       nop.i 999
1757 }
1758 ;;
1759
1760 { .mfi
1761       nop.m 999
1762       fma.s1 FR_W = FR_W2, FR_W_1_p1, FR_W1
1763       nop.i 999
1764 }
1765 ;;
1766
1767 { .mfi
1768       nop.m 999
1769       fma.s1 FR_TMP1 = FR_Scale, FR_Sgn, f0
1770       nop.i 999
1771 }
1772 ;;
1773
1774 { .mfi
1775       nop.m 999
1776       fma.s1 FR_poly = FR_r, FR_poly, FR_A_1
1777       nop.i 999
1778 }
1779 ;;
1780
1781 { .mfi
1782       nop.m 999
1783       fma.s1 FR_TMP2 = FR_T, f1, f0            // TMP2 = Y_hi = T
1784       nop.i 999
1785 }
1786 ;;
1787
1788 { .mfi
1789       nop.m 999
1790       fadd.s1 FR_Wp1 = FR_W, f1
1791       nop.i 999
1792 }
1793 ;;
1794
1795 { .mfi
1796       nop.m 999
1797       fma.s1 FR_poly = FR_rsq, FR_poly,FR_r
1798       nop.i 999
1799 }
1800 ;;
1801
1802 { .mfi
1803       nop.m 999
1804       fma.s1 FR_Tscale = FR_T, FR_TMP1, f0    // Scale * Sgn * T
1805       nop.i 999
1806 }
1807 { .mfi
1808       nop.m 999
1809       fma.s1 FR_Y_lo = FR_Wp1, FR_poly, FR_W
1810       nop.i 999
1811 }
1812 ;;
1813
1814 { .mfb
1815       nop.m 999
1816       fmpy.s1 FR_TMP3 = FR_Y_lo, FR_Tscale
1817       br.cond.sptk POWL_64_SHARED
1818 }
1819 ;;
1820
1821
1822 EXPL_SMALL:
1823 // Here if |ylogx| < 2^-6
1824 //
1825 //     Begin creating lsb to perturb final result
1826 //
1827 { .mfi
1828       setf.sig FR_temp = GR_temp
1829       fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_4
1830       cmp.lt  p12, p0 =  GR_Expo_X, GR_vsm_expo   // Test |ylogx| < 2^-70
1831 }
1832 { .mfi
1833       nop.m 999
1834       fma.s1 FR_poly_hi = FR_P_2, FR_X, FR_P_1
1835       nop.i 999
1836 }
1837 ;;
1838
1839 { .mfi
1840       nop.m 999
1841       fmpy.s1 FR_TMP2 = f1, f1
1842       nop.i 999
1843 }
1844 { .mfi
1845       nop.m 999
1846       fmpy.s1 FR_TMP1 = FR_Sgn, f1
1847       nop.i 999
1848 }
1849 ;;
1850
1851 { .mfi
1852       nop.m 999
1853       fmpy.s1 FR_r4 = FR_rsq, FR_rsq
1854 (p12) cmp.eq  p15, p0 =  r0, r0                   // Set safe if |ylogx| < 2^-70
1855 }
1856 { .mfb
1857       nop.m 999
1858 (p12) fmpy.s1 FR_TMP3 = FR_Sgn, FR_X
1859 (p12) br.cond.spnt POWL_64_SHARED                 // Branch if |ylogx| < 2^-70
1860 }
1861 ;;
1862
1863 { .mfi
1864       nop.m 999
1865       fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_3
1866       nop.i 999
1867 }
1868 { .mfi
1869       nop.m 999
1870       fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, FR_X
1871       nop.i 999
1872 }
1873 ;;
1874
1875 { .mfi
1876       nop.m 999
1877       fma.s1 FR_Y_lo = FR_poly_lo, FR_r4, FR_poly_hi
1878       nop.i 999
1879 }
1880 ;;
1881
1882 { .mfi
1883       nop.m 999
1884       fmpy.s1 FR_TMP3 = FR_Y_lo, FR_TMP1      // Add sign info
1885       nop.i 999
1886 }
1887 ;;
1888
1889 //
1890 //     Toggle on last bit of Y_lo
1891 //     Set lsb of Y_lo to 1
1892 //
1893 { .mfi
1894       nop.m 999
1895       for FR_temp = FR_Y_lo,FR_temp
1896       nop.i 999
1897 }
1898 ;;
1899
1900 { .mfb
1901       nop.m 999
1902       fmerge.se FR_TMP3 = FR_TMP3,FR_temp
1903       br.cond.sptk POWL_64_SHARED
1904 }
1905 ;;
1906
1907
1908 EXPL_HUGE:
1909 // Here if |ylogx| >= 2^14
1910 { .mfi
1911       mov GR_temp = 0x0A1DC               // If X < 0, exponent -24100
1912       fcmp.gt.s1 p12, p13 =  FR_X, f0     // Test X > 0
1913       cmp.eq  p14, p15 =  r0, r0          // Set Safe to false
1914 }
1915 ;;
1916
1917 { .mmi
1918 (p12) mov GR_Mask = 0x15DC0               // If X > 0, exponent +24000
1919 (p13) mov GR_Mask = 0x0A240               // If X < 0, exponent -24000
1920       nop.i 999
1921 }
1922 ;;
1923
1924 { .mmf
1925       setf.exp FR_TMP2 = GR_Mask          // Form Y_hi = TMP2
1926 (p13) setf.exp FR_Y_lo = GR_temp          // If X < 0, Y_lo = 2^-24100
1927 (p12) mov FR_Y_lo = f1                    // IF X > 0, Y_lo = 1.0
1928 }
1929 ;;
1930
1931 { .mfi
1932       nop.m 999
1933       fmpy.s1 FR_TMP1 = FR_TMP2, FR_Sgn   // TMP1 = Y_hi * Sgn
1934       nop.i 999
1935 }
1936 ;;
1937
1938 { .mfb
1939       nop.m 999
1940       fmpy.s1 FR_TMP3 = FR_Y_lo,FR_TMP1   // TMP3 = Y_lo * (Y_hi * Sgn)
1941       br.cond.sptk POWL_64_SHARED
1942 }
1943 ;;
1944
1945 POWL_Y_ALMOST_1:
1946 // Here if delta = |y-1| < 2^-50
1947 //
1948 //  x**(1 + delta) = x * e (ln(x)*delta) = x ( 1 + ln(x) * delta)
1949 //
1950 // Computation will be safe for 2^-16381 <= x < 2^16383
1951
1952 { .mfi
1953        mov GR_exp_ynear1_oflow = 0xffff + 16383
1954        fma.s1 FR_TMP1 = FR_Input_X,FR_Delta,f0
1955        and GR_exp_x = GR_exp_mask, GR_signexp_x
1956 }
1957 ;;
1958
1959 { .mfi
1960        cmp.lt  p15, p14 =  GR_exp_x, GR_exp_ynear1_oflow
1961        fma.s1 FR_TMP2 = FR_logx_hi,f1,FR_X_lo
1962        mov GR_exp_ynear1_uflow = 0xffff - 16381
1963 }
1964 ;;
1965
1966 { .mfb
1967 (p15)  cmp.ge  p15, p14 =  GR_exp_x, GR_exp_ynear1_uflow
1968        fma.s1 FR_TMP3 = FR_Input_X,f1,f0
1969        br.cond.sptk POWL_64_SHARED
1970 };;
1971
1972 POWL_64_SQUARE:
1973 //
1974 //      Here if x not zero and y=2.
1975 //
1976 //      Setup for multipath code
1977 //
1978 { .mfi
1979       mov GR_exp_square_oflow = 0xffff + 8192   // Exponent where x*x overflows
1980       fmerge.se FR_TMP1 = FR_Input_X, FR_Input_X
1981       and GR_exp_x = GR_exp_mask, GR_signexp_x  // Get exponent of x
1982 }
1983 ;;
1984
1985 { .mfi
1986       cmp.lt  p15, p14 =  GR_exp_x, GR_exp_square_oflow // Decide safe/unsafe
1987       fmerge.se FR_TMP2 = FR_Input_X, FR_Input_X
1988       mov GR_exp_square_uflow = 0xffff - 8191   // Exponent where x*x underflows
1989 }
1990 ;;
1991
1992 { .mfi
1993 (p15) cmp.ge  p15, p14 =  GR_exp_x, GR_exp_square_uflow // Decide safe/unsafe
1994       fma.s1 FR_TMP3 = f0,f0,f0
1995       nop.i 999
1996 }
1997 ;;
1998
1999 //
2000 //      This is the shared path that will set overflow and underflow.
2001 //
2002 POWL_64_SHARED:
2003
2004 //
2005 //      Return if no danger of over or underflow.
2006 //
2007 { .mfb
2008       nop.m 999
2009       fma.s0 FR_Result = FR_TMP1, FR_TMP2, FR_TMP3
2010 (p15) br.ret.sptk  b0      // Main path return if certain no over/underflow
2011 }
2012 ;;
2013
2014 //
2015 //      S0 user supplied status
2016 //      S2 user supplied status + WRE + TD  (Overflows)
2017 //      S2 user supplied status + FZ + TD   (Underflows)
2018 //
2019 //
2020 //     If (Safe) is true, then
2021 //        Compute result using user supplied status field.
2022 //        No overflow or underflow here, but perhaps inexact.
2023 //        Return
2024 //     Else
2025 //       Determine if overflow or underflow was raised.
2026 //       Fetch +/- overflow threshold for IEEE double extended
2027
2028 { .mfi
2029       nop.m 999
2030       fsetc.s2 0x7F,0x41       // For underflow test, set S2=User+TD+FTZ
2031       nop.i 999
2032 }
2033 ;;
2034
2035 { .mfi
2036       nop.m 999
2037       fma.s2 FR_Result_small = FR_TMP1, FR_TMP2, FR_TMP3
2038       nop.i 999
2039 }
2040 ;;
2041
2042 { .mfi
2043       nop.m 999
2044       fsetc.s2 0x7F,0x42       // For overflow test, set S2=User+TD+WRE
2045       nop.i 999
2046 }
2047 ;;
2048
2049 { .mfi
2050       nop.m 999
2051       fma.s2 FR_Result_big = FR_TMP1, FR_TMP2,FR_TMP3
2052       nop.i 999
2053 }
2054 ;;
2055
2056 { .mfi
2057       nop.m 999
2058       fsetc.s2 0x7F,0x40       // Reset S2=User
2059       nop.i 999
2060 }
2061 ;;
2062
2063 { .mfi
2064       nop.m 999
2065       fclass.m p11, p0 = FR_Result_small, 0x00F // Test small result unorm/zero
2066       nop.i 999
2067 }
2068 ;;
2069
2070 { .mfi
2071       nop.m 999
2072       fcmp.ge.s1 p8, p0 = FR_Result_big , FR_Big // Test >= + oflow threshold
2073       nop.i 999
2074 }
2075 ;;
2076
2077 { .mfb
2078 (p11) mov   GR_Parameter_TAG = 19                // Set tag for underflow
2079       fcmp.le.s1 p9, p0 = FR_Result_big, FR_NBig // Test <= - oflow threshold
2080 (p11) br.cond.spnt __libm_error_region           // Branch if pow underflowed
2081 }
2082 ;;
2083
2084 { .mfb
2085 (p8)  mov   GR_Parameter_TAG = 18                // Set tag for overflow
2086       nop.f 999
2087 (p8)  br.cond.spnt __libm_error_region           // Branch if pow +overflow
2088 }
2089 ;;
2090
2091 { .mbb
2092 (p9)  mov   GR_Parameter_TAG = 18                // Set tag for overflow
2093 (p9)  br.cond.spnt __libm_error_region           // Branch if pow -overflow
2094       br.ret.sptk  b0                            // Branch if result really ok
2095 }
2096 ;;
2097
2098
2099 POWL_64_SPECIAL:
2100 // Here if x or y is NatVal, nan, inf, or zero
2101 { .mfi
2102       nop.m 999
2103       fcmp.eq.s1 p15, p0 =  FR_Input_X, f1  // Test x=+1
2104       nop.i 999
2105 }
2106 ;;
2107
2108 { .mfi
2109       nop.m 999
2110       fclass.m p8, p0 =  FR_Input_X, 0x143  // Test x natval, snan
2111       nop.i 999
2112 }
2113 ;;
2114
2115 { .mfi
2116       nop.m 999
2117 (p15) fcmp.eq.unc.s0 p6,p0 = FR_Input_Y, f0 // If x=1, flag invalid if y=SNaN
2118       nop.i 999
2119 }
2120 { .mfb
2121       nop.m 999
2122 (p15) fmpy.s0 FR_Result = f1,f1             // If x=1, result=1
2123 (p15) br.ret.spnt b0                        // Exit if x=1
2124 }
2125 ;;
2126
2127 { .mfi
2128       nop.m 999
2129       fclass.m p6, p0 =  FR_Input_Y, 0x007  // Test y zero
2130       nop.i 999
2131 }
2132 ;;
2133
2134 { .mfi
2135       nop.m 999
2136       fclass.m p9, p0 =  FR_Input_Y, 0x143  // Test y natval, snan
2137       nop.i 999
2138 }
2139 ;;
2140
2141 { .mfi
2142       nop.m 999
2143       fclass.m p10, p0 =  FR_Input_X, 0x083 // Test x qnan
2144       nop.i 999
2145 }
2146 { .mfi
2147       nop.m 999
2148 (p8)  fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X // If x=snan, result=qnan
2149 (p6)  cmp.ne p8,p0 = r0,r0     // Don't exit if x=snan, y=0 ==> result=+1
2150 }
2151 ;;
2152
2153 { .mfi
2154       nop.m 999
2155 (p6)  fclass.m.unc p15, p0 =  FR_Input_X,0x007   // Test x=0, y=0
2156       nop.i 999
2157 }
2158 { .mfb
2159       nop.m 999
2160 (p9)  fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X // If y=snan, result=qnan
2161 (p8)  br.ret.spnt b0                             // Exit if x=snan, y not 0,
2162                                                  //   result=qnan
2163 }
2164 ;;
2165
2166 { .mfi
2167       nop.m 999
2168       fcmp.eq.s1 p7, p0 =  FR_Input_Y, f1        // Test y +1.0
2169       nop.i 999
2170 }
2171 { .mfb
2172       nop.m 999
2173 (p10) fmpy.s0 FR_Result = FR_Input_X, f0         // If x=qnan, result=qnan
2174 (p9)  br.ret.spnt b0                             // Exit if y=snan, result=qnan
2175 }
2176 ;;
2177
2178 { .mfi
2179       nop.m 999
2180 (p6)  fclass.m.unc p8, p0 =  FR_Input_X,0x0C3    // Test x=nan, y=0
2181       nop.i 999
2182 }
2183 ;;
2184
2185 { .mfi
2186       nop.m 999
2187 (p6)  fcmp.eq.s0 p9,p0 = FR_Input_X, f0          // If y=0, flag if x denormal
2188       nop.i 999
2189 }
2190 { .mfi
2191       nop.m 999
2192 (p6)  fadd.s0 FR_Result = f1, f0                 // If y=0, result=1
2193       nop.i 999
2194 }
2195 ;;
2196
2197 { .mfi
2198       nop.m 999
2199       fclass.m p11, p0 =  FR_Input_Y, 0x083      // Test y qnan
2200       nop.i 999
2201 }
2202 { .mfb
2203 (p15) mov GR_Parameter_TAG = 20                  // Error tag for x=0, y=0
2204 (p7)  fmpy.s0 FR_Result = FR_Input_X,f1          // If y=1, result=x
2205 (p15) br.cond.spnt __libm_error_region           // Branch if x=0, y=0, result=1
2206 }
2207 ;;
2208
2209 { .mfb
2210 (p8)  mov GR_Parameter_TAG = 23                  // Error tag for x=nan, y=0
2211       fclass.m p14, p0 =  FR_Input_Y, 0x023      // Test y inf
2212 (p8)  br.cond.spnt __libm_error_region           // Branch if x=snan, y=0,
2213                                                  //   result=1
2214 }
2215 ;;
2216
2217 { .mfb
2218       nop.m 999
2219       fclass.m p13, p0 =  FR_Input_X, 0x023      // Test x inf
2220 (p6)  br.ret.spnt b0                             // Exit y=0, x not nan or 0,
2221                                                  //   result=1
2222 }
2223 ;;
2224
2225 { .mfb
2226       nop.m 999
2227 (p14) fcmp.eq.unc.s1 p0,p14 = FR_Input_X,f0      // Test x not 0, y=inf
2228 (p7)  br.ret.spnt b0                             // Exit y=1, x not snan,
2229                                                  //   result=x
2230 }
2231 ;;
2232
2233 { .mfb
2234       nop.m 999
2235 (p10) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X  // If x=qnan, y not snan,
2236                                                  //   result=qnan
2237 (p10) br.ret.spnt b0                             // Exit x=qnan, y not snan,
2238                                                  //   result=qnan
2239 }
2240 ;;
2241
2242 { .mfb
2243       nop.m 999
2244 (p11) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X  // If y=qnan, x not nan or 1,
2245                                                  //   result=qnan
2246 (p11) br.ret.spnt b0                             // Exit y=qnan, x not nan or 1,
2247                                                  //   result=qnan
2248 }
2249 ;;
2250
2251 { .mbb
2252       nop.m 999
2253 (p14) br.cond.spnt POWL_64_Y_IS_INF           // Branch if y=inf, x not 1 or nan
2254 (p13) br.cond.spnt POWL_64_X_IS_INF           // Branch if x=inf, y not 1 or nan
2255 }
2256 ;;
2257
2258
2259 POWL_64_X_IS_ZERO:
2260 // Here if x=0, y not nan or 1 or inf or 0
2261
2262 // There is logic starting here to determine if y is an integer when x = 0.
2263 // If 0 < |y| < 1 then clearly y is not an integer.
2264 // If |y| > 1, then the significand of y is shifted left by the size of
2265 //    the exponent of y.  This preserves the lsb of the integer part + the
2266 //    fractional bits.  The lsb of the integer can be tested to determine if
2267 //    the integer is even or odd.  The fractional bits can be tested.  If zero,
2268 //    then y is an integer.
2269 //
2270 { .mfi
2271       and GR_exp_y = GR_exp_mask,GR_signexp_y   // Get biased exponent of y
2272       nop.f 999
2273       and GR_y_sign = GR_sign_mask,GR_signexp_y // Get sign of y
2274 }
2275 ;;
2276
2277 //
2278 //     Maybe y is < 1 already, so
2279 //     can never be an integer.
2280 //
2281 { .mfi
2282       cmp.lt  p9, p8 = GR_exp_y,GR_exp_bias     // Test 0 < |y| < 1
2283       nop.f 999
2284       sub GR_exp_y = GR_exp_y,GR_exp_bias       // Get true exponent of y
2285 }
2286 ;;
2287
2288 //
2289 //     Shift significand of y looking for nonzero bits
2290 //     For y > 1, shift signif_y exp_y bits to the left
2291 //     For y < 1, turn on 4 low order bits of significand of y
2292 //     so that the fraction will always be non-zero
2293 //
2294 { .mmi
2295 (p9)  or  GR_exp_y=  0xF,GR_signif_y            // Force nonzero fraction if y<1
2296 ;;
2297       nop.m 999
2298 (p8)  shl GR_exp_y=  GR_signif_y,GR_exp_y       // Get lsb of int + fraction
2299                                                 // Wait 4 cycles to use result
2300 }
2301 ;;
2302
2303 { .mmi
2304       nop.m 999
2305 ;;
2306       nop.m 999
2307       nop.i 999
2308 }
2309 ;;
2310
2311 { .mmi
2312       nop.m 999
2313 ;;
2314       nop.m 999
2315       shl GR_fraction_y=  GR_exp_y,1            // Shift left 1 to get fraction
2316 }
2317 ;;
2318
2319 //
2320 //     Integer part of y  shifted off.
2321 //     Get y's low even or odd bit - y might not be an int.
2322 //
2323 { .mii
2324       cmp.eq  p13,p0  =  GR_fraction_y, r0      // Test for y integer
2325       cmp.eq  p8,p0 =  GR_y_sign, r0            // Test for y > 0
2326 ;;
2327 (p13) tbit.nz.unc p13,p0 = GR_exp_y, 63         // Test if y an odd integer
2328 }
2329 ;;
2330
2331 { .mfi
2332 (p13) cmp.eq.unc p13,p14 =  GR_y_sign, r0   // Test y pos odd integer
2333 (p8)  fcmp.eq.s0 p12,p0 = FR_Input_Y, f0    // If x=0 and y>0 flag if y denormal
2334       nop.i 999
2335 }
2336 ;;
2337
2338 //
2339 //     Return +/-0 when x=+/-0 and y is positive odd integer
2340 //
2341 { .mfb
2342       nop.m 999
2343 (p13) mov FR_Result = FR_Input_X            // If x=0,  y pos odd int, result=x
2344 (p13) br.ret.spnt b0                        // Exit x=0, y pos odd int, result=x
2345 }
2346 ;;
2347
2348 //
2349 //     Return +/-inf when x=+/-0 and y is negative odd int
2350 //
2351 { .mfb
2352 (p14) mov GR_Parameter_TAG = 21
2353 (p14) frcpa.s0 FR_Result, p0 = f1, FR_Input_X  // Result +-inf, set Z flag
2354 (p14) br.cond.spnt __libm_error_region
2355 }
2356 ;;
2357
2358 //
2359 //     Return +0 when x=+/-0 and y positive and not an odd integer
2360 //
2361 { .mfb
2362       nop.m 999
2363 (p8)  mov FR_Result = f0      // If x=0, y>0 and not odd integer, result=+0
2364 (p8)  br.ret.sptk b0          // Exit x=0, y>0 and not odd integer, result=+0
2365 }
2366 ;;
2367
2368 //
2369 //     Return +inf when x=+/-0 and y is negative and not odd int
2370 //
2371 { .mfb
2372       mov GR_Parameter_TAG = 21
2373       frcpa.s0 FR_Result, p10 = f1,f0   // Result +inf, raise Z flag
2374       br.cond.sptk __libm_error_region
2375 }
2376 ;;
2377
2378
2379 POWL_64_X_IS_INF:
2380 //
2381 // Here if x=inf, y not 1 or nan
2382 //
2383 { .mfi
2384       and GR_exp_y = GR_exp_mask,GR_signexp_y   // Get biased exponent y
2385       fclass.m p13, p0 =  FR_Input_X,0x022      // Test x=-inf
2386       nop.i 999
2387 }
2388 ;;
2389
2390 { .mfi
2391       and GR_y_sign = GR_sign_mask,GR_signexp_y // Get sign of y
2392       fcmp.eq.s0 p9,p0 = FR_Input_Y, f0         // Dummy to set flag if y denorm
2393       nop.i 999
2394 }
2395 ;;
2396
2397 //
2398 //     Maybe y is < 1 already, so
2399 //     isn't an int.
2400 //
2401 { .mfi
2402 (p13) cmp.lt.unc  p9, p8 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1 if x=-inf
2403       fclass.m p11, p0 =  FR_Input_X,0x021      // Test x=+inf
2404       sub GR_exp_y = GR_exp_y,GR_exp_bias       // Get true exponent y
2405 }
2406 ;;
2407
2408 //
2409 //     Shift significand of y looking for nonzero bits
2410 //     For y > 1, shift signif_y exp_y bits to the left
2411 //     For y < 1, turn on 4 low order bits of significand of y
2412 //     so that the fraction will always be non-zero
2413 //
2414 { .mmi
2415 (p9)  or  GR_exp_y=  0xF,GR_signif_y          // Force nonzero fraction if y<1
2416 ;;
2417 (p11) cmp.eq.unc  p14,p12 = GR_y_sign, r0     // Test x=+inf, y>0
2418 (p8)  shl GR_exp_y=  GR_signif_y,GR_exp_y     // Get lsb of int + fraction
2419                                               // Wait 4 cycles to use result
2420 }
2421 ;;
2422
2423 //
2424 //     Return +inf for x=+inf, y > 0
2425 //     Return +0   for x=+inf, y < 0
2426 //
2427 { .mfi
2428       nop.m 999
2429 (p12) mov FR_Result = f0                      // If x=+inf, y<0, result=+0
2430       nop.i 999
2431 }
2432 { .mfb
2433       nop.m 999
2434 (p14) fma.s0 FR_Result = FR_Input_X,f1,f0     // If x=+inf, y>0, result=+inf
2435 (p11) br.ret.sptk b0                          // Exit x=+inf
2436 }
2437 ;;
2438
2439 //
2440 // Here only if x=-inf.  Wait until can use result of shl...
2441 //
2442 { .mmi
2443       nop.m 999
2444 ;;
2445       nop.m 999
2446       nop.i 999
2447 }
2448 ;;
2449
2450 { .mfi
2451       cmp.eq  p8,p9 = GR_y_sign, r0           // Test y pos
2452       nop.f 999
2453       shl GR_fraction_y = GR_exp_y,1          // Shift left 1 to get fraction
2454 }
2455 ;;
2456
2457 { .mmi
2458       cmp.eq  p13,p0 = GR_fraction_y, r0      // Test y integer
2459 ;;
2460       nop.m 999
2461 (p13) tbit.nz.unc  p13,p0 = GR_exp_y, 63      // Test y odd integer
2462 }
2463 ;;
2464
2465 //
2466 //     Is y even or odd?
2467 //
2468 { .mii
2469 (p13) cmp.eq.unc  p14,p10 = GR_y_sign, r0     // Test x=-inf, y pos odd int
2470 (p13) cmp.ne.and  p8,p9 = r0,r0               // If y odd int, turn off p8,p9
2471       nop.i 999
2472 }
2473 ;;
2474
2475 //
2476 //     Return -0   for x = -inf and y < 0 and odd int.
2477 //     Return -Inf for x = -inf and y > 0 and odd int.
2478 //
2479 { .mfi
2480       nop.m 999
2481 (p10) fmerge.ns FR_Result = f0, f0      // If x=-inf, y neg odd int, result=-0
2482       nop.i 999
2483 }
2484 { .mfi
2485       nop.m 999
2486 (p14) fmpy.s0 FR_Result = FR_Input_X,f1 // If x=-inf, y pos odd int, result=-inf
2487       nop.i 999
2488 }
2489 ;;
2490
2491 //
2492 //     Return Inf for x = -inf and y > 0 not an odd int.
2493 //     Return +0  for x = -inf and y < 0 not an odd int.
2494 //
2495 .pred.rel "mutex",p8,p9
2496 { .mfi
2497       nop.m 999
2498 (p8)  fmerge.ns FR_Result = FR_Input_X, FR_Input_X // If x=-inf, y>0 not odd int
2499                                                    //   result=+inf
2500       nop.i 999
2501 }
2502 { .mfb
2503       nop.m 999
2504 (p9)  fmpy.s0 FR_Result = f0,f0                    // If x=-inf, y<0 not odd int
2505                                                    //   result=+0
2506       br.ret.sptk b0                               // Exit for x=-inf
2507 }
2508 ;;
2509
2510
2511 POWL_64_Y_IS_INF:
2512 // Here if y=inf, x not 1 or nan
2513 //
2514 //     For y = +Inf and |x| < 1  returns 0
2515 //     For y = +Inf and |x| > 1  returns Inf
2516 //     For y = -Inf and |x| < 1  returns Inf
2517 //     For y = -Inf and |x| > 1  returns 0
2518 //     For y =  Inf and |x| = 1  returns 1
2519 //
2520 { .mfi
2521       nop.m 999
2522       fclass.m p8, p0 =  FR_Input_Y, 0x021    // Test y=+inf
2523       nop.i 999
2524 }
2525 ;;
2526
2527 { .mfi
2528       nop.m 999
2529       fclass.m p9, p0 =  FR_Input_Y, 0x022    // Test y=-inf
2530       nop.i 999
2531 }
2532 ;;
2533
2534 { .mfi
2535       nop.m 999
2536       fabs FR_X = FR_Input_X                  // Form |x|
2537       nop.i 999
2538 }
2539 ;;
2540
2541 { .mfi
2542       nop.m 999
2543       fcmp.eq.s0 p10,p0 = FR_Input_X, f0      // flag if x denormal
2544       nop.i 999
2545 }
2546 ;;
2547
2548 { .mfi
2549       nop.m 999
2550 (p8)  fcmp.lt.unc.s1 p6, p0  =  FR_X, f1      // Test y=+inf, |x|<1
2551       nop.i 999
2552 }
2553 ;;
2554
2555 { .mfi
2556       nop.m 999
2557 (p8)  fcmp.gt.unc.s1 p7, p0  =  FR_X, f1      // Test y=+inf, |x|>1
2558       nop.i 999
2559 }
2560 ;;
2561
2562 { .mfi
2563       nop.m 999
2564 (p9)  fcmp.lt.unc.s1 p12, p0 =  FR_X, f1      // Test y=-inf, |x|<1
2565       nop.i 999
2566 }
2567 { .mfi
2568       nop.m 999
2569 (p6)  fmpy.s0 FR_Result = f0,f0               // If y=+inf, |x|<1, result=+0
2570       nop.i 999
2571 }
2572 ;;
2573
2574 { .mfi
2575       nop.m 999
2576 (p9)  fcmp.gt.unc.s1 p13, p0 =  FR_X, f1      // Test y=-inf, |x|>1
2577       nop.i 999
2578 }
2579 { .mfi
2580       nop.m 999
2581 (p7)  fmpy.s0 FR_Result = FR_Input_Y, f1      // If y=+inf, |x|>1, result=+inf
2582       nop.i 999
2583 }
2584 ;;
2585
2586 { .mfi
2587       nop.m 999
2588       fcmp.eq.s1 p14, p0 =  FR_X, f1          // Test y=inf, |x|=1
2589       nop.i 999
2590 }
2591 { .mfi
2592       nop.m 999
2593 (p12) fnma.s0 FR_Result = FR_Input_Y, f1, f0  // If y=-inf, |x|<1, result=+inf
2594       nop.i 999
2595 }
2596 ;;
2597
2598 { .mfi
2599       nop.m 999
2600 (p13) mov FR_Result = f0                      // If y=-inf, |x|>1, result=+0
2601       nop.i 999
2602 }
2603 ;;
2604
2605 { .mfb
2606       nop.m 999
2607 (p14) fmpy.s0 FR_Result = f1,f1               // If y=inf, |x|=1, result=+1
2608       br.ret.sptk b0                          // Common return for y=inf
2609 }
2610 ;;
2611
2612
2613 // Here if x or y denorm/unorm
2614 POWL_DENORM:
2615 { .mmi
2616       getf.sig GR_signif_Z = FR_norm_X   // Get significand of x
2617 ;;
2618       getf.exp GR_signexp_y = FR_norm_Y  // Get sign and exp of y
2619       nop.i 999
2620 }
2621 ;;
2622
2623 { .mfi
2624       getf.sig GR_signif_y = FR_norm_Y   // Get significand of y
2625       nop.f 999
2626       nop.i 999
2627 }
2628 ;;
2629
2630 { .mib
2631       getf.exp GR_signexp_x = FR_norm_X  // Get sign and exp of x
2632       extr.u GR_Index1 = GR_signif_Z, 59, 4  // Extract upper 4 signif bits of x
2633       br.cond.sptk  POWL_COMMON          // Branch back to main path
2634 }
2635 ;;
2636
2637
2638 POWL_64_UNSUPPORT:
2639 //
2640 //     Raise exceptions for specific
2641 //     values - pseudo NaN and
2642 //     infinities.
2643 //     Return NaN and raise invalid
2644 //
2645 { .mfb
2646       nop.m 999
2647       fmpy.s0 FR_Result = FR_Input_X,f0
2648       br.ret.sptk b0
2649 }
2650 ;;
2651
2652 POWL_64_XNEG:
2653 //
2654 //     Raise invalid for x < 0  and
2655 //     y not an integer
2656 //
2657 { .mfi
2658       nop.m 999
2659       frcpa.s0 FR_Result, p8 =  f0, f0
2660       mov GR_Parameter_TAG = 22
2661 }
2662 { .mib
2663       nop.m 999
2664       nop.i 999
2665       br.cond.sptk __libm_error_region
2666 }
2667 ;;
2668
2669 POWL_64_SQRT:
2670 { .mfi
2671       nop.m 999
2672       frsqrta.s0 FR_Result,p10 = FR_save_Input_X
2673       nop.i 999 ;;
2674 }
2675 { .mfi
2676       nop.m 999
2677 (p10) fma.s1   f62=FR_Half,FR_save_Input_X,f0
2678       nop.i 999 ;;
2679 }
2680 { .mfi
2681       nop.m 999
2682 (p10) fma.s1   f63=FR_Result,FR_Result,f0
2683       nop.i 999 ;;
2684 }
2685 { .mfi
2686       nop.m 999
2687 (p10) fnma.s1  f32=f63,f62,FR_Half
2688       nop.i 999 ;;
2689 }
2690 { .mfi
2691       nop.m 999
2692 (p10) fma.s1   f33=f32,FR_Result,FR_Result
2693       nop.i 999 ;;
2694 }
2695 { .mfi
2696       nop.m 999
2697 (p10) fma.s1   f34=f33,f62,f0
2698       nop.i 999 ;;
2699 }
2700 { .mfi
2701       nop.m 999
2702 (p10) fnma.s1  f35=f34,f33,FR_Half
2703       nop.i 999 ;;
2704 }
2705 { .mfi
2706       nop.m 999
2707 (p10) fma.s1   f63=f35,f33,f33
2708       nop.i 999 ;;
2709 }
2710 { .mfi
2711       nop.m 999
2712 (p10) fma.s1   f32=FR_save_Input_X,f63,f0
2713       nop.i 999
2714 }
2715 { .mfi
2716       nop.m 999
2717 (p10) fma.s1   FR_Result=f63,f62,f0
2718       nop.i 999 ;;
2719 }
2720 { .mfi
2721       nop.m 999
2722 (p10) fma.s1   f33=f11,f63,f0
2723       nop.i 999 ;;
2724 }
2725 { .mfi
2726       nop.m 999
2727 (p10) fnma.s1  f34=f32,f32,FR_save_Input_X
2728       nop.i 999
2729 }
2730 { .mfi
2731       nop.m 999
2732 (p10) fnma.s1  f35=FR_Result,f63,FR_Half
2733       nop.i 999 ;;
2734 }
2735 { .mfi
2736       nop.m 999
2737 (p10) fma.s1   f62=f33,f34,f32
2738       nop.i 999
2739 }
2740 { .mfi
2741       nop.m 999
2742 (p10) fma.s1   f63=f33,f35,f33
2743       nop.i 999 ;;
2744 }
2745 { .mfi
2746       nop.m 999
2747 (p10) fnma.s1  f32=f62,f62,FR_save_Input_X
2748       nop.i 999 ;;
2749 }
2750 { .mfb
2751       nop.m 999
2752 (p10) fma.s0 FR_Result=f32,f63,f62
2753       br.ret.sptk   b0                // Exit for x > 0, y = 0.5
2754 }
2755 ;;
2756
2757 GLOBAL_LIBM_END(powl)
2758
2759
2760 LOCAL_LIBM_ENTRY(__libm_error_region)
2761 .prologue
2762 { .mfi
2763         add   GR_Parameter_Y=-32,sp             // Parameter 2 value
2764         nop.f 0
2765 .save   ar.pfs,GR_SAVE_PFS
2766         mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
2767 }
2768 { .mfi
2769 .fframe 64
2770         add sp=-64,sp                           // Create new stack
2771         nop.f 0
2772         mov GR_SAVE_GP=gp                       // Save gp
2773 };;
2774 { .mmi
2775         stfe [GR_Parameter_Y] = FR_Input_Y,16   // Save Parameter 2 on stack
2776         add GR_Parameter_X = 16,sp              // Parameter 1 address
2777 .save   b0, GR_SAVE_B0
2778         mov GR_SAVE_B0=b0                       // Save b0
2779 };;
2780 .body
2781 { .mib
2782         stfe [GR_Parameter_X] = FR_save_Input_X // Store Parameter 1 on stack
2783         add   GR_Parameter_RESULT = 0,GR_Parameter_Y
2784         nop.b 0                                 // Parameter 3 address
2785 }
2786 { .mib
2787         stfe [GR_Parameter_Y] = FR_Result       // Store Parameter 3 on stack
2788         add   GR_Parameter_Y = -16,GR_Parameter_Y
2789         br.call.sptk b0=__libm_error_support#   // Call error handling function
2790 };;
2791 { .mmi
2792         add   GR_Parameter_RESULT = 48,sp
2793         nop.m 0
2794         nop.i 0
2795 };;
2796 { .mmi
2797         ldfe  f8 = [GR_Parameter_RESULT]        // Get return result off stack
2798 .restore sp
2799         add   sp = 64,sp                        // Restore stack pointer
2800         mov   b0 = GR_SAVE_B0                   // Restore return address
2801 };;
2802 { .mib
2803         mov   gp = GR_SAVE_GP                  // Restore gp
2804         mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
2805         br.ret.sptk     b0                     // Return
2806 };;
2807
2808 LOCAL_LIBM_END(__libm_error_region#)
2809 .type   __libm_error_support#,@function
2810 .global __libm_error_support#