sysdeps/ia64/fpu/s_expm1.S

   1 .file "exp_m1.s"
   2
   3 // Copyright (C) 2000, 2001, Intel Corporation
   4 // All rights reserved.
   5 //
   6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
   7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
   8 //
   9 // Redistribution and use in source and binary forms, with or without
  10 // modification, are permitted provided that the following conditions are
  11 // met:
  12 //
  13 // * Redistributions of source code must retain the above copyright
  14 // notice, this list of conditions and the following disclaimer.
  15 //
  16 // * Redistributions in binary form must reproduce the above copyright
  17 // notice, this list of conditions and the following disclaimer in the
  18 // documentation and/or other materials provided with the distribution.
  19 //
  20 // * The name of Intel Corporation may not be used to endorse or promote
  21 // products derived from this software without specific prior written
  22 // permission.
  23 //
  24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35 //
  36 // Intel Corporation is the author of this code, and requests that all
  37 // problem reports or change requests be submitted to it directly at
  38 // http://developer.intel.com/opensource.
  39 //
  40 // HISTORY
  41 // 2/02/00  Initial Version
  42 // 4/04/00  Unwind support added
  43 // 8/15/00  Bundle added after call to __libm_error_support to properly
  44 //          set [the previously overwritten] GR_Parameter_RESULT.
  45 //
  46 // *********************************************************************
  47 //
  48 // Function:   Combined exp(x) and expm1(x), where
  49 //                       x
  50 //             exp(x) = e , for double precision x values
  51 //                         x
  52 //             expm1(x) = e  - 1  for double precision x values
  53 //
  54 // *********************************************************************
  55 //
  56 // Accuracy:       Within .7 ulps for 80-bit floating point values
  57 //                 Very accurate for double precision values
  58 //
  59 // *********************************************************************
  60 //
  61 // Resources Used:
  62 //
  63 //    Floating-Point Registers: f8  (Input and Return Value)
  64 //                              f9,f32-f61, f99-f102
  65 //
  66 //    General Purpose Registers:
  67 //      r32-r61
  68 //      r62-r65 (Used to pass arguments to error handling routine)
  69 //
  70 //    Predicate Registers:      p6-p15
  71 //
  72 // *********************************************************************
  73 //
  74 // IEEE Special Conditions:
  75 //
  76 //    Denormal  fault raised on denormal inputs
  77 //    Overflow exceptions raised when appropriate for exp and expm1
  78 //    Underflow exceptions raised when appropriate for exp and expm1
  79 //    (Error Handling Routine called for overflow and Underflow)
  80 //    Inexact raised when appropriate by algorithm
  81 //
  82 //    exp(inf) = inf
  83 //    exp(-inf) = +0
  84 //    exp(SNaN) = QNaN
  85 //    exp(QNaN) = QNaN
  86 //    exp(0) = 1
  87 //    exp(EM_special Values) = QNaN
  88 //    exp(inf) = inf
  89 //    expm1(-inf) = -1
  90 //    expm1(SNaN) = QNaN
  91 //    expm1(QNaN) = QNaN
  92 //    expm1(0) = 0
  93 //    expm1(EM_special Values) = QNaN
  94 //
  95 // *********************************************************************
  96 //
  97 // Implementation and Algorithm Notes:
  98 //
  99 //  ker_exp_64( in_FR  : X,
 100 //            in_GR  : Flag,
 101 //            in_GR  : Expo_Range
 102 //            out_FR : Y_hi,
 103 //            out_FR : Y_lo,
 104 //            out_FR : scale,
 105 //            out_PR : Safe )
 106 //
 107 // On input, X is in register format and
 108 // Flag  = 0 for exp,
 109 // Flag  = 1 for expm1,
 110 //
 111 // On output, provided X and X_cor are real numbers, then
 112 //
 113 //   scale*(Y_hi + Y_lo)  approximates  exp(X)       if Flag is 0
 114 //   scale*(Y_hi + Y_lo)  approximates  exp(X)-1     if Flag is 1
 115 //
 116 // The accuracy is sufficient for a highly accurate 64 sig.
 117 // bit implementation.  Safe is set if there is no danger of
 118 // overflow/underflow when the result is composed from scale,
 119 // Y_hi and Y_lo. Thus, we can have a fast return if Safe is set.
 120 // Otherwise, one must prepare to handle the possible exception
 121 // appropriately.  Note that SAFE not set (false) does not mean
 122 // that overflow/underflow will occur; only the setting of SAFE
 123 // guarantees the opposite.
 124 //
 125 // **** High Level Overview ****
 126 //
 127 // The method consists of three cases.
 128 //
 129 // If           |X| < Tiny      use case exp_tiny;
 130 // else if      |X| < 2^(-6)    use case exp_small;
 131 // else         use case exp_regular;
 132 //
 133 // Case exp_tiny:
 134 //
 135 //   1 + X     can be used to approximate exp(X) or exp(X+X_cor);
 136 //   X + X^2/2 can be used to approximate exp(X) - 1
 137 //
 138 // Case exp_small:
 139 //
 140 //   Here, exp(X), exp(X+X_cor), and exp(X) - 1 can all be
 141 //   appproximated by a relatively simple polynomial.
 142 //
 143 //   This polynomial resembles the truncated Taylor series
 144 //
 145 //      exp(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n!
 146 //
 147 // Case exp_regular:
 148 //
 149 //   Here we use a table lookup method. The basic idea is that in
 150 //   order to compute exp(X), we accurately decompose X into
 151 //
 152 //   X = N * log(2)/(2^12)  + r,        |r| <= log(2)/2^13.
 153 //
 154 //   Hence
 155 //
 156 //   exp(X) = 2^( N / 2^12 ) * exp(r).
 157 //
 158 //   The value 2^( N / 2^12 ) is obtained by simple combinations
 159 //   of values calculated beforehand and stored in table; exp(r)
 160 //   is approximated by a short polynomial because |r| is small.
 161 //
 162 //   We elaborate this method in 4 steps.
 163 //
 164 //   Step 1: Reduction
 165 //
 166 //   The value 2^12/log(2) is stored as a double-extended number
 167 //   L_Inv.
 168 //
 169 //   N := round_to_nearest_integer( X * L_Inv )
 170 //
 171 //   The value log(2)/2^12 is stored as two numbers L_hi and L_lo so
 172 //   that r can be computed accurately via
 173 //
 174 //   r := (X - N*L_hi) - N*L_lo
 175 //
 176 //   We pick L_hi such that N*L_hi is representable in 64 sig. bits
 177 //   and thus the FMA   X - N*L_hi   is error free. So r is the
 178 //   1 rounding error from an exact reduction with respect to
 179 //
 180 //   L_hi + L_lo.
 181 //
 182 //   In particular, L_hi has 30 significant bit and can be stored
 183 //   as a double-precision number; L_lo has 64 significant bits and
 184 //   stored as a double-extended number.
 185 //
 186 //   In the case Flag = 2, we further modify r by
 187 //
 188 //   r := r + X_cor.
 189 //
 190 //   Step 2: Approximation
 191 //
 192 //   exp(r) - 1 is approximated by a short polynomial of the form
 193 //
 194 //   r + A_1 r^2 + A_2 r^3 + A_3 r^4 .
 195 //
 196 //   Step 3: Composition from Table Values
 197 //
 198 //   The value 2^( N / 2^12 ) can be composed from a couple of tables
 199 //   of precalculated values. First, express N as three integers
 200 //   K, M_1, and M_2 as
 201 //
 202 //     N  =  K * 2^12  + M_1 * 2^6 + M_2
 203 //
 204 //   Where 0 <= M_1, M_2 < 2^6; and K can be positive or negative.
 205 //   When N is represented in 2's complement, M_2 is simply the 6
 206 //   lsb's, M_1 is the next 6, and K is simply N shifted right
 207 //   arithmetically (sign extended) by 12 bits.
 208 //
 209 //   Now, 2^( N / 2^12 ) is simply
 210 //
 211 //      2^K * 2^( M_1 / 2^6 ) * 2^( M_2 / 2^12 )
 212 //
 213 //   Clearly, 2^K needs no tabulation. The other two values are less
 214 //   trivial because if we store each accurately to more than working
 215 //   precision, than its product is too expensive to calculate. We
 216 //   use the following method.
 217 //
 218 //   Define two mathematical values, delta_1 and delta_2, implicitly
 219 //   such that
 220 //
 221 //     T_1 = exp( [M_1 log(2)/2^6]  -  delta_1 )
 222 //     T_2 = exp( [M_2 log(2)/2^12] -  delta_2 )
 223 //
 224 //   are representable as 24 significant bits. To illustrate the idea,
 225 //   we show how we define delta_1:
 226 //
 227 //     T_1     := round_to_24_bits( exp( M_1 log(2)/2^6 ) )
 228 //     delta_1  = (M_1 log(2)/2^6) - log( T_1 )
 229 //
 230 //   The last equality means mathematical equality. We then tabulate
 231 //
 232 //     W_1 := exp(delta_1) - 1
 233 //     W_2 := exp(delta_2) - 1
 234 //
 235 //   Both in double precision.
 236 //
 237 //   From the tabulated values T_1, T_2, W_1, W_2, we compose the values
 238 //   T and W via
 239 //
 240 //     T := T_1 * T_2                   ...exactly
 241 //     W := W_1 + (1 + W_1)*W_2
 242 //
 243 //   W approximates exp( delta ) - 1  where delta = delta_1 + delta_2.
 244 //   The mathematical product of T and (W+1) is an accurate representation
 245 //   of 2^(M_1/2^6) * 2^(M_2/2^12).
 246 //
 247 //   Step 4. Reconstruction
 248 //
 249 //   Finally, we can reconstruct exp(X), exp(X) - 1.
 250 //   Because
 251 //
 252 //      X = K * log(2) + (M_1*log(2)/2^6  - delta_1)
 253 //                     + (M_2*log(2)/2^12 - delta_2)
 254 //                     + delta_1 + delta_2 + r          ...accurately
 255 //   We have
 256 //
 257 //      exp(X) ~=~ 2^K * ( T + T*[exp(delta_1+delta_2+r) - 1] )
 258 //             ~=~ 2^K * ( T + T*[exp(delta + r) - 1]         )
 259 //             ~=~ 2^K * ( T + T*[(exp(delta)-1)
 260 //                               + exp(delta)*(exp(r)-1)]   )
 261 //             ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) )
 262 //             ~=~ 2^K * ( Y_hi  +  Y_lo )
 263 //
 264 //   where Y_hi = T  and Y_lo = T*(W + (1+W)*poly(r))
 265 //
 266 //   For exp(X)-1, we have
 267 //
 268 //      exp(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1
 269 //               ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) )
 270 //
 271 //   and we combine Y_hi + Y_lo - 2^(-N)  into the form of two
 272 //   numbers  Y_hi + Y_lo carefully.
 273 //
 274 //   **** Algorithm Details ****
 275 //
 276 //   A careful algorithm must be used to realize the mathematical ideas
 277 //   accurately. We describe each of the three cases. We assume SAFE
 278 //   is preset to be TRUE.
 279 //
 280 //   Case exp_tiny:
 281 //
 282 //   The important points are to ensure an accurate result under
 283 //   different rounding directions and a correct setting of the SAFE
 284 //   flag.
 285 //
 286 //   If Flag is 1, then
 287 //      SAFE  := False  ...possibility of underflow
 288 //      Scale := 1.0
 289 //      Y_hi  := X
 290 //      Y_lo  := 2^(-17000)
 291 //   Else
 292 //      Scale := 1.0
 293 //      Y_hi  := 1.0
 294 //      Y_lo  := X      ...for different rounding modes
 295 //   Endif
 296 //
 297 //   Case exp_small:
 298 //
 299 //   Here we compute a simple polynomial. To exploit parallelism, we split
 300 //   the polynomial into several portions.
 301 //
 302 //   Let r = X
 303 //
 304 //   If Flag is not 1   ...i.e. exp( argument )
 305 //
 306 //      rsq := r * r;
 307 //      r4  := rsq*rsq
 308 //      poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6))
 309 //      poly_hi := r + rsq*(P_1 + r*P_2)
 310 //      Y_lo    := poly_hi + r4 * poly_lo
 311 //      set lsb(Y_lo) to 1
 312 //      Y_hi    := 1.0
 313 //      Scale   := 1.0
 314 //
 315 //   Else                       ...i.e. exp( argument ) - 1
 316 //
 317 //      rsq := r * r
 318 //      r4  := rsq * rsq
 319 //      r6  := rsq * r4
 320 //      poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7))
 321 //      poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4))
 322 //      Y_lo    := rsq*poly_hi +  poly_lo
 323 //      set lsb(Y_lo) to 1
 324 //      Y_hi    := X
 325 //      Scale   := 1.0
 326 //
 327 //   Endif
 328 //
 329 //  Case exp_regular:
 330 //
 331 //  The previous description contain enough information except the
 332 //  computation of poly and the final Y_hi and Y_lo in the case for
 333 //  exp(X)-1.
 334 //
 335 //  The computation of poly for Step 2:
 336 //
 337 //   rsq := r*r
 338 //   poly := r + rsq*(A_1 + r*(A_2 + r*A_3))
 339 //
 340 //  For the case exp(X) - 1, we need to incorporate 2^(-K) into
 341 //  Y_hi and Y_lo at the end of Step 4.
 342 //
 343 //   If K > 10 then
 344 //      Y_lo := Y_lo - 2^(-K)
 345 //   Else
 346 //      If K < -10 then
 347 //       Y_lo := Y_hi + Y_lo
 348 //       Y_hi := -2^(-K)
 349 //      Else
 350 //       Y_hi := Y_hi - 2^(-K)
 351 //      End If
 352 //   End If
 353 //
 354
 355 #include "libm_support.h"
 356
 357 GR_SAVE_PFS          = r59
 358 GR_SAVE_B0           = r60
 359 GR_SAVE_GP           = r61
 360
 361 GR_Parameter_X       = r62
 362 GR_Parameter_Y       = r63
 363 GR_Parameter_RESULT  = r64
 364
 365 FR_X             = f9
 366 FR_Y             = f1
 367 FR_RESULT        = f99
 368
 369 #ifdef _LIBC
 370 .rodata
 371 #else
 372 .data
 373 #endif
 374
 375 .align 64
 376 Constants_exp_64_Arg:
 377 ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object)
 378 data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000
 379 data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000
 380 data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000
 381 // /* Inv_L, L_hi, L_lo */
 382 ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg)
 383
 384 .align 64
 385 Constants_exp_64_Exponents:
 386 ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object)
 387 data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF
 388 data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF
 389 data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
 390 data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
 391 data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF
 392 data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF
 393 ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents)
 394
 395 .align 64
 396 Constants_exp_64_A:
 397 ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object)
 398 data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000
 399 data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000
 400 data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000
 401 // /* Reversed */
 402 ASM_SIZE_DIRECTIVE(Constants_exp_64_A)
 403
 404 .align 64
 405 Constants_exp_64_P:
 406 ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object)
 407 data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000
 408 data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000
 409 data4 0x7474C518,0x88888888,0x00003FF8,0x00000000
 410 data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000
 411 data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000
 412 data4 0x000004C7,0x80000000,0x00003FFE,0x00000000
 413 // /* Reversed */
 414 ASM_SIZE_DIRECTIVE(Constants_exp_64_P)
 415
 416 .align 64
 417 Constants_exp_64_Q:
 418 ASM_TYPE_DIRECTIVE(Constants_exp_64_Q,@object)
 419 data4 0xA49EF6CA,0xD00D56F7,0x00003FEF,0x00000000
 420 data4 0x1C63493D,0xD00D59AB,0x00003FF2,0x00000000
 421 data4 0xFB50CDD2,0xB60B60B5,0x00003FF5,0x00000000
 422 data4 0x7BA68DC8,0x88888888,0x00003FF8,0x00000000
 423 data4 0xAAAAAC8D,0xAAAAAAAA,0x00003FFA,0x00000000
 424 data4 0xAAAAACCA,0xAAAAAAAA,0x00003FFC,0x00000000
 425 data4 0x00000000,0x80000000,0x00003FFE,0x00000000
 426 // /* Reversed */
 427 ASM_SIZE_DIRECTIVE(Constants_exp_64_Q)
 428
 429 .align 64
 430 Constants_exp_64_T1:
 431 ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object)
 432 data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
 433 data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
 434 data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
 435 data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
 436 data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
 437 data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516
 438 data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A
 439 data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4
 440 data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B
 441 data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD
 442 data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15
 443 data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B
 444 data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5
 445 data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A
 446 data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177
 447 data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C
 448 ASM_SIZE_DIRECTIVE(Constants_exp_64_T1)
 449
 450 .align 64
 451 Constants_exp_64_T2:
 452 ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object)
 453 data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
 454 data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
 455 data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
 456 data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
 457 data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
 458 data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
 459 data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
 460 data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
 461 data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
 462 data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
 463 data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
 464 data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
 465 data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
 466 data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
 467 data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
 468 data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
 469 ASM_SIZE_DIRECTIVE(Constants_exp_64_T2)
 470
 471 .align 64
 472 Constants_exp_64_W1:
 473 ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object)
 474 data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454
 475 data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6
 476 data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA
 477 data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50
 478 data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2
 479 data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE
 480 data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B
 481 data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04
 482 data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419
 483 data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376
 484 data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A
 485 data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB
 486 data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E
 487 data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA
 488 data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08
 489 data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B
 490 data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75
 491 data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79
 492 data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7
 493 data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087
 494 data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB
 495 data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643
 496 data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C
 497 data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D
 498 data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873
 499 data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F
 500 data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861
 501 data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0
 502 data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC
 503 data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB
 504 data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB
 505 data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148
 506 ASM_SIZE_DIRECTIVE(Constants_exp_64_W1)
 507
 508 .align 64
 509 Constants_exp_64_W2:
 510 ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object)
 511 data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25
 512 data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8
 513 data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A
 514 data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E
 515 data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9
 516 data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2
 517 data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0
 518 data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509
 519 data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33
 520 data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D
 521 data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87
 522 data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3
 523 data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9
 524 data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F
 525 data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82
 526 data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4
 527 data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D
 528 data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030
 529 data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29
 530 data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED
 531 data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B
 532 data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893
 533 data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35
 534 data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C
 535 data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313
 536 data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE
 537 data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426
 538 data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550
 539 data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4
 540 data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31
 541 data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE
 542 data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9
 543 ASM_SIZE_DIRECTIVE(Constants_exp_64_W2)
 544
 545 .section .text
 546 .proc expm1#
 547 .global expm1#
 548 .align 64
 549
 550 expm1:
 551 #ifdef _LIBC
 552 .global __expm1#
 553 __expm1:
 554 #endif
 555
 556
 557 { .mii
 558       alloc r32 = ar.pfs,0,30,4,0
 559 (p0)  add r33 = 1, r0
 560 (p0)  cmp.eq.unc  p7, p0 =  r0, r0
 561 }
 562 ;;
 563
 564
 565 //
 566 //    Set p7 true for expm1
 567 //    Set Flag = r33 = 1 for expm1
 568 //    These are really no longer necesary, but are a remnant
 569 //       when this file had multiple entry points.
 570 //       They should be carefully removed
 571
 572
 573
 574 { .mfi
 575 (p0)  add r32 = 1,r0
 576 (p0)  fnorm.s1 f9 = f8
 577       nop.i 999
 578 }
 579
 580
 581 { .mfi
 582       nop.m 999
 583 (p0)  fclass.m.unc p6, p8 =  f8, 0x1E7
 584       nop.i 999
 585 }
 586
 587 { .mfi
 588       nop.m 999
 589 (p0)  fclass.nm.unc p9, p0 =  f8, 0x1FF
 590       nop.i 999
 591 }
 592
 593 { .mfi
 594         nop.m 999
 595 (p0)  mov f36 = f1
 596         nop.i 999 ;;
 597 }
 598
 599 //
 600 //    Identify NatVals, NaNs, Infs, and Zeros.
 601 //    Identify EM unsupporteds.
 602 //    Save special input registers
 603 //
 604 //    Create FR_X_cor      = 0.0
 605 //           GR_Flag       = 0
 606 //           GR_Expo_Range = 1
 607 //           FR_Scale      = 1.0
 608 //
 609
 610 { .mfb
 611         nop.m 999
 612 (p0)  mov f32 = f0
 613 (p6)  br.cond.spnt EXP_64_SPECIAL ;;
 614 }
 615
 616 { .mib
 617         nop.m 999
 618         nop.i 999
 619 (p9)  br.cond.spnt EXP_64_UNSUPPORTED ;;
 620 }
 621
 622 //
 623 //    Branch out for special input values
 624 //
 625
 626 { .mfi
 627 (p0)  cmp.ne.unc p12, p13 = 0x01, r33
 628 (p0)  fcmp.lt.unc.s0 p9,p0 =  f8, f0
 629 (p0)  cmp.eq.unc  p15, p0 =  r0, r0
 630 }
 631
 632 //
 633 //    Raise possible denormal operand exception
 634 //    Normalize x
 635 //
 636 //    This function computes exp( x  + x_cor)
 637 //    Input  FR 1: FR_X
 638 //    Input  FR 2: FR_X_cor
 639 //    Input  GR 1: GR_Flag
 640 //    Input  GR 2: GR_Expo_Range
 641 //    Output FR 3: FR_Y_hi
 642 //    Output FR 4: FR_Y_lo
 643 //    Output FR 5: FR_Scale
 644 //    Output PR 1: PR_Safe
 645
 646 //
 647 //    Prepare to load constants
 648 //    Set Safe = True
 649 //
 650
 651 { .mmi
 652 (p0)  addl           r34   = @ltoff(Constants_exp_64_Arg#), gp
 653 (p0)  addl           r40   = @ltoff(Constants_exp_64_W1#),  gp
 654 (p0)  addl           r41   = @ltoff(Constants_exp_64_W2#),  gp
 655 }
 656 ;;
 657
 658 { .mmi
 659       ld8 r34 = [r34]
 660       ld8 r40 = [r40]
 661 (p0)  addl           r50   = @ltoff(Constants_exp_64_T1#),  gp
 662 }
 663 ;;
 664
 665
 666 { .mmi
 667       ld8 r41  = [r41]
 668 (p0)  ldfe f37 = [r34],16
 669 (p0)  addl           r51   = @ltoff(Constants_exp_64_T2#),  gp
 670 }
 671 ;;
 672
 673 //
 674 //    N = fcvt.fx(float_N)
 675 //    Set p14 if -6 > expo_X
 676 //
 677
 678
 679 //
 680 //    Bias = 0x0FFFF
 681 //    expo_X = expo_X and Mask
 682 //
 683
 684 //
 685 //    Load L_lo
 686 //    Set p10 if 14 < expo_X
 687 //
 688
 689 { .mmi
 690       ld8  r50 = [r50]
 691 (p0)  ldfe f40 = [r34],16
 692       nop.i 999
 693 }
 694 ;;
 695
 696 { .mlx
 697         nop.m 999
 698 (p0)  movl r58 = 0x0FFFF
 699 }
 700 ;;
 701
 702 //
 703 //    Load W2_ptr
 704 //    Branch to SMALL is expo_X < -6
 705 //
 706
 707 //
 708 //    float_N = X * L_Inv
 709 //    expo_X = exponent of X
 710 //    Mask = 0x1FFFF
 711 //
 712
 713 { .mmi
 714       ld8  r51 = [r51]
 715 (p0)  ldfe f41 = [r34],16
 716 }
 717 ;;
 718
 719 { .mlx
 720 (p0)  addl           r34   = @ltoff(Constants_exp_64_Exponents#),  gp
 721 (p0)  movl r39 = 0x1FFFF
 722 }
 723 ;;
 724
 725 { .mmi
 726       ld8  r34 = [r34]
 727 (p0)  getf.exp r37 = f9
 728       nop.i 999
 729 }
 730 ;;
 731
 732 { .mii
 733       nop.m 999
 734       nop.i 999
 735 (p0)  and  r37 = r37, r39 ;;
 736 }
 737
 738 { .mmi
 739 (p0)  sub r37 = r37, r58 ;;
 740 (p0)  cmp.gt.unc  p14, p0 =  -6, r37
 741 (p0)  cmp.lt.unc  p10, p0 =  14, r37 ;;
 742 }
 743
 744 { .mfi
 745         nop.m 999
 746 //
 747 //    Load L_inv
 748 //    Set p12 true for Flag = 0 (exp)
 749 //    Set p13 true for Flag = 1 (expm1)
 750 //
 751 (p0)  fmpy.s1 f38 = f9, f37
 752         nop.i 999 ;;
 753 }
 754
 755 { .mfb
 756         nop.m 999
 757 //
 758 //    Load L_hi
 759 //    expo_X = expo_X - Bias
 760 //    get W1_ptr
 761 //
 762 (p0)  fcvt.fx.s1 f39 = f38
 763 (p14) br.cond.spnt EXP_SMALL ;;
 764 }
 765
 766 { .mib
 767         nop.m 999
 768         nop.i 999
 769 (p10) br.cond.spnt EXP_HUGE ;;
 770 }
 771
 772 { .mmi
 773 (p0)  shladd r34 = r32,4,r34
 774 (p0)  addl           r35   = @ltoff(Constants_exp_64_A#), gp
 775       nop.i 999
 776 }
 777 ;;
 778
 779 { .mmi
 780       ld8  r35 = [r35]
 781       nop.m 999
 782       nop.i 999
 783 }
 784 ;;
 785
 786 //
 787 //    Load T_1,T_2
 788 //
 789
 790 { .mmb
 791 (p0)  ldfe f51 = [r35],16
 792 (p0)  ld8 r45 = [r34],8
 793         nop.b 999 ;;
 794 }
 795 //
 796 //    Set Safe = True  if k >= big_expo_neg
 797 //    Set Safe = False if k < big_expo_neg
 798 //
 799
 800 { .mmb
 801 (p0)  ldfe f49 = [r35],16
 802 (p0)  ld8 r48 = [r34],0
 803         nop.b 999 ;;
 804 }
 805
 806 { .mfi
 807         nop.m 999
 808 //
 809 //    Branch to HUGE is expo_X > 14
 810 //
 811 (p0)  fcvt.xf f38 = f39
 812         nop.i 999 ;;
 813 }
 814
 815 { .mfi
 816 (p0)  getf.sig r52 = f39
 817         nop.f 999
 818         nop.i 999 ;;
 819 }
 820
 821 { .mii
 822         nop.m 999
 823 (p0)  extr.u r43 = r52, 6, 6 ;;
 824 //
 825 //    r = r - float_N * L_lo
 826 //    K = extr(N_fix,12,52)
 827 //
 828 (p0)  shladd r40 = r43,3,r40 ;;
 829 }
 830
 831 { .mfi
 832 (p0)  shladd r50 = r43,2,r50
 833 (p0)  fnma.s1 f42 = f40, f38, f9
 834 //
 835 //    float_N = float(N)
 836 //    N_fix = signficand N
 837 //
 838 (p0)  extr.u r42 = r52, 0, 6
 839 }
 840
 841 { .mmi
 842 (p0)  ldfd  f43 = [r40],0 ;;
 843 (p0)  shladd r41 = r42,3,r41
 844 (p0)  shladd r51 = r42,2,r51
 845 }
 846 //
 847 //    W_1_p1 = 1 + W_1
 848 //
 849
 850 { .mmi
 851 (p0)  ldfs  f44 = [r50],0 ;;
 852 (p0)  ldfd  f45 = [r41],0
 853 //
 854 //    M_2 = extr(N_fix,0,6)
 855 //    M_1 = extr(N_fix,6,6)
 856 //    r = X - float_N * L_hi
 857 //
 858 (p0)  extr r44 = r52, 12, 52
 859 }
 860
 861 { .mmi
 862 (p0)  ldfs  f46 = [r51],0 ;;
 863 (p0)  sub r46 = r58, r44
 864 (p0)  cmp.gt.unc  p8, p15 =  r44, r45
 865 }
 866 //
 867 //    W = W_1 + W_1_p1*W_2
 868 //    Load  A_2
 869 //    Bias_m_K = Bias - K
 870 //
 871
 872 { .mii
 873 (p0)  ldfe f40 = [r35],16
 874 //
 875 //    load A_1
 876 //    poly = A_2 + r*A_3
 877 //    rsq = r * r
 878 //    neg_2_mK = exponent of Bias_m_k
 879 //
 880 (p0)  add r47 = r58, r44 ;;
 881 //
 882 //    Set Safe = True  if k <= big_expo_pos
 883 //    Set Safe = False  if k >  big_expo_pos
 884 //    Load A_3
 885 //
 886 (p15) cmp.lt p8,p15 = r44,r48 ;;
 887 }
 888
 889 { .mmf
 890 (p0)  setf.exp f61 = r46
 891 //
 892 //    Bias_p + K = Bias + K
 893 //    T = T_1 * T_2
 894 //
 895 (p0)  setf.exp f36 = r47
 896 (p0)  fnma.s1 f42 = f41, f38, f42 ;;
 897 }
 898
 899 { .mfi
 900         nop.m 999
 901 //
 902 //    Load W_1,W_2
 903 //    Load big_exp_pos, load big_exp_neg
 904 //
 905 (p0)  fadd.s1 f47 = f43, f1
 906         nop.i 999 ;;
 907 }
 908
 909 { .mfi
 910         nop.m 999
 911 (p0)  fma.s1 f52 = f42, f51, f49
 912         nop.i 999
 913 }
 914
 915 { .mfi
 916         nop.m 999
 917 (p0)  fmpy.s1 f48 = f42, f42
 918         nop.i 999 ;;
 919 }
 920
 921 { .mfi
 922         nop.m 999
 923 (p0)  fmpy.s1 f53 = f44, f46
 924         nop.i 999 ;;
 925 }
 926
 927 { .mfi
 928         nop.m 999
 929 (p0)  fma.s1 f54 = f45, f47, f43
 930         nop.i 999
 931 }
 932
 933 { .mfi
 934         nop.m 999
 935 (p0)  fneg f61 =  f61
 936         nop.i 999 ;;
 937 }
 938
 939 { .mfi
 940         nop.m 999
 941 (p0)  fma.s1 f52 = f42, f52, f40
 942         nop.i 999 ;;
 943 }
 944
 945 { .mfi
 946         nop.m 999
 947 (p0)  fadd.s1 f55 = f54, f1
 948         nop.i 999
 949 }
 950
 951 { .mfi
 952         nop.m 999
 953 //
 954 //    W + Wp1 * poly
 955 //
 956 (p0)  mov f34 = f53
 957         nop.i 999 ;;
 958 }
 959
 960 { .mfi
 961         nop.m 999
 962 //
 963 //    A_1 + r * poly
 964 //    Scale = setf_exp(Bias_p_k)
 965 //
 966 (p0)  fma.s1 f52 = f48, f52, f42
 967         nop.i 999 ;;
 968 }
 969
 970 { .mfi
 971         nop.m 999
 972 //
 973 //    poly = r + rsq(A_1 + r*poly)
 974 //    Wp1 = 1 + W
 975 //    neg_2_mK = -neg_2_mK
 976 //
 977 (p0)  fma.s1 f35 = f55, f52, f54
 978         nop.i 999 ;;
 979 }
 980
 981 { .mfb
 982         nop.m 999
 983 (p0)  fmpy.s1 f35 = f35, f53
 984 //
 985 //    Y_hi = T
 986 //    Y_lo = T * (W + Wp1*poly)
 987 //
 988 (p12) br.cond.sptk EXP_MAIN ;;
 989 }
 990 //
 991 //    Branch if exp(x)
 992 //    Continue for exp(x-1)
 993 //
 994
 995 { .mii
 996 (p0)  cmp.lt.unc  p12, p13 =  10, r44
 997         nop.i 999 ;;
 998 //
 999 //    Set p12 if 10 < K, Else p13
1000 //
1001 (p13) cmp.gt.unc  p13, p14 =  -10, r44 ;;
1002 }
1003 //
1004 //    K > 10:  Y_lo = Y_lo + neg_2_mK
1005 //    K <=10:  Set p13 if -10 > K, Else set p14
1006 //
1007
1008 { .mfi
1009 (p13) cmp.eq  p15, p0 =  r0, r0
1010 (p14) fadd.s1 f34 = f61, f34
1011         nop.i 999 ;;
1012 }
1013
1014 { .mfi
1015         nop.m 999
1016 (p12) fadd.s1 f35 = f35, f61
1017         nop.i 999 ;;
1018 }
1019
1020 { .mfi
1021         nop.m 999
1022 (p13) fadd.s1 f35 = f35, f34
1023         nop.i 999
1024 }
1025
1026 { .mfb
1027         nop.m 999
1028 //
1029 //    K <= 10 and K < -10, Set Safe = True
1030 //    K <= 10 and K < 10,   Y_lo = Y_hi + Y_lo
1031 //    K <= 10 and K > =-10, Y_hi = Y_hi + neg_2_mk
1032 //
1033 (p13) mov f34 = f61
1034 (p0)  br.cond.sptk EXP_MAIN ;;
1035 }
1036 EXP_SMALL:
1037
1038 { .mmi
1039 (p12)  addl           r35   = @ltoff(Constants_exp_64_P#), gp
1040 (p0)   addl           r34   = @ltoff(Constants_exp_64_Exponents#), gp
1041       nop.i 999
1042 }
1043 ;;
1044
1045 { .mmi
1046 (p12) ld8 r35 = [r35]
1047       ld8 r34 = [r34]
1048       nop.i 999
1049 }
1050 ;;
1051
1052
1053 { .mmi
1054 (p13)  addl           r35   = @ltoff(Constants_exp_64_Q#), gp
1055        nop.m 999
1056        nop.i 999
1057 }
1058 ;;
1059
1060
1061 //
1062 //    Return
1063 //    K <= 10 and K < 10,   Y_hi = neg_2_mk
1064 //
1065 //    /*******************************************************/
1066 //    /*********** Branch EXP_SMALL  *************************/
1067 //    /*******************************************************/
1068
1069 { .mfi
1070 (p13) ld8 r35 = [r35]
1071 (p0)  mov f42 = f9
1072 (p0)  add r34 = 0x48,r34
1073 }
1074 ;;
1075
1076 //
1077 //    Flag = 0
1078 //    r4 = rsq * rsq
1079 //
1080
1081 { .mfi
1082 (p0)  ld8 r49 =[r34],0
1083         nop.f 999
1084         nop.i 999 ;;
1085 }
1086
1087 { .mii
1088         nop.m 999
1089         nop.i 999 ;;
1090 //
1091 //    Flag = 1
1092 //
1093 (p0)  cmp.lt.unc  p14, p0 =  r37, r49 ;;
1094 }
1095
1096 { .mfi
1097         nop.m 999
1098 //
1099 //    r = X
1100 //
1101 (p0)  fmpy.s1 f48 = f42, f42
1102         nop.i 999 ;;
1103 }
1104
1105 { .mfb
1106         nop.m 999
1107 //
1108 //    rsq = r * r
1109 //
1110 (p0)  fmpy.s1 f50 = f48, f48
1111 //
1112 //    Is input very small?
1113 //
1114 (p14) br.cond.spnt EXP_VERY_SMALL ;;
1115 }
1116 //
1117 //    Flag_not1: Y_hi = 1.0
1118 //    Flag is 1: r6 = rsq * r4
1119 //
1120
1121 { .mfi
1122 (p12) ldfe f52 = [r35],16
1123 (p12) mov f34 = f1
1124 (p0)  add r53 = 0x1,r0 ;;
1125 }
1126
1127 { .mfi
1128 (p13) ldfe f51 = [r35],16
1129 //
1130 //    Flag_not_1: Y_lo = poly_hi + r4 * poly_lo
1131 //
1132 (p13) mov f34 = f9
1133         nop.i 999 ;;
1134 }
1135
1136 { .mmf
1137 (p12) ldfe f53 = [r35],16
1138 //
1139 //    For Flag_not_1, Y_hi = X
1140 //    Scale = 1
1141 //    Create 0x000...01
1142 //
1143 (p0)  setf.sig f37 = r53
1144 (p0)  mov f36 = f1 ;;
1145 }
1146
1147 { .mmi
1148 (p13) ldfe f52 = [r35],16 ;;
1149 (p12) ldfe f54 = [r35],16
1150         nop.i 999 ;;
1151 }
1152
1153 { .mfi
1154 (p13) ldfe f53 = [r35],16
1155 (p13) fmpy.s1 f58 = f48, f50
1156         nop.i 999 ;;
1157 }
1158 //
1159 //    Flag_not1: poly_lo = P_5 + r*P_6
1160 //    Flag_1: poly_lo = Q_6 + r*Q_7
1161 //
1162
1163 { .mmi
1164 (p13) ldfe f54 = [r35],16 ;;
1165 (p12) ldfe f55 = [r35],16
1166         nop.i 999 ;;
1167 }
1168
1169 { .mmi
1170 (p12) ldfe f56 = [r35],16 ;;
1171 (p13) ldfe f55 = [r35],16
1172         nop.i 999 ;;
1173 }
1174
1175 { .mmi
1176 (p12) ldfe f57 = [r35],0 ;;
1177 (p13) ldfe f56 = [r35],16
1178         nop.i 999 ;;
1179 }
1180
1181 { .mfi
1182 (p13) ldfe f57 = [r35],0
1183         nop.f 999
1184         nop.i 999 ;;
1185 }
1186
1187 { .mfi
1188         nop.m 999
1189 //
1190 //    For  Flag_not_1, load p5,p6,p1,p2
1191 //    Else load p5,p6,p1,p2
1192 //
1193 (p12) fma.s1 f60 = f52, f42, f53
1194         nop.i 999 ;;
1195 }
1196
1197 { .mfi
1198         nop.m 999
1199 (p13) fma.s1 f60 = f51, f42, f52
1200         nop.i 999 ;;
1201 }
1202
1203 { .mfi
1204         nop.m 999
1205 (p12) fma.s1 f60 = f60, f42, f54
1206         nop.i 999 ;;
1207 }
1208
1209 { .mfi
1210         nop.m 999
1211 (p12) fma.s1 f59 = f56, f42, f57
1212         nop.i 999 ;;
1213 }
1214
1215 { .mfi
1216         nop.m 999
1217 (p13) fma.s1 f60 = f42, f60, f53
1218         nop.i 999 ;;
1219 }
1220
1221 { .mfi
1222         nop.m 999
1223 (p12) fma.s1 f59 = f59, f48, f42
1224         nop.i 999 ;;
1225 }
1226
1227 { .mfi
1228         nop.m 999
1229 //
1230 //    Flag_1: poly_lo = Q_5 + r*(Q_6 + r*Q_7)
1231 //    Flag_not1: poly_lo = P_4 + r*(P_5 + r*P_6)
1232 //    Flag_not1: poly_hi = (P_1 + r*P_2)
1233 //
1234 (p13) fmpy.s1 f60 = f60, f58
1235         nop.i 999 ;;
1236 }
1237
1238 { .mfi
1239         nop.m 999
1240 (p12) fma.s1 f60 = f60, f42, f55
1241         nop.i 999 ;;
1242 }
1243
1244 { .mfi
1245         nop.m 999
1246 //
1247 //    Flag_1: poly_lo = r6 *(Q_5 + ....)
1248 //    Flag_not1: poly_hi =  r + rsq *(P_1 + r*P_2)
1249 //
1250 (p12) fma.s1 f35 = f60, f50, f59
1251         nop.i 999
1252 }
1253
1254 { .mfi
1255         nop.m 999
1256 (p13) fma.s1 f59 = f54, f42, f55
1257         nop.i 999 ;;
1258 }
1259
1260 { .mfi
1261         nop.m 999
1262 //
1263 //    Flag_not1: Y_lo = rsq* poly_hi + poly_lo
1264 //    Flag_1: poly_lo = rsq* poly_hi + poly_lo
1265 //
1266 (p13) fma.s1 f59 = f59, f42, f56
1267         nop.i 999 ;;
1268 }
1269
1270 { .mfi
1271         nop.m 999
1272 //
1273 //    Flag_not_1: (P_1 + r*P_2)
1274 //
1275 (p13) fma.s1 f59 = f59, f42, f57
1276         nop.i 999 ;;
1277 }
1278
1279 { .mfi
1280         nop.m 999
1281 //
1282 //    Flag_not_1: poly_hi = r + rsq * (P_1 + r*P_2)
1283 //
1284 (p13) fma.s1 f35 = f59, f48, f60
1285         nop.i 999 ;;
1286 }
1287
1288 { .mfi
1289         nop.m 999
1290 //
1291 //    Create 0.000...01
1292 //
1293 (p0)  for f37 = f35, f37
1294         nop.i 999 ;;
1295 }
1296
1297 { .mfb
1298         nop.m 999
1299 //
1300 //    Set lsb of Y_lo to 1
1301 //
1302 (p0)  fmerge.se f35 = f35,f37
1303 (p0)  br.cond.sptk EXP_MAIN ;;
1304 }
1305 EXP_VERY_SMALL:
1306
1307 { .mmi
1308       nop.m 999
1309 (p13) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp
1310       nop.i 999;;
1311 }
1312
1313 { .mfi
1314 (p13) ld8  r34 = [r34];
1315 (p12) mov f35 = f9
1316       nop.i 999 ;;
1317 }
1318
1319 { .mfb
1320         nop.m 999
1321 (p12) mov f34 = f1
1322 (p12) br.cond.sptk EXP_MAIN ;;
1323 }
1324
1325 { .mlx
1326 (p13) add  r34 = 8,r34
1327 (p13) movl r39 = 0x0FFFE ;;
1328 }
1329 //
1330 //    Load big_exp_neg
1331 //    Create 1/2's exponent
1332 //
1333
1334 { .mii
1335 (p13) setf.exp f56 = r39
1336 (p13) shladd r34 = r32,4,r34 ;;
1337         nop.i 999
1338 }
1339 //
1340 //    Negative exponents are stored after positive
1341 //
1342
1343 { .mfi
1344 (p13) ld8 r45 = [r34],0
1345 //
1346 //    Y_hi = x
1347 //    Scale = 1
1348 //
1349 (p13) fmpy.s1 f35 = f9, f9
1350         nop.i 999 ;;
1351 }
1352
1353 { .mfi
1354         nop.m 999
1355 //
1356 //    Reset Safe if necessary
1357 //    Create 1/2
1358 //
1359 (p13) mov f34 = f9
1360         nop.i 999 ;;
1361 }
1362
1363 { .mfi
1364 (p13) cmp.lt.unc  p0, p15 =  r37, r45
1365 (p13) mov f36 = f1
1366         nop.i 999 ;;
1367 }
1368
1369 { .mfb
1370         nop.m 999
1371 //
1372 //    Y_lo = x * x
1373 //
1374 (p13) fmpy.s1 f35 = f35, f56
1375 //
1376 //    Y_lo = x*x/2
1377 //
1378 (p13) br.cond.sptk EXP_MAIN ;;
1379 }
1380 EXP_HUGE:
1381
1382 { .mfi
1383         nop.m 999
1384 (p0)  fcmp.gt.unc.s1 p14, p0 =  f9, f0
1385         nop.i 999
1386 }
1387
1388 { .mlx
1389         nop.m 999
1390 (p0)  movl r39 = 0x15DC0 ;;
1391 }
1392
1393 { .mfi
1394 (p14) setf.exp f34 = r39
1395 (p14) mov f35 = f1
1396 (p14) cmp.eq  p0, p15 =  r0, r0 ;;
1397 }
1398
1399 { .mfb
1400         nop.m 999
1401 (p14) mov f36 = f34
1402 //
1403 //    If x > 0, Set Safe = False
1404 //    If x > 0, Y_hi = 2**(24,000)
1405 //    If x > 0, Y_lo = 1.0
1406 //    If x > 0, Scale = 2**(24,000)
1407 //
1408 (p14) br.cond.sptk EXP_MAIN ;;
1409 }
1410
1411 { .mlx
1412         nop.m 999
1413 (p12) movl r39 = 0xA240
1414 }
1415
1416 { .mlx
1417         nop.m 999
1418 (p12) movl r38 = 0xA1DC ;;
1419 }
1420
1421 { .mmb
1422 (p13) cmp.eq  p15, p14 =  r0, r0
1423 (p12) setf.exp f34 = r39
1424         nop.b 999 ;;
1425 }
1426
1427 { .mlx
1428 (p12) setf.exp f35 = r38
1429 (p13) movl r39 = 0xFF9C
1430 }
1431
1432 { .mfi
1433         nop.m 999
1434 (p13) fsub.s1 f34 = f0, f1
1435         nop.i 999 ;;
1436 }
1437
1438 { .mfi
1439         nop.m 999
1440 (p12) mov f36 = f34
1441 (p12) cmp.eq  p0, p15 =  r0, r0 ;;
1442 }
1443
1444 { .mfi
1445 (p13) setf.exp f35 = r39
1446 (p13) mov f36 = f1
1447         nop.i 999 ;;
1448 }
1449 EXP_MAIN:
1450
1451 { .mfi
1452 (p0)  cmp.ne.unc p12, p0 = 0x01, r33
1453 (p0)  fmpy.s1 f101 = f36, f35
1454         nop.i 999 ;;
1455 }
1456
1457 { .mfb
1458         nop.m 999
1459 (p0)  fma.d.s0 f99 = f34, f36, f101
1460 (p15)  br.cond.sptk EXP_64_RETURN;;
1461 }
1462
1463 { .mfi
1464         nop.m 999
1465 (p0)  fsetc.s3 0x7F,0x01
1466         nop.i 999
1467 }
1468
1469 { .mlx
1470         nop.m 999
1471 (p0)  movl r50 = 0x000000000103FF ;;
1472 }
1473 //
1474 //    S0 user supplied status
1475 //    S2 user supplied status + WRE + TD  (Overflows)
1476 //    S3 user supplied status + RZ + TD   (Underflows)
1477 //
1478 //
1479 //    If (Safe) is true, then
1480 //        Compute result using user supplied status field.
1481 //        No overflow or underflow here, but perhaps inexact.
1482 //        Return
1483 //    Else
1484 //       Determine if overflow or underflow  was raised.
1485 //       Fetch +/- overflow threshold for IEEE single, double,
1486 //       double extended
1487 //
1488
1489 { .mfi
1490 (p0)  setf.exp f60 = r50
1491 (p0)  fma.d.s3 f102 = f34, f36, f101
1492         nop.i 999
1493 }
1494
1495 { .mfi
1496         nop.m 999
1497 (p0)  fsetc.s3 0x7F,0x40
1498         nop.i 999 ;;
1499 }
1500
1501 { .mfi
1502         nop.m 999
1503 //
1504 //    For Safe, no need to check for over/under.
1505 //    For expm1, handle errors like exp.
1506 //
1507 (p0)  fsetc.s2 0x7F,0x42
1508         nop.i 999;;
1509 }
1510
1511 { .mfi
1512         nop.m 999
1513 (p0)  fma.d.s2 f100 = f34, f36, f101
1514         nop.i 999 ;;
1515 }
1516
1517 { .mfi
1518         nop.m 999
1519 (p0)  fsetc.s2 0x7F,0x40
1520         nop.i 999 ;;
1521 }
1522
1523 { .mfi
1524         nop.m 999
1525 (p7)  fclass.m.unc   p12, p0 =  f102, 0x00F
1526         nop.i 999
1527 }
1528
1529 { .mfi
1530         nop.m 999
1531 (p0)  fclass.m.unc   p11, p0 =  f102, 0x00F
1532         nop.i 999 ;;
1533 }
1534
1535 { .mfi
1536         nop.m 999
1537 (p7)  fcmp.ge.unc.s1 p10, p0 =  f100, f60
1538         nop.i 999
1539 }
1540
1541 { .mfi
1542         nop.m 999
1543 //
1544 //    Create largest double exponent + 1.
1545 //    Create smallest double exponent - 1.
1546 //
1547 (p0)  fcmp.ge.unc.s1 p8, p0 =  f100, f60
1548         nop.i 999 ;;
1549 }
1550 //
1551 //    fcmp:   resultS2 >= + overflow threshold  -> set (a) if true
1552 //    fcmp:   resultS2 <= - overflow threshold  -> set (b) if true
1553 //    fclass: resultS3 is denorm/unorm/0        -> set (d) if true
1554 //
1555
1556 { .mib
1557 (p10) mov   r65 = 41
1558         nop.i 999
1559 (p10) br.cond.sptk __libm_error_region ;;
1560 }
1561
1562 { .mib
1563 (p8)  mov   r65 = 14
1564         nop.i 999
1565 (p8)  br.cond.sptk __libm_error_region ;;
1566 }
1567 //
1568 //    Report that exp overflowed
1569 //
1570
1571 { .mib
1572 (p12) mov   r65 = 42
1573         nop.i 999
1574 (p12) br.cond.sptk __libm_error_region ;;
1575 }
1576
1577 { .mib
1578 (p11) mov   r65 = 15
1579         nop.i 999
1580 (p11) br.cond.sptk __libm_error_region ;;
1581 }
1582
1583 { .mib
1584         nop.m 999
1585         nop.i 999
1586 //
1587 //    Report that exp underflowed
1588 //
1589 (p0)  br.cond.sptk EXP_64_RETURN;;
1590 }
1591 EXP_64_SPECIAL:
1592
1593 { .mfi
1594         nop.m 999
1595 (p0)  fclass.m.unc p6,  p0 =  f8, 0x0c3
1596         nop.i 999
1597 }
1598
1599 { .mfi
1600         nop.m 999
1601 (p0)  fclass.m.unc p13, p8 =  f8, 0x007
1602         nop.i 999 ;;
1603 }
1604
1605 { .mfi
1606         nop.m 999
1607 (p7)  fclass.m.unc p14, p0 =  f8, 0x007
1608         nop.i 999
1609 }
1610
1611 { .mfi
1612         nop.m 999
1613 (p0)  fclass.m.unc p12, p9 =  f8, 0x021
1614         nop.i 999 ;;
1615 }
1616
1617 { .mfi
1618         nop.m 999
1619 (p0)  fclass.m.unc p11, p0 =  f8, 0x022
1620         nop.i 999
1621 }
1622
1623 { .mfi
1624         nop.m 999
1625 (p7)  fclass.m.unc p10, p0 =  f8, 0x022
1626         nop.i 999 ;;
1627 }
1628
1629 { .mfi
1630         nop.m 999
1631 //
1632 //    Identify +/- 0, Inf, or -Inf
1633 //    Generate the right kind of NaN.
1634 //
1635 (p13) fadd.d.s0 f99 = f0, f1
1636         nop.i 999 ;;
1637 }
1638
1639 { .mfi
1640         nop.m 999
1641 (p14) mov f99 = f8
1642         nop.i 999 ;;
1643 }
1644
1645 { .mfb
1646         nop.m 999
1647 (p6)  fadd.d.s0 f99 = f8, f1
1648 //
1649 //    exp(+/-0) = 1
1650 //    expm1(+/-0) = +/-0
1651 //    No exceptions raised
1652 //
1653 (p6)  br.cond.sptk EXP_64_RETURN;;
1654 }
1655
1656 { .mib
1657         nop.m 999
1658         nop.i 999
1659 (p14)  br.cond.sptk EXP_64_RETURN;;
1660 }
1661
1662 { .mfi
1663         nop.m 999
1664 (p11) mov f99 = f0
1665         nop.i 999 ;;
1666 }
1667
1668 { .mfb
1669         nop.m 999
1670 (p10) fsub.d.s1 f99 = f0, f1
1671 //
1672 //    exp(-Inf) = 0
1673 //    expm1(-Inf) = -1
1674 //    No exceptions raised.
1675 //
1676 (p10)  br.cond.sptk EXP_64_RETURN;;
1677 }
1678
1679 { .mfb
1680         nop.m 999
1681 (p12) fmpy.d.s1 f99 = f8, f1
1682 //
1683 //    exp(+Inf) = Inf
1684 //    No exceptions raised.
1685 //
1686 (p0)  br.cond.sptk EXP_64_RETURN;;
1687 }
1688
1689
1690 EXP_64_UNSUPPORTED:
1691
1692 { .mfb
1693        nop.m 999
1694 (p0)  fmpy.d.s0 f99 = f8, f0
1695       nop.b 0;;
1696 }
1697
1698 EXP_64_RETURN:
1699 { .mfb
1700       nop.m 999
1701 (p0)  mov   f8     = f99
1702 (p0)  br.ret.sptk   b0
1703 }
1704 .endp expm1
1705 ASM_SIZE_DIRECTIVE(expm1)
1706
1707 .proc __libm_error_region
1708 __libm_error_region:
1709 .prologue
1710 // (1)
1711 { .mfi
1712         add   GR_Parameter_Y=-32,sp             // Parameter 2 value
1713         nop.f 0
1714 .save   ar.pfs,GR_SAVE_PFS
1715         mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
1716 }
1717 { .mfi
1718 .fframe 64
1719         add sp=-64,sp                          // Create new stack
1720         nop.f 0
1721         mov GR_SAVE_GP=gp                      // Save gp
1722 };;
1723
1724 // (2)
1725 { .mmi
1726         stfd [GR_Parameter_Y] = FR_Y,16         // STORE Parameter 2 on stack
1727         add GR_Parameter_X = 16,sp            // Parameter 1 address
1728 .save   b0, GR_SAVE_B0
1729         mov GR_SAVE_B0=b0                     // Save b0
1730 };;
1731
1732 .body
1733 // (3)
1734 { .mib
1735         stfd [GR_Parameter_X] = FR_X                    // STORE Parameter 1 on stack
1736         add   GR_Parameter_RESULT = 0,GR_Parameter_Y  // Parameter 3 address
1737         nop.b 0
1738 }
1739 { .mib
1740         stfd [GR_Parameter_Y] = FR_RESULT                   // STORE Parameter 3 on stack
1741         add   GR_Parameter_Y = -16,GR_Parameter_Y
1742         br.call.sptk b0=__libm_error_support#         // Call error handling function
1743 };;
1744 { .mmi
1745         nop.m 0
1746         nop.m 0
1747         add   GR_Parameter_RESULT = 48,sp
1748 };;
1749
1750 // (4)
1751 { .mmi
1752         ldfd  f8 = [GR_Parameter_RESULT]       // Get return result off stack
1753 .restore sp
1754         add   sp = 64,sp                       // Restore stack pointer
1755         mov   b0 = GR_SAVE_B0                  // Restore return address
1756 };;
1757 { .mib
1758         mov   gp = GR_SAVE_GP                  // Restore gp
1759         mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
1760         br.ret.sptk     b0                     // Return
1761 };;
1762
1763 .endp __libm_error_region
1764 ASM_SIZE_DIRECTIVE(__libm_error_region)
1765
1766
1767 .type   __libm_error_support#,@function
1768 .global __libm_error_support#