sysdeps/ia64/fpu/s_atanl.S

   1 .file "atanl.s"
   2
   3
   4 // Copyright (c) 2000 - 2003, Intel Corporation
   5 // All rights reserved.
   6 //
   7 // Contributed 2000 by the Intel Numerics Group, Intel Corporation
   8 //
   9 // Redistribution and use in source and binary forms, with or without
  10 // modification, are permitted provided that the following conditions are
  11 // met:
  12 //
  13 // * Redistributions of source code must retain the above copyright
  14 // notice, this list of conditions and the following disclaimer.
  15 //
  16 // * Redistributions in binary form must reproduce the above copyright
  17 // notice, this list of conditions and the following disclaimer in the
  18 // documentation and/or other materials provided with the distribution.
  19 //
  20 // * The name of Intel Corporation may not be used to endorse or promote
  21 // products derived from this software without specific prior written
  22 // permission.
  23
  24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35 //
  36 // Intel Corporation is the author of this code, and requests that all
  37 // problem reports or change requests be submitted to it directly at
  38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
  39 //
  40 //
  41 //*********************************************************************
  42 //
  43 // History
  44 // 02/02/00 (hand-optimized)
  45 // 04/04/00 Unwind support added
  46 // 08/15/00 Bundle added after call to __libm_error_support to properly
  47 //          set [the previously overwritten] GR_Parameter_RESULT.
  48 // 03/13/01 Fixed flags when denormal raised on intermediate result
  49 // 01/08/02 Improved speed.
  50 // 02/06/02 Corrected .section statement
  51 // 05/20/02 Cleaned up namespace and sf0 syntax
  52 // 02/10/03 Reordered header: .section, .global, .proc, .align;
  53 //          used data8 for long double table values
  54 //
  55 //*********************************************************************
  56 //
  57 // Function:   atanl(x) = inverse tangent(x), for double extended x values
  58 // Function:   atan2l(y,x) = atan(y/x), for double extended y, x values
  59 //
  60 // API
  61 //
  62 //  long double atanl  (long double x)
  63 //  long double atan2l (long double y, long double x)
  64 //
  65 //*********************************************************************
  66 //
  67 // Resources Used:
  68 //
  69 //    Floating-Point Registers: f8 (Input and Return Value)
  70 //                              f9 (Input for atan2l)
  71 //                              f10-f15, f32-f83
  72 //
  73 //    General Purpose Registers:
  74 //      r32-r51
  75 //      r49-r52 (Arguments to error support for 0,0 case)
  76 //
  77 //    Predicate Registers:      p6-p15
  78 //
  79 //*********************************************************************
  80 //
  81 // IEEE Special Conditions:
  82 //
  83 //    Denormal fault raised on denormal inputs
  84 //    Underflow exceptions may occur
  85 //    Special error handling for the y=0 and x=0 case
  86 //    Inexact raised when appropriate by algorithm
  87 //
  88 //    atanl(SNaN) = QNaN
  89 //    atanl(QNaN) = QNaN
  90 //    atanl(+/-0) = +/- 0
  91 //    atanl(+/-Inf) = +/-pi/2
  92 //
  93 //    atan2l(Any NaN for x or y) = QNaN
  94 //    atan2l(+/-0,x) = +/-0 for x > 0
  95 //    atan2l(+/-0,x) = +/-pi for x < 0
  96 //    atan2l(+/-0,+0) = +/-0
  97 //    atan2l(+/-0,-0) = +/-pi
  98 //    atan2l(y,+/-0) = pi/2 y > 0
  99 //    atan2l(y,+/-0) = -pi/2 y < 0
 100 //    atan2l(+/-y, Inf) = +/-0 for finite y > 0
 101 //    atan2l(+/-Inf, x) = +/-pi/2 for finite x
 102 //    atan2l(+/-y, -Inf) = +/-pi for finite  y > 0
 103 //    atan2l(+/-Inf, Inf) = +/-pi/4
 104 //    atan2l(+/-Inf, -Inf) = +/-3pi/4
 105 //
 106 //*********************************************************************
 107 //
 108 // Mathematical Description
 109 // ---------------------------
 110 //
 111 // The function ATANL( Arg_Y, Arg_X ) returns the "argument"
 112 // or the "phase" of the complex number
 113 //
 114 //           Arg_X + i Arg_Y
 115 //
 116 // or equivalently, the angle in radians from the positive
 117 // x-axis to the line joining the origin and the point
 118 // (Arg_X,Arg_Y)
 119 //
 120 //
 121 //        (Arg_X, Arg_Y) x
 122 //                        \
 123 //                \
 124 //                 \
 125 //                  \
 126 //                   \ angle between is ATANL(Arg_Y,Arg_X)
 127
 128
 129
 130
 131 //                    \
 132 //                     ------------------> X-axis
 133
 134 //                   Origin
 135 //
 136 // Moreover, this angle is reported in the range [-pi,pi] thus
 137 //
 138 //      -pi <= ATANL( Arg_Y, Arg_X ) <= pi.
 139 //
 140 // From the geometry, it is easy to define ATANL when one of
 141 // Arg_X or Arg_Y is +-0 or +-inf:
 142 //
 143 //
 144 //      \ Y |
 145 //     X \  |  +0  | -0  |  +inf |  -inf  |  finite non-zero
 146 //        \ |      |     |       |        |
 147 //    ______________________________________________________
 148 //          |            |       |        |
 149 //     +-0  |   Invalid/ |  pi/2 | -pi/2  |  sign(Y)*pi/2
 150 //          |    qNaN    |       |        |
 151 //  --------------------------------------------------------
 152 //          |      |     |       |        |
 153 //     +inf |  +0  | -0  |  pi/4 | -pi/4  |  sign(Y)*0
 154 //  --------------------------------------------------------
 155 //          |      |     |       |        |
 156 //     -inf |  +pi | -pi | 3pi/4 | -3pi/4 |  sign(Y)*pi
 157 //  --------------------------------------------------------
 158 //   finite |    X>0?    |  pi/2 | -pi/2  |  normal case
 159 //  non-zero| sign(Y)*0: |       |        |
 160 //       | sign(Y)*pi |       |        |
 161 //
 162 //
 163 // One must take note that ATANL is NOT the arctangent of the
 164 // value Arg_Y/Arg_X; but rather ATANL and arctan are related
 165 // in a slightly more complicated way as follows:
 166 //
 167 // Let U := max(|Arg_X|, |Arg_Y|);  V := min(|Arg_X|, |Arg_Y|);
 168 // sign_X be the sign bit of Arg_X, i.e., sign_X is 0 or 1;
 169 // s_X    be the sign     of Arg_X, i.e., s_X = (-1)^sign_X;
 170 //
 171 // sign_Y be the sign bit of Arg_Y, i.e., sign_Y is 0 or 1;
 172 // s_Y    be the sign     of Arg_Y, i.e., s_Y = (-1)^sign_Y;
 173 //
 174 // swap   be 0  if |Arg_X| >= |Arg_Y|  and 1 otherwise.
 175 //
 176 // Then, ATANL(Arg_Y, Arg_X) =
 177 //
 178 //       /    arctan(V/U)     \      sign_X = 0 & swap = 0
 179 //       | pi/2 - arctan(V/U) |      sign_X = 0 & swap = 1
 180 // s_Y * |                    |
 181 //       |  pi  - arctan(V/U) |      sign_X = 1 & swap = 0
 182 //       \ pi/2 + arctan(V/U) /      sign_X = 1 & swap = 1
 183 //
 184 //
 185 // This relationship also suggest that the algorithm's major
 186 // task is to calculate arctan(V/U) for 0 < V <= U; and the
 187 // final Result is given by
 188 //
 189 //      s_Y * { (P_hi + P_lo) + sigma * arctan(V/U) }
 190 //
 191 // where
 192 //
 193 //   (P_hi,P_lo) represents M(sign_X,swap)*(pi/2) accurately
 194 //
 195 //   M(sign_X,swap) = 0  for sign_X = 0 and swap = 0
 196 //              1  for swap   = 1
 197 //              2  for sign_X = 1 and swap = 0
 198 //
 199 // and
 200 //
 201 //   sigma = { (sign_X  XOR  swap) :  -1.0 : 1.0 }
 202 //
 203 //      =  (-1) ^ ( sign_X XOR swap )
 204 //
 205 // Both (P_hi,P_lo) and sigma can be stored in a table and fetched
 206 // using (sign_X,swap) as an index. (P_hi, P_lo) can be stored as a
 207 // double-precision, and single-precision pair; and sigma can
 208 // obviously be just a single-precision number.
 209 //
 210 // In the algorithm we propose, arctan(V/U) is calculated to high accuracy
 211 // as A_hi + A_lo. Consequently, the Result ATANL( Arg_Y, Arg_X ) is
 212 // given by
 213 //
 214 //    s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
 215 //
 216 // We now discuss the calculation of arctan(V/U) for 0 < V <= U.
 217 //
 218 // For (V/U) < 2^(-3), we use a simple polynomial of the form
 219 //
 220 //      z + z^3*(P_1 + z^2*(P_2 + z^2*(P_3 + ... + P_8)))
 221 //
 222 // where z = V/U.
 223 //
 224 // For the sake of accuracy, the first term "z" must approximate V/U to
 225 // extra precision. For z^3 and higher power, a working precision
 226 // approximation to V/U suffices. Thus, we obtain:
 227 //
 228 //      z_hi + z_lo = V/U  to extra precision and
 229 //      z           = V/U  to working precision
 230 //
 231 // The value arctan(V/U) is delivered as two pieces (A_hi, A_lo)
 232 //
 233 //      (A_hi,A_lo) = (z_hi, z^3*(P_1 + ... + P_8) + z_lo).
 234 //
 235 //
 236 // For 2^(-3) <= (V/U) <= 1, we use a table-driven approach.
 237 // Consider
 238 //
 239 //      (V/U) = 2^k * 1.b_1 b_2 .... b_63 b_64 b_65 ....
 240 //
 241 // Define
 242 //
 243 //       z_hi = 2^k * 1.b_1 b_2 b_3 b_4 1
 244 //
 245 // then
 246 //                                            /                \
 247 //                                            |  (V/U) - z_hi  |
 248
 249 //      arctan(V/U) = arctan(z_hi) + acrtan| -------------- |
 250 //                                            | 1 + (V/U)*z_hi |
 251 //                                            \                /
 252 //
 253 //                                            /                \
 254 //                                            |   V - z_hi*U   |
 255
 256 //                  = arctan(z_hi) + acrtan| -------------- |
 257 //                                            |   U + V*z_hi   |
 258 //                                            \                /
 259 //
 260 //                  = arctan(z_hi) + acrtan( V' / U' )
 261 //
 262 //
 263 // where
 264 //
 265 //      V' = V - U*z_hi;   U' = U + V*z_hi.
 266 //
 267 // Let
 268 //
 269 //      w_hi + w_lo  = V'/U' to extra precision and
 270 //           w       = V'/U' to working precision
 271 //
 272 // then we can approximate arctan(V'/U') by
 273 //
 274 //      arctan(V'/U') = w_hi + w_lo
 275 //                     + w^3*(Q_1 + w^2*(Q_2 + w^2*(Q_3 + w^2*Q_4)))
 276 //
 277 //                       = w_hi + w_lo + poly
 278 //
 279 // Finally, arctan(z_hi) is calculated beforehand and stored in a table
 280 // as Tbl_hi, Tbl_lo. Thus,
 281 //
 282 //      (A_hi, A_lo) = (Tbl_hi, w_hi+(poly+(w_lo+Tbl_lo)))
 283 //
 284 // This completes the mathematical description.
 285 //
 286 //
 287 // Algorithm
 288 // -------------
 289 //
 290 // Step 0. Check for unsupported format.
 291 //
 292 //    If
 293 //       ( expo(Arg_X) not zero AND msb(Arg_X) = 0 ) OR
 294 //       ( expo(Arg_Y) not zero AND msb(Arg_Y) = 0 )
 295 //
 296 //    then one of the arguments is unsupported. Generate an
 297 //    invalid and return qNaN.
 298 //
 299 // Step 1. Initialize
 300 //
 301 //    Normalize Arg_X and Arg_Y and set the following
 302 //
 303 //    sign_X :=  sign_bit(Arg_X)
 304 //    s_Y    := (sign_bit(Arg_Y)==0? 1.0 : -1.0)
 305 //    swap   := (|Arg_X| >= |Arg_Y|?   0 :  1  )
 306 //    U      := max( |Arg_X|, |Arg_Y| )
 307 //    V      := min( |Arg_X|, |Arg_Y| )
 308 //
 309 //    execute: frcpa E, pred, V, U
 310 //    If pred is 0, go to Step 5 for special cases handling.
 311 //
 312 // Step 2. Decide on branch.
 313 //
 314 //    Q := E * V
 315 //    If Q < 2^(-3) go to Step 4 for simple polynomial case.
 316 //
 317 // Step 3. Table-driven algorithm.
 318 //
 319 //    Q is represented as
 320 //
 321 //      2^(-k) * 1.b_1 b_2 b_3 ... b_63; k = 0,-1,-2,-3
 322 //
 323 // and that if k = 0, b_1 = b_2 = b_3 = b_4 = 0.
 324 //
 325 // Define
 326 //
 327 //      z_hi := 2^(-k) * 1.b_1 b_2 b_3 b_4 1
 328 //
 329 // (note that there are 49 possible values of z_hi).
 330 //
 331 //      ...We now calculate V' and U'. While V' is representable
 332 //      ...as a 64-bit number because of cancellation, U' is
 333 //      ...not in general a 64-bit number. Obtaining U' accurately
 334 //      ...requires two working precision numbers
 335 //
 336 //      U_prime_hi := U + V * z_hi            ...WP approx. to U'
 337 //      U_prime_lo := ( U - U_prime_hi ) + V*z_hi ...observe order
 338 //      V_prime    := V - U * z_hi             ...this is exact
 339 //
 340 //         C_hi := frcpa (1.0, U_prime_hi)  ...C_hi approx 1/U'_hi
 341 //
 342 //      loop 3 times
 343 //         C_hi := C_hi + C_hi*(1.0 - C_hi*U_prime_hi)
 344 //
 345 //      ...at this point C_hi is (1/U_prime_hi) to roughly 64 bits
 346 //
 347 //      w_hi := V_prime * C_hi     ...w_hi is V_prime/U_prime to
 348 //                     ...roughly working precision
 349 //
 350 //         ...note that we want w_hi + w_lo to approximate
 351 //      ...V_prime/(U_prime_hi + U_prime_lo) to extra precision
 352 //         ...but for now, w_hi is good enough for the polynomial
 353 //      ...calculation.
 354 //
 355 //         wsq  := w_hi*w_hi
 356 //      poly := w_hi*wsq*(Q_1 + wsq*(Q_2 + wsq*(Q_3 + wsq*Q_4)))
 357 //
 358 //      Fetch
 359 //      (Tbl_hi, Tbl_lo) = atan(z_hi) indexed by (k,b_1,b_2,b_3,b_4)
 360 //      ...Tbl_hi is a double-precision number
 361 //      ...Tbl_lo is a single-precision number
 362 //
 363 //         (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo)
 364 //      ...as discussed previous. Again; the implementation can
 365 //      ...chose to fetch P_hi and P_lo from a table indexed by
 366 //      ...(sign_X, swap).
 367 //      ...P_hi is a double-precision number;
 368 //      ...P_lo is a single-precision number.
 369 //
 370 //      ...calculate w_lo so that w_hi + w_lo is V'/U' accurately
 371 //         w_lo := ((V_prime - w_hi*U_prime_hi) -
 372 //              w_hi*U_prime_lo) * C_hi     ...observe order
 373 //
 374 //
 375 //      ...Ready to deliver arctan(V'/U') as A_hi, A_lo
 376 //      A_hi := Tbl_hi
 377 //      A_lo := w_hi + (poly + (Tbl_lo + w_lo)) ...observe order
 378 //
 379 //      ...Deliver final Result
 380 //      ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
 381 //
 382 //      sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 )
 383 //      ...sigma can be obtained by a table lookup using
 384 //      ...(sign_X,swap) as index and stored as single precision
 385 //         ...sigma should be calculated earlier
 386 //
 387 //      P_hi := s_Y*P_hi
 388 //      A_hi := s_Y*A_hi
 389 //
 390 //      Res_hi := P_hi + sigma*A_hi     ...this is exact because
 391 //                          ...both P_hi and Tbl_hi
 392 //                          ...are double-precision
 393 //                          ...and |Tbl_hi| > 2^(-4)
 394 //                          ...P_hi is either 0 or
 395 //                          ...between (1,4)
 396 //
 397 //      Res_lo := sigma*A_lo + P_lo
 398 //
 399 //      Return Res_hi + s_Y*Res_lo in user-defined rounding control
 400 //
 401 // Step 4. Simple polynomial case.
 402 //
 403 //    ...E and Q are inherited from Step 2.
 404 //
 405 //    A_hi := Q     ...Q is inherited from Step 2 Q approx V/U
 406 //
 407 //    loop 3 times
 408 //       E := E + E2(1.0 - E*U1
 409 //    ...at this point E approximates 1/U to roughly working precision
 410 //
 411 //    z := V * E     ...z approximates V/U to roughly working precision
 412 //    zsq := z * z
 413 //    z4 := zsq * zsq; z8 := z4 * z4
 414 //
 415 //    poly1 := P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
 416 //    poly2 := zsq*(P_1 + zsq*(P_2 + zsq*P_3))
 417 //
 418 //    poly  := poly1 + z8*poly2
 419 //
 420 //    z_lo := (V - A_hi*U)*E
 421 //
 422 //    A_lo := z*poly + z_lo
 423 //    ...A_hi, A_lo approximate arctan(V/U) accurately
 424 //
 425 //    (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo)
 426 //    ...one can store the M(sign_X,swap) as single precision
 427 //    ...values
 428 //
 429 //    ...Deliver final Result
 430 //    ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
 431 //
 432 //    sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 )
 433 //    ...sigma can be obtained by a table lookup using
 434 //    ...(sign_X,swap) as index and stored as single precision
 435 //    ...sigma should be calculated earlier
 436 //
 437 //    P_hi := s_Y*P_hi
 438 //    A_hi := s_Y*A_hi
 439 //
 440 //    Res_hi := P_hi + sigma*A_hi          ...need to compute
 441 //                          ...P_hi + sigma*A_hi
 442 //                          ...exactly
 443 //
 444 //    tmp    := (P_hi - Res_hi) + sigma*A_hi
 445 //
 446 //    Res_lo := s_Y*(sigma*A_lo + P_lo) + tmp
 447 //
 448 //    Return Res_hi + Res_lo in user-defined rounding control
 449 //
 450 // Step 5. Special Cases
 451 //
 452 //    These are detected early in the function by fclass instructions.
 453 //
 454 //    We are in one of those special cases when X or Y is 0,+-inf or NaN
 455 //
 456 //    If one of X and Y is NaN, return X+Y (which will generate
 457 //    invalid in case one is a signaling NaN). Otherwise,
 458 //    return the Result as described in the table
 459 //
 460 //
 461 //
 462 //      \ Y |
 463 //     X \  |  +0  | -0  |  +inf |  -inf  |  finite non-zero
 464 //        \ |      |     |       |        |
 465 //    ______________________________________________________
 466 //          |            |       |        |
 467 //     +-0  |   Invalid/ |  pi/2 | -pi/2  |  sign(Y)*pi/2
 468 //          |    qNaN    |       |        |
 469 //  --------------------------------------------------------
 470 //          |      |     |       |        |
 471 //     +inf |  +0  | -0  |  pi/4 | -pi/4  |  sign(Y)*0
 472 //  --------------------------------------------------------
 473 //          |      |     |       |        |
 474 //     -inf |  +pi | -pi | 3pi/4 | -3pi/4 |  sign(Y)*pi
 475 //  --------------------------------------------------------
 476 //   finite |    X>0?    |  pi/2 | -pi/2  |
 477 //  non-zero| sign(Y)*0: |       |        |      N/A
 478 //       | sign(Y)*pi |       |        |
 479 //
 480 //
 481
 482 ArgY_orig   =   f8
 483 Result      =   f8
 484 FR_RESULT   =   f8
 485 ArgX_orig   =   f9
 486 ArgX        =   f10
 487 FR_X        =   f10
 488 ArgY        =   f11
 489 FR_Y        =   f11
 490 s_Y         =   f12
 491 U           =   f13
 492 V           =   f14
 493 E           =   f15
 494 Q           =   f32
 495 z_hi        =   f33
 496 U_prime_hi  =   f34
 497 U_prime_lo  =   f35
 498 V_prime     =   f36
 499 C_hi        =   f37
 500 w_hi        =   f38
 501 w_lo        =   f39
 502 wsq         =   f40
 503 poly        =   f41
 504 Tbl_hi      =   f42
 505 Tbl_lo      =   f43
 506 P_hi        =   f44
 507 P_lo        =   f45
 508 A_hi        =   f46
 509 A_lo        =   f47
 510 sigma       =   f48
 511 Res_hi      =   f49
 512 Res_lo      =   f50
 513 Z           =   f52
 514 zsq         =   f53
 515 z4          =   f54
 516 z8          =   f54
 517 poly1       =   f55
 518 poly2       =   f56
 519 z_lo        =   f57
 520 tmp         =   f58
 521 P_1         =   f59
 522 Q_1         =   f60
 523 P_2         =   f61
 524 Q_2         =   f62
 525 P_3         =   f63
 526 Q_3         =   f64
 527 P_4         =   f65
 528 Q_4         =   f66
 529 P_5         =   f67
 530 P_6         =   f68
 531 P_7         =   f69
 532 P_8         =   f70
 533 U_hold      =   f71
 534 TWO_TO_NEG3 =   f72
 535 C_hi_hold   =   f73
 536 E_hold      =   f74
 537 M           =   f75
 538 ArgX_abs    =   f76
 539 ArgY_abs    =   f77
 540 Result_lo   =   f78
 541 A_temp      =   f79
 542 FR_temp     =   f80
 543 Xsq         =   f81
 544 Ysq         =   f82
 545 tmp_small   =   f83
 546
 547 GR_SAVE_PFS   = r33
 548 GR_SAVE_B0    = r34
 549 GR_SAVE_GP    = r35
 550 sign_X        = r36
 551 sign_Y        = r37
 552 swap          = r38
 553 table_ptr1    = r39
 554 table_ptr2    = r40
 555 k             = r41
 556 lookup        = r42
 557 exp_ArgX      = r43
 558 exp_ArgY      = r44
 559 exponent_Q    = r45
 560 significand_Q = r46
 561 special       = r47
 562 sp_exp_Q      = r48
 563 sp_exp_4sig_Q = r49
 564 table_base    = r50
 565 int_temp      = r51
 566
 567 GR_Parameter_X      = r49
 568 GR_Parameter_Y      = r50
 569 GR_Parameter_RESULT = r51
 570 GR_Parameter_TAG    = r52
 571 GR_temp             = r52
 572
 573 RODATA
 574 .align 16
 575
 576 LOCAL_OBJECT_START(Constants_atan)
 577 //       double pi/2
 578 data8 0x3FF921FB54442D18
 579 //       single lo_pi/2, two**(-3)
 580 data4 0x248D3132, 0x3E000000
 581 data8 0xAAAAAAAAAAAAAAA3, 0xBFFD // P_1
 582 data8 0xCCCCCCCCCCCC54B2, 0x3FFC // P_2
 583 data8 0x9249249247E4D0C2, 0xBFFC // P_3
 584 data8 0xE38E38E058870889, 0x3FFB // P_4
 585 data8 0xBA2E895B290149F8, 0xBFFB // P_5
 586 data8 0x9D88E6D4250F733D, 0x3FFB // P_6
 587 data8 0x884E51FFFB8745A0, 0xBFFB // P_7
 588 data8 0xE1C7412B394396BD, 0x3FFA // P_8
 589 data8 0xAAAAAAAAAAAAA52F, 0xBFFD // Q_1
 590 data8 0xCCCCCCCCC75B60D3, 0x3FFC // Q_2
 591 data8 0x924923AD011F1940, 0xBFFC // Q_3
 592 data8 0xE36F716D2A5F89BD, 0x3FFB // Q_4
 593 //
 594 //    Entries Tbl_hi  (double precision)
 595 //    B = 1+Index/16+1/32  Index = 0
 596 //    Entries Tbl_lo (single precision)
 597 //    B = 1+Index/16+1/32  Index = 0
 598 //
 599 data8 0x3FE9A000A935BD8E
 600 data4 0x23ACA08F, 0x00000000
 601 //
 602 //    Entries Tbl_hi  (double precision) Index = 0,1,...,15
 603 //    B = 2^(-1)*(1+Index/16+1/32)
 604 //    Entries Tbl_lo (single precision)
 605 //    Index = 0,1,...,15  B = 2^(-1)*(1+Index/16+1/32)
 606 //
 607 data8 0x3FDE77EB7F175A34
 608 data4 0x238729EE, 0x00000000
 609 data8 0x3FE0039C73C1A40B
 610 data4 0x249334DB, 0x00000000
 611 data8 0x3FE0C6145B5B43DA
 612 data4 0x22CBA7D1, 0x00000000
 613 data8 0x3FE1835A88BE7C13
 614 data4 0x246310E7, 0x00000000
 615 data8 0x3FE23B71E2CC9E6A
 616 data4 0x236210E5, 0x00000000
 617 data8 0x3FE2EE628406CBCA
 618 data4 0x2462EAF5, 0x00000000
 619 data8 0x3FE39C391CD41719
 620 data4 0x24B73EF3, 0x00000000
 621 data8 0x3FE445065B795B55
 622 data4 0x24C11260, 0x00000000
 623 data8 0x3FE4E8DE5BB6EC04
 624 data4 0x242519EE, 0x00000000
 625 data8 0x3FE587D81F732FBA
 626 data4 0x24D4346C, 0x00000000
 627 data8 0x3FE6220D115D7B8D
 628 data4 0x24ED487B, 0x00000000
 629 data8 0x3FE6B798920B3D98
 630 data4 0x2495FF1E, 0x00000000
 631 data8 0x3FE748978FBA8E0F
 632 data4 0x223D9531, 0x00000000
 633 data8 0x3FE7D528289FA093
 634 data4 0x242B0411, 0x00000000
 635 data8 0x3FE85D69576CC2C5
 636 data4 0x2335B374, 0x00000000
 637 data8 0x3FE8E17AA99CC05D
 638 data4 0x24C27CFB, 0x00000000
 639 //
 640 //    Entries Tbl_hi  (double precision) Index = 0,1,...,15
 641 //    B = 2^(-2)*(1+Index/16+1/32)
 642 //    Entries Tbl_lo (single precision)
 643 //    Index = 0,1,...,15  B = 2^(-2)*(1+Index/16+1/32)
 644 //
 645 data8 0x3FD025FA510665B5
 646 data4 0x24263482, 0x00000000
 647 data8 0x3FD1151A362431C9
 648 data4 0x242C8DC9, 0x00000000
 649 data8 0x3FD2025567E47C95
 650 data4 0x245CF9BA, 0x00000000
 651 data8 0x3FD2ED987A823CFE
 652 data4 0x235C892C, 0x00000000
 653 data8 0x3FD3D6D129271134
 654 data4 0x2389BE52, 0x00000000
 655 data8 0x3FD4BDEE586890E6
 656 data4 0x24436471, 0x00000000
 657 data8 0x3FD5A2E0175E0F4E
 658 data4 0x2389DBD4, 0x00000000
 659 data8 0x3FD685979F5FA6FD
 660 data4 0x2476D43F, 0x00000000
 661 data8 0x3FD7660752817501
 662 data4 0x24711774, 0x00000000
 663 data8 0x3FD84422B8DF95D7
 664 data4 0x23EBB501, 0x00000000
 665 data8 0x3FD91FDE7CD0C662
 666 data4 0x23883A0C, 0x00000000
 667 data8 0x3FD9F93066168001
 668 data4 0x240DF63F, 0x00000000
 669 data8 0x3FDAD00F5422058B
 670 data4 0x23FE261A, 0x00000000
 671 data8 0x3FDBA473378624A5
 672 data4 0x23A8CD0E, 0x00000000
 673 data8 0x3FDC76550AAD71F8
 674 data4 0x2422D1D0, 0x00000000
 675 data8 0x3FDD45AEC9EC862B
 676 data4 0x2344A109, 0x00000000
 677 //
 678 //    Entries Tbl_hi  (double precision) Index = 0,1,...,15
 679 //    B = 2^(-3)*(1+Index/16+1/32)
 680 //    Entries Tbl_lo (single precision)
 681 //    Index = 0,1,...,15  B = 2^(-3)*(1+Index/16+1/32)
 682 //
 683 data8 0x3FC068D584212B3D
 684 data4 0x239874B6, 0x00000000
 685 data8 0x3FC1646541060850
 686 data4 0x2335E774, 0x00000000
 687 data8 0x3FC25F6E171A535C
 688 data4 0x233E36BE, 0x00000000
 689 data8 0x3FC359E8EDEB99A3
 690 data4 0x239680A3, 0x00000000
 691 data8 0x3FC453CEC6092A9E
 692 data4 0x230FB29E, 0x00000000
 693 data8 0x3FC54D18BA11570A
 694 data4 0x230C1418, 0x00000000
 695 data8 0x3FC645BFFFB3AA73
 696 data4 0x23F0564A, 0x00000000
 697 data8 0x3FC73DBDE8A7D201
 698 data4 0x23D4A5E1, 0x00000000
 699 data8 0x3FC8350BE398EBC7
 700 data4 0x23D4ADDA, 0x00000000
 701 data8 0x3FC92BA37D050271
 702 data4 0x23BCB085, 0x00000000
 703 data8 0x3FCA217E601081A5
 704 data4 0x23BC841D, 0x00000000
 705 data8 0x3FCB1696574D780B
 706 data4 0x23CF4A8E, 0x00000000
 707 data8 0x3FCC0AE54D768466
 708 data4 0x23BECC90, 0x00000000
 709 data8 0x3FCCFE654E1D5395
 710 data4 0x2323DCD2, 0x00000000
 711 data8 0x3FCDF110864C9D9D
 712 data4 0x23F53F3A, 0x00000000
 713 data8 0x3FCEE2E1451D980C
 714 data4 0x23CCB11F, 0x00000000
 715 //
 716 data8 0x400921FB54442D18, 0x3CA1A62633145C07 // PI two doubles
 717 data8 0x3FF921FB54442D18, 0x3C91A62633145C07 // PI_by_2 two dbles
 718 data8 0x3FE921FB54442D18, 0x3C81A62633145C07 // PI_by_4 two dbles
 719 data8 0x4002D97C7F3321D2, 0x3C9A79394C9E8A0A // 3PI_by_4 two dbles
 720 LOCAL_OBJECT_END(Constants_atan)
 721
 722
 723 .section .text
 724 GLOBAL_IEEE754_ENTRY(atanl)
 725
 726 // Use common code with atan2l after setting x=1.0
 727 { .mfi
 728       alloc r32 = ar.pfs, 0, 17, 4, 0
 729       fma.s1 Ysq = ArgY_orig, ArgY_orig, f0          // Form y*y
 730       nop.i 999
 731 }
 732 { .mfi
 733       addl table_ptr1 = @ltoff(Constants_atan#), gp  // Address of table pointer
 734       fma.s1 Xsq = f1, f1, f0                        // Form x*x
 735       nop.i 999
 736 }
 737 ;;
 738
 739 { .mfi
 740       ld8 table_ptr1 = [table_ptr1]                  // Get table pointer
 741       fnorm.s1 ArgY = ArgY_orig
 742       nop.i 999
 743 }
 744 { .mfi
 745       nop.m 999
 746       fnorm.s1 ArgX = f1
 747       nop.i 999
 748 }
 749 ;;
 750
 751 { .mfi
 752       getf.exp sign_X = f1               // Get signexp of x
 753       fmerge.s ArgX_abs = f0, f1         // Form |x|
 754       nop.i 999
 755 }
 756 { .mfi
 757       nop.m 999
 758       fnorm.s1 ArgX_orig = f1
 759       nop.i 999
 760 }
 761 ;;
 762
 763 { .mfi
 764       getf.exp sign_Y = ArgY_orig        // Get signexp of y
 765       fmerge.s ArgY_abs = f0, ArgY_orig  // Form |y|
 766       mov table_base = table_ptr1        // Save base pointer to tables
 767 }
 768 ;;
 769
 770 { .mfi
 771       ldfd P_hi = [table_ptr1],8         // Load double precision hi part of pi
 772       fclass.m p8,p0 = ArgY_orig, 0x1e7  // Test y natval, nan, inf, zero
 773       nop.i 999
 774 }
 775 ;;
 776
 777 { .mfi
 778       ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
 779       nop.f 999
 780       nop.i 999
 781 }
 782 { .mfi
 783       nop.m 999
 784       fma.s1 M = f1, f1, f0              // Set M = 1.0
 785       nop.i 999
 786 }
 787 ;;
 788
 789 //
 790 //     Check for everything - if false, then must be pseudo-zero
 791 //     or pseudo-nan (IA unsupporteds).
 792 //
 793 { .mfb
 794       nop.m 999
 795       fclass.m p0,p12 = f1, 0x1FF        // Test x unsupported
 796 (p8)  br.cond.spnt ATANL_Y_SPECIAL       // Branch if y natval, nan, inf, zero
 797 }
 798 ;;
 799
 800 //     U = max(ArgX_abs,ArgY_abs)
 801 //     V = min(ArgX_abs,ArgY_abs)
 802 { .mfi
 803       nop.m 999
 804       fcmp.ge.s1 p6,p7 = Xsq, Ysq        // Test for |x| >= |y| using squares
 805       nop.i 999
 806 }
 807 { .mfb
 808       nop.m 999
 809       fma.s1 V = ArgX_abs, f1, f0        // Set V assuming |x| < |y|
 810       br.cond.sptk ATANL_COMMON          // Branch to common code
 811 }
 812 ;;
 813
 814 GLOBAL_IEEE754_END(atanl)
 815 GLOBAL_IEEE754_ENTRY(atan2l)
 816
 817 { .mfi
 818       alloc r32 = ar.pfs, 0, 17, 4, 0
 819       fma.s1 Ysq = ArgY_orig, ArgY_orig, f0          // Form y*y
 820       nop.i 999
 821 }
 822 { .mfi
 823       addl table_ptr1 = @ltoff(Constants_atan#), gp  // Address of table pointer
 824       fma.s1 Xsq = ArgX_orig, ArgX_orig, f0          // Form x*x
 825       nop.i 999
 826 }
 827 ;;
 828
 829 { .mfi
 830       ld8 table_ptr1 = [table_ptr1]                  // Get table pointer
 831       fnorm.s1 ArgY = ArgY_orig
 832       nop.i 999
 833 }
 834 { .mfi
 835       nop.m 999
 836       fnorm.s1 ArgX = ArgX_orig
 837       nop.i 999
 838 }
 839 ;;
 840
 841 { .mfi
 842       getf.exp sign_X = ArgX_orig        // Get signexp of x
 843       fmerge.s ArgX_abs = f0, ArgX_orig  // Form |x|
 844       nop.i 999
 845 }
 846 ;;
 847
 848 { .mfi
 849       getf.exp sign_Y = ArgY_orig        // Get signexp of y
 850       fmerge.s ArgY_abs = f0, ArgY_orig  // Form |y|
 851       mov table_base = table_ptr1        // Save base pointer to tables
 852 }
 853 ;;
 854
 855 { .mfi
 856       ldfd P_hi = [table_ptr1],8         // Load double precision hi part of pi
 857       fclass.m p8,p0 = ArgY_orig, 0x1e7  // Test y natval, nan, inf, zero
 858       nop.i 999
 859 }
 860 ;;
 861
 862 { .mfi
 863       ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
 864       fclass.m p9,p0 = ArgX_orig, 0x1e7  // Test x natval, nan, inf, zero
 865       nop.i 999
 866 }
 867 { .mfi
 868       nop.m 999
 869       fma.s1 M = f1, f1, f0              // Set M = 1.0
 870       nop.i 999
 871 }
 872 ;;
 873
 874 //
 875 //     Check for everything - if false, then must be pseudo-zero
 876 //     or pseudo-nan (IA unsupporteds).
 877 //
 878 { .mfb
 879       nop.m 999
 880       fclass.m p0,p12 = ArgX_orig, 0x1FF // Test x unsupported
 881 (p8)  br.cond.spnt ATANL_Y_SPECIAL       // Branch if y natval, nan, inf, zero
 882 }
 883 ;;
 884
 885 //     U = max(ArgX_abs,ArgY_abs)
 886 //     V = min(ArgX_abs,ArgY_abs)
 887 { .mfi
 888       nop.m 999
 889       fcmp.ge.s1 p6,p7 = Xsq, Ysq        // Test for |x| >= |y| using squares
 890       nop.i 999
 891 }
 892 { .mfb
 893       nop.m 999
 894       fma.s1 V = ArgX_abs, f1, f0        // Set V assuming |x| < |y|
 895 (p9)  br.cond.spnt ATANL_X_SPECIAL       // Branch if x natval, nan, inf, zero
 896 }
 897 ;;
 898
 899 // Now common code for atanl and atan2l
 900 ATANL_COMMON:
 901 { .mfi
 902       nop.m 999
 903       fclass.m p0,p13 = ArgY_orig, 0x1FF // Test y unsupported
 904       shr sign_X = sign_X, 17            // Get sign bit of x
 905 }
 906 { .mfi
 907       nop.m 999
 908       fma.s1 U = ArgY_abs, f1, f0        // Set U assuming |x| < |y|
 909       adds table_ptr1 = 176, table_ptr1  // Point to Q4
 910 }
 911 ;;
 912
 913 { .mfi
 914 (p6)  add swap = r0, r0                  // Set swap=0 if |x| >= |y|
 915 (p6)  frcpa.s1 E, p0 = ArgY_abs, ArgX_abs // Compute E if |x| >= |y|
 916       shr sign_Y = sign_Y, 17            // Get sign bit of y
 917 }
 918 { .mfb
 919       nop.m 999
 920 (p6)  fma.s1 V = ArgY_abs, f1, f0        // Set V if |x| >= |y|
 921 (p12) br.cond.spnt ATANL_UNSUPPORTED     // Branch if x unsupported
 922 }
 923 ;;
 924
 925 // Set p8 if y >=0
 926 // Set p9 if y < 0
 927 // Set p10 if |x| >= |y| and x >=0
 928 // Set p11 if |x| >= |y| and x < 0
 929 { .mfi
 930       cmp.eq p8, p9 = 0, sign_Y          // Test for y >= 0
 931 (p7)  frcpa.s1 E, p0 = ArgX_abs, ArgY_abs // Compute E if |x| < |y|
 932 (p7)  add swap = 1, r0                   // Set swap=1 if |x| < |y|
 933 }
 934 { .mfb
 935 (p6)  cmp.eq.unc p10, p11 = 0, sign_X    // If |x| >= |y|, test for x >= 0
 936 (p6)  fma.s1 U = ArgX_abs, f1, f0        // Set U if |x| >= |y|
 937 (p13) br.cond.spnt ATANL_UNSUPPORTED     // Branch if y unsupported
 938 }
 939 ;;
 940
 941 //
 942 //     if p8, s_Y = 1.0
 943 //     if p9, s_Y = -1.0
 944 //
 945 .pred.rel "mutex",p8,p9
 946 { .mfi
 947       nop.m 999
 948 (p8)  fadd.s1 s_Y = f0, f1               // If y >= 0 set s_Y = 1.0
 949       nop.i 999
 950 }
 951 { .mfi
 952       nop.m 999
 953 (p9)  fsub.s1 s_Y = f0, f1               // If y < 0 set s_Y = -1.0
 954       nop.i 999
 955 }
 956 ;;
 957
 958 .pred.rel "mutex",p10,p11
 959 { .mfi
 960       nop.m 999
 961 (p10) fsub.s1 M = M, f1                  // If |x| >= |y| and x >=0, set M=0
 962       nop.i 999
 963 }
 964 { .mfi
 965       nop.m 999
 966 (p11) fadd.s1 M = M, f1                  // If |x| >= |y| and x < 0, set M=2.0
 967       nop.i 999
 968 }
 969 ;;
 970
 971 { .mfi
 972       nop.m 999
 973       fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag
 974       nop.i 999
 975 }
 976 // *************************************************
 977 // ********************* STEP2 *********************
 978 // *************************************************
 979 //
 980 //     Q = E * V
 981 //
 982 { .mfi
 983       nop.m 999
 984       fmpy.s1 Q = E, V
 985       nop.i 999
 986 }
 987 ;;
 988
 989 { .mfi
 990       nop.m 999
 991       fnma.s1 E_hold = E, U, f1           // E_hold = 1.0 - E*U (1) if POLY path
 992       nop.i 999
 993 }
 994 ;;
 995
 996 // Create a single precision representation of the signexp of Q with the
 997 // 4 most significant bits of the significand followed by a 1 and then 18 0's
 998 { .mfi
 999       nop.m 999
1000       fmpy.s1 P_hi = M, P_hi
1001       dep.z special = 0x1, 18, 1           // Form 0x0000000000040000
1002 }
1003 { .mfi
1004       nop.m 999
1005       fmpy.s1 P_lo = M, P_lo
1006       add table_ptr2 = 32, table_ptr1
1007 }
1008 ;;
1009
1010 { .mfi
1011       nop.m 999
1012       fma.s1 A_temp = Q, f1, f0            // Set A_temp if POLY path
1013       nop.i 999
1014 }
1015 { .mfi
1016       nop.m 999
1017       fma.s1 E = E, E_hold, E              // E = E + E*E_hold (1) if POLY path
1018       nop.i 999
1019 }
1020 ;;
1021
1022 //
1023 //     Is Q < 2**(-3)?
1024 //     swap = xor(swap,sign_X)
1025 //
1026 { .mfi
1027       nop.m 999
1028       fcmp.lt.s1 p9, p0 = Q, TWO_TO_NEG3    // Test Q < 2^-3
1029       xor swap = sign_X, swap
1030 }
1031 ;;
1032
1033 //     P_hi = s_Y * P_hi
1034 { .mmf
1035       getf.exp exponent_Q =  Q              // Get signexp of Q
1036       cmp.eq.unc p7, p6 = 0x00000, swap
1037       fmpy.s1 P_hi = s_Y, P_hi
1038 }
1039 ;;
1040
1041 //
1042 //     if (PR_1) sigma = -1.0
1043 //     if (PR_2) sigma =  1.0
1044 //
1045 { .mfi
1046       getf.sig significand_Q = Q            // Get significand of Q
1047 (p6)  fsub.s1 sigma = f0, f1
1048       nop.i 999
1049 }
1050 { .mfb
1051 (p9)  add table_ptr1 = 128, table_base      // Point to P8 if POLY path
1052 (p7)  fadd.s1 sigma = f0, f1
1053 (p9)  br.cond.spnt ATANL_POLY               // Branch to POLY if 0 < Q < 2^-3
1054 }
1055 ;;
1056
1057 //
1058 // *************************************************
1059 // ******************** STEP3 **********************
1060 // *************************************************
1061 //
1062 //     lookup = b_1 b_2 b_3 B_4
1063 //
1064 { .mmi
1065       nop.m 999
1066       nop.m 999
1067       andcm k = 0x0003, exponent_Q  // k=0,1,2,3 for exp_Q=0,-1,-2,-3
1068 }
1069 ;;
1070
1071 //
1072 //  Generate sign_exp_Q b_1 b_2 b_3 b_4 1 0 0 0 ... 0  in single precision
1073 //  representation.  Note sign of Q is always 0.
1074 //
1075 { .mfi
1076       cmp.eq p8, p9 = 0x0000, k             // Test k=0
1077       nop.f 999
1078       extr.u lookup = significand_Q, 59, 4  // Extract b_1 b_2 b_3 b_4 for index
1079 }
1080 { .mfi
1081       sub sp_exp_Q = 0x7f, k                // Form single prec biased exp of Q
1082       nop.f 999
1083       sub k = k, r0, 1                      // Decrement k
1084 }
1085 ;;
1086
1087 //     Form pointer to B index table
1088 { .mfi
1089       ldfe Q_4 = [table_ptr1], -16          // Load Q_4
1090       nop.f 999
1091 (p9)  shl k = k, 8                          // k = 0, 256, or 512
1092 }
1093 { .mfi
1094 (p9)  shladd table_ptr2 = lookup, 4, table_ptr2
1095       nop.f 999
1096       shladd sp_exp_4sig_Q = sp_exp_Q, 4, lookup // Shift and add in 4 high bits
1097 }
1098 ;;
1099
1100 { .mmi
1101 (p8)  add table_ptr2 = -16, table_ptr2      // Pointer if original k was 0
1102 (p9)  add table_ptr2 = k, table_ptr2        // Pointer if k was 1, 2, 3
1103       dep special = sp_exp_4sig_Q, special, 19, 13 // Form z_hi as single prec
1104 }
1105 ;;
1106
1107 //     z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
1108 { .mmi
1109       ldfd Tbl_hi = [table_ptr2], 8         // Load Tbl_hi from index table
1110 ;;
1111       setf.s z_hi = special                 // Form z_hi
1112       nop.i 999
1113 }
1114 { .mmi
1115       ldfs Tbl_lo = [table_ptr2], 8         // Load Tbl_lo from index table
1116 ;;
1117       ldfe Q_3 = [table_ptr1], -16          // Load Q_3
1118       nop.i 999
1119 }
1120 ;;
1121
1122 { .mmi
1123       ldfe Q_2 = [table_ptr1], -16          // Load Q_2
1124       nop.m 999
1125       nop.i 999
1126 }
1127 ;;
1128
1129 { .mmf
1130       ldfe Q_1 = [table_ptr1], -16          // Load Q_1
1131       nop.m 999
1132       nop.f 999
1133 }
1134 ;;
1135
1136 { .mfi
1137       nop.m 999
1138       fma.s1 U_prime_hi = V, z_hi, U        // U_prime_hi = U + V * z_hi
1139       nop.i 999
1140 }
1141 { .mfi
1142       nop.m 999
1143       fnma.s1 V_prime = U, z_hi, V          // V_prime =  V - U * z_hi
1144       nop.i 999
1145 }
1146 ;;
1147
1148 { .mfi
1149       nop.m 999
1150       mov A_hi = Tbl_hi                     // Start with A_hi = Tbl_hi
1151       nop.i 999
1152 }
1153 ;;
1154
1155 { .mfi
1156       nop.m 999
1157       fsub.s1 U_hold = U, U_prime_hi        // U_hold = U - U_prime_hi
1158       nop.i 999
1159 }
1160 ;;
1161
1162 { .mfi
1163       nop.m 999
1164       frcpa.s1 C_hi, p0 = f1, U_prime_hi    // C_hi = frcpa(1,U_prime_hi)
1165       nop.i 999
1166 }
1167 ;;
1168
1169 { .mfi
1170       nop.m 999
1171       fmpy.s1 A_hi = s_Y, A_hi              // A_hi = s_Y * A_hi
1172       nop.i 999
1173 }
1174 ;;
1175
1176 { .mfi
1177       nop.m 999
1178       fma.s1 U_prime_lo = z_hi, V, U_hold   // U_prime_lo =  U_hold + V * z_hi
1179       nop.i 999
1180 }
1181 ;;
1182
1183 //     C_hi_hold = 1 - C_hi * U_prime_hi (1)
1184 { .mfi
1185       nop.m 999
1186       fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1187       nop.i 999
1188 }
1189 ;;
1190
1191 { .mfi
1192       nop.m 999
1193       fma.s1 Res_hi = sigma, A_hi, P_hi   // Res_hi = P_hi + sigma * A_hi
1194       nop.i 999
1195 }
1196 ;;
1197
1198 { .mfi
1199       nop.m 999
1200       fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (1)
1201       nop.i 999
1202 }
1203 ;;
1204
1205 //     C_hi_hold = 1 - C_hi * U_prime_hi (2)
1206 { .mfi
1207       nop.m 999
1208       fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1209       nop.i 999
1210 }
1211 ;;
1212
1213 { .mfi
1214       nop.m 999
1215       fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (2)
1216       nop.i 999
1217 }
1218 ;;
1219
1220 //     C_hi_hold = 1 - C_hi * U_prime_hi (3)
1221 { .mfi
1222       nop.m 999
1223       fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1224       nop.i 999
1225 }
1226 ;;
1227
1228 { .mfi
1229       nop.m 999
1230       fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (3)
1231       nop.i 999
1232 }
1233 ;;
1234
1235 { .mfi
1236       nop.m 999
1237       fmpy.s1 w_hi = V_prime, C_hi           // w_hi = V_prime * C_hi
1238       nop.i 999
1239 }
1240 ;;
1241
1242 { .mfi
1243       nop.m 999
1244       fmpy.s1 wsq = w_hi, w_hi               // wsq = w_hi * w_hi
1245       nop.i 999
1246 }
1247 { .mfi
1248       nop.m 999
1249       fnma.s1 w_lo = w_hi, U_prime_hi, V_prime // w_lo = V_prime-w_hi*U_prime_hi
1250       nop.i 999
1251 }
1252 ;;
1253
1254 { .mfi
1255       nop.m 999
1256       fma.s1 poly =  wsq, Q_4, Q_3           // poly = Q_3 + wsq * Q_4
1257       nop.i 999
1258 }
1259 { .mfi
1260       nop.m 999
1261       fnma.s1 w_lo = w_hi, U_prime_lo, w_lo  // w_lo = w_lo - w_hi * U_prime_lo
1262       nop.i 999
1263 }
1264 ;;
1265
1266 { .mfi
1267       nop.m 999
1268       fma.s1 poly = wsq, poly, Q_2           // poly = Q_2 + wsq * poly
1269       nop.i 999
1270 }
1271 { .mfi
1272       nop.m 999
1273       fmpy.s1 w_lo = C_hi, w_lo              // w_lo =  = w_lo * C_hi
1274       nop.i 999
1275 }
1276 ;;
1277
1278 { .mfi
1279       nop.m 999
1280       fma.s1 poly = wsq, poly, Q_1           // poly = Q_1 + wsq * poly
1281       nop.i 999
1282 }
1283 { .mfi
1284       nop.m 999
1285       fadd.s1 A_lo = Tbl_lo, w_lo            // A_lo = Tbl_lo + w_lo
1286       nop.i 999
1287 }
1288 ;;
1289
1290 { .mfi
1291       nop.m 999
1292       fmpy.s0 Q_1 =  Q_1, Q_1                // Dummy operation to raise inexact
1293       nop.i 999
1294 }
1295 ;;
1296
1297 { .mfi
1298       nop.m 999
1299       fmpy.s1 poly = wsq, poly               // poly = wsq * poly
1300       nop.i 999
1301 }
1302 ;;
1303
1304 { .mfi
1305       nop.m 999
1306       fmpy.s1 poly = w_hi, poly              // poly = w_hi * poly
1307       nop.i 999
1308 }
1309 ;;
1310
1311 { .mfi
1312       nop.m 999
1313       fadd.s1 A_lo = A_lo, poly              // A_lo = A_lo + poly
1314       nop.i 999
1315 }
1316 ;;
1317
1318 { .mfi
1319       nop.m 999
1320       fadd.s1 A_lo = A_lo, w_hi              // A_lo = A_lo + w_hi
1321       nop.i 999
1322 }
1323 ;;
1324
1325 { .mfi
1326       nop.m 999
1327       fma.s1 Res_lo = sigma, A_lo, P_lo      // Res_lo = P_lo + sigma * A_lo
1328       nop.i 999
1329 }
1330 ;;
1331
1332 //
1333 //     Result  =  Res_hi + Res_lo * s_Y  (User Supplied Rounding Mode)
1334 //
1335 { .mfb
1336       nop.m 999
1337       fma.s0 Result = Res_lo, s_Y, Res_hi
1338       br.ret.sptk   b0                        // Exit table path 2^-3 <= V/U < 1
1339 }
1340 ;;
1341
1342
1343 ATANL_POLY:
1344 // Here if 0 < V/U < 2^-3
1345 //
1346 // ***********************************************
1347 // ******************** STEP4 ********************
1348 // ***********************************************
1349
1350 //
1351 //     Following:
1352 //     Iterate 3 times E = E + E*(1.0 - E*U)
1353 //     Also load P_8, P_7, P_6, P_5, P_4
1354 //
1355 { .mfi
1356       ldfe P_8 = [table_ptr1], -16            // Load P_8
1357       fnma.s1 z_lo = A_temp, U, V             // z_lo = V - A_temp * U
1358       nop.i 999
1359 }
1360 { .mfi
1361       nop.m 999
1362       fnma.s1 E_hold = E, U, f1               // E_hold = 1.0 - E*U (2)
1363       nop.i 999
1364 }
1365 ;;
1366
1367 { .mmi
1368       ldfe P_7 = [table_ptr1], -16            // Load P_7
1369 ;;
1370       ldfe P_6 = [table_ptr1], -16            // Load P_6
1371       nop.i 999
1372 }
1373 ;;
1374
1375 { .mfi
1376       ldfe P_5 = [table_ptr1], -16            // Load P_5
1377       fma.s1 E = E, E_hold, E                 // E = E + E_hold*E (2)
1378       nop.i 999
1379 }
1380 ;;
1381
1382 { .mmi
1383       ldfe P_4 = [table_ptr1], -16            // Load P_4
1384 ;;
1385       ldfe P_3 = [table_ptr1], -16            // Load P_3
1386       nop.i 999
1387 }
1388 ;;
1389
1390 { .mfi
1391       ldfe P_2 = [table_ptr1], -16            // Load P_2
1392       fnma.s1 E_hold = E, U, f1               // E_hold = 1.0 - E*U (3)
1393       nop.i 999
1394 }
1395 { .mlx
1396       nop.m 999
1397       movl         int_temp = 0x24005         // Signexp for small neg number
1398 }
1399 ;;
1400
1401 { .mmf
1402       ldfe P_1 = [table_ptr1], -16            // Load P_1
1403       setf.exp     tmp_small = int_temp       // Form small neg number
1404       fma.s1 E = E, E_hold, E                 // E = E + E_hold*E (3)
1405 }
1406 ;;
1407
1408 //
1409 //
1410 // At this point E approximates 1/U to roughly working precision
1411 // Z = V*E approximates V/U
1412 //
1413 { .mfi
1414       nop.m 999
1415       fmpy.s1 Z = V, E                         // Z = V * E
1416       nop.i 999
1417 }
1418 { .mfi
1419       nop.m 999
1420       fmpy.s1 z_lo = z_lo, E                   // z_lo = z_lo * E
1421       nop.i 999
1422 }
1423 ;;
1424
1425 //
1426 //     Now what we want to do is
1427 //     poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
1428 //     poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3))
1429 //
1430 //
1431 //     Fixup added to force inexact later -
1432 //     A_hi = A_temp + z_lo
1433 //     z_lo = (A_temp - A_hi) + z_lo
1434 //
1435 { .mfi
1436       nop.m 999
1437       fmpy.s1 zsq = Z, Z                        // zsq = Z * Z
1438       nop.i 999
1439 }
1440 { .mfi
1441       nop.m 999
1442       fadd.s1 A_hi = A_temp, z_lo               // A_hi = A_temp + z_lo
1443       nop.i 999
1444 }
1445 ;;
1446
1447 { .mfi
1448       nop.m 999
1449       fma.s1 poly1 = zsq, P_8, P_7              // poly1 = P_7 + zsq * P_8
1450       nop.i 999
1451 }
1452 { .mfi
1453       nop.m 999
1454       fma.s1 poly2 = zsq, P_3, P_2              // poly2 = P_2 + zsq * P_3
1455       nop.i 999
1456 }
1457 ;;
1458
1459 { .mfi
1460       nop.m 999
1461       fmpy.s1 z4 = zsq, zsq                     // z4 = zsq * zsq
1462       nop.i 999
1463 }
1464 { .mfi
1465       nop.m 999
1466       fsub.s1 A_temp = A_temp, A_hi             // A_temp = A_temp - A_hi
1467       nop.i 999
1468 }
1469 ;;
1470
1471 { .mfi
1472       nop.m 999
1473       fmerge.s     tmp = A_hi, A_hi             // Copy tmp = A_hi
1474       nop.i 999
1475 }
1476 ;;
1477
1478 { .mfi
1479       nop.m 999
1480       fma.s1 poly1 = zsq, poly1, P_6            // poly1 = P_6 + zsq * poly1
1481       nop.i 999
1482 }
1483 { .mfi
1484       nop.m 999
1485       fma.s1 poly2 = zsq, poly2, P_1            // poly2 = P_2 + zsq * poly2
1486       nop.i 999
1487 }
1488 ;;
1489
1490 { .mfi
1491       nop.m 999
1492       fmpy.s1 z8 = z4, z4                       // z8 = z4 * z4
1493       nop.i 999
1494 }
1495 { .mfi
1496       nop.m 999
1497       fadd.s1 z_lo = A_temp, z_lo               // z_lo = (A_temp - A_hi) + z_lo
1498       nop.i 999
1499 }
1500 ;;
1501
1502 { .mfi
1503       nop.m 999
1504       fma.s1 poly1 = zsq, poly1, P_5            // poly1 = P_5 + zsq * poly1
1505       nop.i 999
1506 }
1507 { .mfi
1508       nop.m 999
1509       fmpy.s1 poly2 = poly2, zsq                // poly2 = zsq * poly2
1510       nop.i 999
1511 }
1512 ;;
1513
1514 //     Create small GR double in case need to raise underflow
1515 { .mfi
1516       nop.m 999
1517       fma.s1 poly1 = zsq, poly1, P_4            // poly1 = P_4 + zsq * poly1
1518       dep GR_temp = -1,r0,0,53
1519 }
1520 ;;
1521
1522 //     Create small double in case need to raise underflow
1523 { .mfi
1524       setf.d FR_temp = GR_temp
1525       fma.s1 poly = z8, poly1, poly2            // poly = poly2 + z8 * poly1
1526       nop.i 999
1527 }
1528 ;;
1529
1530 { .mfi
1531       nop.m 999
1532       fma.s1 A_lo = Z, poly, z_lo               // A_lo = z_lo + Z * poly
1533       nop.i 999
1534 }
1535 ;;
1536
1537 { .mfi
1538       nop.m 999
1539       fadd.s1      A_hi = tmp, A_lo             // A_hi = tmp + A_lo
1540       nop.i 999
1541 }
1542 ;;
1543
1544 { .mfi
1545       nop.m 999
1546       fsub.s1      tmp = tmp, A_hi              // tmp = tmp - A_hi
1547       nop.i 999
1548 }
1549 { .mfi
1550       nop.m 999
1551       fmpy.s1 A_hi = s_Y, A_hi                  // A_hi = s_Y * A_hi
1552       nop.i 999
1553 }
1554 ;;
1555
1556 { .mfi
1557       nop.m 999
1558       fadd.s1      A_lo = tmp, A_lo             // A_lo = tmp + A_lo
1559       nop.i 999
1560 }
1561 { .mfi
1562       nop.m 999
1563       fma.s1 Res_hi = sigma, A_hi, P_hi         // Res_hi = P_hi + sigma * A_hi
1564       nop.i 999
1565 }
1566 ;;
1567
1568 { .mfi
1569       nop.m 999
1570       fsub.s1 tmp =  P_hi, Res_hi               // tmp = P_hi - Res_hi
1571       nop.i 999
1572 }
1573 ;;
1574
1575 //
1576 //     Test if A_lo is zero
1577 //
1578 { .mfi
1579       nop.m 999
1580       fclass.m p6,p0 = A_lo, 0x007              // Test A_lo = 0
1581       nop.i 999
1582 }
1583 ;;
1584
1585 { .mfi
1586       nop.m 999
1587 (p6)  mov          A_lo = tmp_small             // If A_lo zero, make very small
1588       nop.i 999
1589 }
1590 ;;
1591
1592 { .mfi
1593       nop.m 999
1594       fma.s1 tmp = A_hi, sigma, tmp             // tmp = sigma * A_hi  + tmp
1595       nop.i 999
1596 }
1597 { .mfi
1598       nop.m 999
1599       fma.s1 sigma =  A_lo, sigma, P_lo         // sigma = A_lo * sigma  + P_lo
1600       nop.i 999
1601 }
1602 ;;
1603
1604 { .mfi
1605       nop.m 999
1606       fma.s1 Res_lo = s_Y, sigma, tmp           // Res_lo = s_Y * sigma + tmp
1607       nop.i 999
1608 }
1609 ;;
1610
1611 //
1612 //     Test if Res_lo is denormal
1613 //
1614 { .mfi
1615       nop.m 999
1616       fclass.m p14, p15 = Res_lo, 0x0b
1617       nop.i 999
1618 }
1619 ;;
1620
1621 //
1622 //     Compute Result = Res_lo + Res_hi.  Use s3 if Res_lo is denormal.
1623 //
1624 { .mfi
1625       nop.m 999
1626 (p14) fadd.s3 Result = Res_lo, Res_hi     // Result for Res_lo denormal
1627       nop.i 999
1628 }
1629 { .mfi
1630       nop.m 999
1631 (p15) fadd.s0 Result = Res_lo, Res_hi     // Result for Res_lo normal
1632       nop.i 999
1633 }
1634 ;;
1635
1636 //
1637 //     If Res_lo is denormal test if Result equals zero
1638 //
1639 { .mfi
1640       nop.m 999
1641 (p14) fclass.m.unc p14, p0 = Result, 0x07
1642       nop.i 999
1643 }
1644 ;;
1645
1646 //
1647 //     If Res_lo is denormal and Result equals zero, raise inexact, underflow
1648 //     by squaring small double
1649 //
1650 { .mfb
1651       nop.m 999
1652 (p14) fmpy.d.s0 FR_temp = FR_temp, FR_temp
1653       br.ret.sptk   b0                     // Exit POLY path, 0 < Q < 2^-3
1654 }
1655 ;;
1656
1657
1658 ATANL_UNSUPPORTED:
1659 { .mfb
1660       nop.m 999
1661       fmpy.s0 Result = ArgX,ArgY
1662       br.ret.sptk   b0
1663 }
1664 ;;
1665
1666 // Here if y natval, nan, inf, zero
1667 ATANL_Y_SPECIAL:
1668 // Here if x natval, nan, inf, zero
1669 ATANL_X_SPECIAL:
1670 { .mfi
1671       nop.m 999
1672       fclass.m p13,p12 = ArgY_orig, 0x0c3  // Test y nan
1673       nop.i 999
1674 }
1675 ;;
1676
1677 { .mfi
1678       nop.m 999
1679       fclass.m p15,p14 = ArgY_orig, 0x103  // Test y natval
1680       nop.i 999
1681 }
1682 ;;
1683
1684 { .mfi
1685       nop.m 999
1686 (p12) fclass.m p13,p0 = ArgX_orig, 0x0c3  // Test x nan
1687       nop.i 999
1688 }
1689 ;;
1690
1691 { .mfi
1692       nop.m 999
1693 (p14) fclass.m p15,p0 = ArgX_orig, 0x103  // Test x natval
1694       nop.i 999
1695 }
1696 ;;
1697
1698 { .mfb
1699       nop.m 999
1700 (p13) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result nan if x or y nan
1701 (p13) br.ret.spnt b0                      // Exit if x or y nan
1702 }
1703 ;;
1704
1705 { .mfb
1706       nop.m 999
1707 (p15) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result natval if x or y natval
1708 (p15) br.ret.spnt b0                      // Exit if x or y natval
1709 }
1710 ;;
1711
1712
1713 // Here if x or y inf or zero
1714 ATANL_SPECIAL_HANDLING:
1715 { .mfi
1716       nop.m 999
1717       fclass.m p6, p7 = ArgY_orig, 0x007        // Test y zero
1718       mov special = 992                         // Offset to table
1719 }
1720 ;;
1721
1722 { .mfb
1723       add table_ptr1 = table_base, special      // Point to 3pi/4
1724       fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig  // Dummy to set denormal flag
1725 (p7)  br.cond.spnt ATANL_ArgY_Not_ZERO          // Branch if y not zero
1726 }
1727 ;;
1728
1729 // Here if y zero
1730 { .mmf
1731       ldfd  Result = [table_ptr1], 8            // Get pi high
1732       nop.m 999
1733       fclass.m p14, p0 = ArgX, 0x035            // Test for x>=+0
1734 }
1735 ;;
1736
1737 { .mmf
1738       nop.m 999
1739       ldfd  Result_lo = [table_ptr1], -8        // Get pi lo
1740       fclass.m p15, p0 = ArgX, 0x036            // Test for x<=-0
1741 }
1742 ;;
1743
1744 //
1745 //     Return sign_Y * 0 when  ArgX > +0
1746 //
1747 { .mfi
1748       nop.m 999
1749 (p14) fmerge.s Result = ArgY, f0               // If x>=+0, y=0, hi sgn(y)*0
1750       nop.i 999
1751 }
1752 ;;
1753
1754 { .mfi
1755       nop.m 999
1756       fclass.m p13, p0 = ArgX, 0x007           // Test for x=0
1757       nop.i 999
1758 }
1759 ;;
1760
1761 { .mfi
1762       nop.m 999
1763 (p14) fmerge.s Result_lo = ArgY, f0            // If x>=+0, y=0, lo sgn(y)*0
1764       nop.i 999
1765 }
1766 ;;
1767
1768 { .mfi
1769 (p13) mov GR_Parameter_TAG = 36                // Error tag for x=0, y=0
1770       nop.f 999
1771       nop.i 999
1772 }
1773 ;;
1774
1775 //
1776 //     Return sign_Y * pi when  ArgX < -0
1777 //
1778 { .mfi
1779       nop.m 999
1780 (p15) fmerge.s Result = ArgY, Result           // If x<0, y=0, hi=sgn(y)*pi
1781       nop.i 999
1782 }
1783 ;;
1784
1785 { .mfi
1786       nop.m 999
1787 (p15) fmerge.s Result_lo = ArgY, Result_lo     // If x<0, y=0, lo=sgn(y)*pi
1788       nop.i 999
1789 }
1790 ;;
1791
1792 //
1793 //     Call error support function for atan(0,0)
1794 //
1795 { .mfb
1796       nop.m 999
1797       fadd.s0 Result = Result, Result_lo
1798 (p13) br.cond.spnt __libm_error_region         // Branch if atan(0,0)
1799 }
1800 ;;
1801
1802 { .mib
1803       nop.m 999
1804       nop.i 999
1805       br.ret.sptk   b0                         // Exit for y=0, x not 0
1806 }
1807 ;;
1808
1809 // Here if y not zero
1810 ATANL_ArgY_Not_ZERO:
1811 { .mfi
1812       nop.m 999
1813       fclass.m p0, p10 = ArgY, 0x023           // Test y inf
1814       nop.i 999
1815 }
1816 ;;
1817
1818 { .mfb
1819       nop.m 999
1820       fclass.m p6, p0 = ArgX, 0x017            // Test for 0 <= |x| < inf
1821 (p10) br.cond.spnt  ATANL_ArgY_Not_INF         // Branch if 0 < |y| < inf
1822 }
1823 ;;
1824
1825 // Here if y=inf
1826 //
1827 //     Return +PI/2 when ArgY = +Inf and ArgX = +/-0 or normal
1828 //     Return -PI/2 when ArgY = -Inf and ArgX = +/-0 or normal
1829 //     Return +PI/4 when ArgY = +Inf and ArgX = +Inf
1830 //     Return -PI/4 when ArgY = -Inf and ArgX = +Inf
1831 //     Return +3PI/4 when ArgY = +Inf and ArgX = -Inf
1832 //     Return -3PI/4 when ArgY = -Inf and ArgX = -Inf
1833 //
1834 { .mfi
1835       nop.m 999
1836       fclass.m p7, p0 = ArgX, 0x021            // Test for x=+inf
1837       nop.i 999
1838 }
1839 ;;
1840
1841 { .mfi
1842 (p6)  add table_ptr1 =  16, table_ptr1         // Point to pi/2, if x finite
1843       fclass.m p8, p0 = ArgX, 0x022            // Test for x=-inf
1844       nop.i 999
1845 }
1846 ;;
1847
1848 { .mmi
1849 (p7)  add table_ptr1 =  32, table_ptr1         // Point to pi/4 if x=+inf
1850 ;;
1851 (p8)  add table_ptr1 =  48, table_ptr1         // Point to 3pi/4 if x=-inf
1852
1853       nop.i 999
1854 }
1855 ;;
1856
1857 { .mmi
1858       ldfd Result = [table_ptr1], 8            // Load pi/2, pi/4, or 3pi/4 hi
1859 ;;
1860       ldfd Result_lo = [table_ptr1], -8        // Load pi/2, pi/4, or 3pi/4 lo
1861       nop.i 999
1862 }
1863 ;;
1864
1865 { .mfi
1866       nop.m 999
1867       fmerge.s Result = ArgY, Result           // Merge sgn(y) in hi
1868       nop.i 999
1869 }
1870 ;;
1871
1872 { .mfi
1873       nop.m 999
1874       fmerge.s Result_lo = ArgY, Result_lo     // Merge sgn(y) in lo
1875       nop.i 999
1876 }
1877 ;;
1878
1879 { .mfb
1880       nop.m 999
1881       fadd.s0 Result = Result, Result_lo       // Compute complete result
1882       br.ret.sptk   b0                         // Exit for y=inf
1883 }
1884 ;;
1885
1886 // Here if y not INF, and x=0 or INF
1887 ATANL_ArgY_Not_INF:
1888 //
1889 //     Return +PI/2 when ArgY NOT Inf, ArgY > 0 and ArgX = +/-0
1890 //     Return -PI/2 when ArgY NOT Inf, ArgY < 0 and ArgX = +/-0
1891 //     Return +0    when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf
1892 //     Return -0    when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf
1893 //     Return +PI   when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf
1894 //     Return -PI   when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf
1895 //
1896 { .mfi
1897       nop.m 999
1898       fclass.m p7, p9 = ArgX, 0x021            // Test for x=+inf
1899       nop.i 999
1900 }
1901 ;;
1902
1903 { .mfi
1904       nop.m 999
1905       fclass.m p6, p0 = ArgX, 0x007            // Test for x=0
1906       nop.i 999
1907 }
1908 ;;
1909
1910 { .mfi
1911 (p6)  add table_ptr1 = 16, table_ptr1          // Point to pi/2
1912       fclass.m p8, p0 = ArgX, 0x022            // Test for x=-inf
1913       nop.i 999
1914 }
1915 ;;
1916
1917 .pred.rel "mutex",p7,p9
1918 { .mfi
1919 (p9)  ldfd Result = [table_ptr1], 8           // Load pi or pi/2 hi
1920 (p7)  fmerge.s Result = ArgY, f0              // If y not inf, x=+inf, sgn(y)*0
1921       nop.i 999
1922 }
1923 ;;
1924
1925 { .mfi
1926 (p9)  ldfd Result_lo = [table_ptr1], -8       // Load pi or pi/2 lo
1927 (p7)  fnorm.s0 Result = Result                // If y not inf, x=+inf normalize
1928       nop.i 999
1929 }
1930 ;;
1931
1932 { .mfi
1933       nop.m 999
1934 (p9)  fmerge.s Result = ArgY, Result          // Merge sgn(y) in hi
1935       nop.i 999
1936 }
1937 ;;
1938
1939 { .mfi
1940       nop.m 999
1941 (p9)  fmerge.s Result_lo = ArgY, Result_lo    // Merge sgn(y) in lo
1942       nop.i 999
1943 }
1944 ;;
1945
1946 { .mfb
1947       nop.m 999
1948 (p9)  fadd.s0 Result = Result, Result_lo      // Compute complete result
1949       br.ret.spnt   b0                        // Exit for y not inf, x=0,inf
1950 }
1951 ;;
1952
1953 GLOBAL_IEEE754_END(atan2l)
1954 LOCAL_LIBM_ENTRY(__libm_error_region)
1955 .prologue
1956 { .mfi
1957         add   GR_Parameter_Y=-32,sp             // Parameter 2 value
1958         nop.f 0
1959 .save   ar.pfs,GR_SAVE_PFS
1960         mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
1961 }
1962 { .mfi
1963 .fframe 64
1964         add sp=-64,sp                           // Create new stack
1965         nop.f 0
1966         mov GR_SAVE_GP=gp                       // Save gp
1967 };;
1968 { .mmi
1969         stfe [GR_Parameter_Y] = FR_Y,16         // Save Parameter 2 on stack
1970         add GR_Parameter_X = 16,sp              // Parameter 1 address
1971 .save   b0, GR_SAVE_B0
1972         mov GR_SAVE_B0=b0                       // Save b0
1973 };;
1974 .body
1975 { .mib
1976         stfe [GR_Parameter_X] = FR_X            // Store Parameter 1 on stack
1977         add   GR_Parameter_RESULT = 0,GR_Parameter_Y
1978         nop.b 0                                 // Parameter 3 address
1979 }
1980 { .mib
1981         stfe [GR_Parameter_Y] = FR_RESULT      // Store Parameter 3 on stack
1982         add   GR_Parameter_Y = -16,GR_Parameter_Y
1983         br.call.sptk b0=__libm_error_support#  // Call error handling function
1984 };;
1985 { .mmi
1986         nop.m 0
1987         nop.m 0
1988         add   GR_Parameter_RESULT = 48,sp
1989 };;
1990 { .mmi
1991         ldfe  f8 = [GR_Parameter_RESULT]       // Get return result off stack
1992 .restore sp
1993         add   sp = 64,sp                       // Restore stack pointer
1994         mov   b0 = GR_SAVE_B0                  // Restore return address
1995 };;
1996 { .mib
1997         mov   gp = GR_SAVE_GP                  // Restore gp
1998         mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
1999         br.ret.sptk     b0                     // Return
2000 };;
2001
2002 LOCAL_LIBM_END(__libm_error_region#)
2003 .type   __libm_error_support#,@function
2004 .global __libm_error_support#