sysdeps/ia64/fpu/s_atanl.S

   1 .file "atanl.s"
   2
   3
   4 // Copyright (c) 2000 - 2005, Intel Corporation
   5 // All rights reserved.
   6 //
   7 // Contributed 2000 by the Intel Numerics Group, Intel Corporation
   8 //
   9 // Redistribution and use in source and binary forms, with or without
  10 // modification, are permitted provided that the following conditions are
  11 // met:
  12 //
  13 // * Redistributions of source code must retain the above copyright
  14 // notice, this list of conditions and the following disclaimer.
  15 //
  16 // * Redistributions in binary form must reproduce the above copyright
  17 // notice, this list of conditions and the following disclaimer in the
  18 // documentation and/or other materials provided with the distribution.
  19 //
  20 // * The name of Intel Corporation may not be used to endorse or promote
  21 // products derived from this software without specific prior written
  22 // permission.
  23
  24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35 //
  36 // Intel Corporation is the author of this code, and requests that all
  37 // problem reports or change requests be submitted to it directly at
  38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
  39 //
  40 //
  41 //*********************************************************************
  42 //
  43 // History
  44 // 02/02/00 (hand-optimized)
  45 // 04/04/00 Unwind support added
  46 // 08/15/00 Bundle added after call to __libm_error_support to properly
  47 //          set [the previously overwritten] GR_Parameter_RESULT.
  48 // 03/13/01 Fixed flags when denormal raised on intermediate result
  49 // 01/08/02 Improved speed.
  50 // 02/06/02 Corrected .section statement
  51 // 05/20/02 Cleaned up namespace and sf0 syntax
  52 // 02/10/03 Reordered header: .section, .global, .proc, .align;
  53 //          used data8 for long double table values
  54 // 03/31/05 Reformatted delimiters between data tables
  55 //
  56 //*********************************************************************
  57 //
  58 // Function:   atanl(x) = inverse tangent(x), for double extended x values
  59 // Function:   atan2l(y,x) = atan(y/x), for double extended y, x values
  60 //
  61 // API
  62 //
  63 //  long double atanl  (long double x)
  64 //  long double atan2l (long double y, long double x)
  65 //
  66 //*********************************************************************
  67 //
  68 // Resources Used:
  69 //
  70 //    Floating-Point Registers: f8 (Input and Return Value)
  71 //                              f9 (Input for atan2l)
  72 //                              f10-f15, f32-f83
  73 //
  74 //    General Purpose Registers:
  75 //      r32-r51
  76 //      r49-r52 (Arguments to error support for 0,0 case)
  77 //
  78 //    Predicate Registers:      p6-p15
  79 //
  80 //*********************************************************************
  81 //
  82 // IEEE Special Conditions:
  83 //
  84 //    Denormal fault raised on denormal inputs
  85 //    Underflow exceptions may occur
  86 //    Special error handling for the y=0 and x=0 case
  87 //    Inexact raised when appropriate by algorithm
  88 //
  89 //    atanl(SNaN) = QNaN
  90 //    atanl(QNaN) = QNaN
  91 //    atanl(+/-0) = +/- 0
  92 //    atanl(+/-Inf) = +/-pi/2
  93 //
  94 //    atan2l(Any NaN for x or y) = QNaN
  95 //    atan2l(+/-0,x) = +/-0 for x > 0
  96 //    atan2l(+/-0,x) = +/-pi for x < 0
  97 //    atan2l(+/-0,+0) = +/-0
  98 //    atan2l(+/-0,-0) = +/-pi
  99 //    atan2l(y,+/-0) = pi/2 y > 0
 100 //    atan2l(y,+/-0) = -pi/2 y < 0
 101 //    atan2l(+/-y, Inf) = +/-0 for finite y > 0
 102 //    atan2l(+/-Inf, x) = +/-pi/2 for finite x
 103 //    atan2l(+/-y, -Inf) = +/-pi for finite  y > 0
 104 //    atan2l(+/-Inf, Inf) = +/-pi/4
 105 //    atan2l(+/-Inf, -Inf) = +/-3pi/4
 106 //
 107 //*********************************************************************
 108 //
 109 // Mathematical Description
 110 // ---------------------------
 111 //
 112 // The function ATANL( Arg_Y, Arg_X ) returns the "argument"
 113 // or the "phase" of the complex number
 114 //
 115 //           Arg_X + i Arg_Y
 116 //
 117 // or equivalently, the angle in radians from the positive
 118 // x-axis to the line joining the origin and the point
 119 // (Arg_X,Arg_Y)
 120 //
 121 //
 122 //        (Arg_X, Arg_Y) x
 123 //                        \
 124 //                \
 125 //                 \
 126 //                  \
 127 //                   \ angle between is ATANL(Arg_Y,Arg_X)
 128
 129
 130
 131
 132 //                    \
 133 //                     ------------------> X-axis
 134
 135 //                   Origin
 136 //
 137 // Moreover, this angle is reported in the range [-pi,pi] thus
 138 //
 139 //      -pi <= ATANL( Arg_Y, Arg_X ) <= pi.
 140 //
 141 // From the geometry, it is easy to define ATANL when one of
 142 // Arg_X or Arg_Y is +-0 or +-inf:
 143 //
 144 //
 145 //      \ Y |
 146 //     X \  |  +0  | -0  |  +inf |  -inf  |  finite non-zero
 147 //        \ |      |     |       |        |
 148 //    ______________________________________________________
 149 //          |            |       |        |
 150 //     +-0  |   Invalid/ |  pi/2 | -pi/2  |  sign(Y)*pi/2
 151 //          |    qNaN    |       |        |
 152 //  --------------------------------------------------------
 153 //          |      |     |       |        |
 154 //     +inf |  +0  | -0  |  pi/4 | -pi/4  |  sign(Y)*0
 155 //  --------------------------------------------------------
 156 //          |      |     |       |        |
 157 //     -inf |  +pi | -pi | 3pi/4 | -3pi/4 |  sign(Y)*pi
 158 //  --------------------------------------------------------
 159 //   finite |    X>0?    |  pi/2 | -pi/2  |  normal case
 160 //  non-zero| sign(Y)*0: |       |        |
 161 //       | sign(Y)*pi |       |        |
 162 //
 163 //
 164 // One must take note that ATANL is NOT the arctangent of the
 165 // value Arg_Y/Arg_X; but rather ATANL and arctan are related
 166 // in a slightly more complicated way as follows:
 167 //
 168 // Let U := max(|Arg_X|, |Arg_Y|);  V := min(|Arg_X|, |Arg_Y|);
 169 // sign_X be the sign bit of Arg_X, i.e., sign_X is 0 or 1;
 170 // s_X    be the sign     of Arg_X, i.e., s_X = (-1)^sign_X;
 171 //
 172 // sign_Y be the sign bit of Arg_Y, i.e., sign_Y is 0 or 1;
 173 // s_Y    be the sign     of Arg_Y, i.e., s_Y = (-1)^sign_Y;
 174 //
 175 // swap   be 0  if |Arg_X| >= |Arg_Y|  and 1 otherwise.
 176 //
 177 // Then, ATANL(Arg_Y, Arg_X) =
 178 //
 179 //       /    arctan(V/U)     \      sign_X = 0 & swap = 0
 180 //       | pi/2 - arctan(V/U) |      sign_X = 0 & swap = 1
 181 // s_Y * |                    |
 182 //       |  pi  - arctan(V/U) |      sign_X = 1 & swap = 0
 183 //       \ pi/2 + arctan(V/U) /      sign_X = 1 & swap = 1
 184 //
 185 //
 186 // This relationship also suggest that the algorithm's major
 187 // task is to calculate arctan(V/U) for 0 < V <= U; and the
 188 // final Result is given by
 189 //
 190 //      s_Y * { (P_hi + P_lo) + sigma * arctan(V/U) }
 191 //
 192 // where
 193 //
 194 //   (P_hi,P_lo) represents M(sign_X,swap)*(pi/2) accurately
 195 //
 196 //   M(sign_X,swap) = 0  for sign_X = 0 and swap = 0
 197 //              1  for swap   = 1
 198 //              2  for sign_X = 1 and swap = 0
 199 //
 200 // and
 201 //
 202 //   sigma = { (sign_X  XOR  swap) :  -1.0 : 1.0 }
 203 //
 204 //      =  (-1) ^ ( sign_X XOR swap )
 205 //
 206 // Both (P_hi,P_lo) and sigma can be stored in a table and fetched
 207 // using (sign_X,swap) as an index. (P_hi, P_lo) can be stored as a
 208 // double-precision, and single-precision pair; and sigma can
 209 // obviously be just a single-precision number.
 210 //
 211 // In the algorithm we propose, arctan(V/U) is calculated to high accuracy
 212 // as A_hi + A_lo. Consequently, the Result ATANL( Arg_Y, Arg_X ) is
 213 // given by
 214 //
 215 //    s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
 216 //
 217 // We now discuss the calculation of arctan(V/U) for 0 < V <= U.
 218 //
 219 // For (V/U) < 2^(-3), we use a simple polynomial of the form
 220 //
 221 //      z + z^3*(P_1 + z^2*(P_2 + z^2*(P_3 + ... + P_8)))
 222 //
 223 // where z = V/U.
 224 //
 225 // For the sake of accuracy, the first term "z" must approximate V/U to
 226 // extra precision. For z^3 and higher power, a working precision
 227 // approximation to V/U suffices. Thus, we obtain:
 228 //
 229 //      z_hi + z_lo = V/U  to extra precision and
 230 //      z           = V/U  to working precision
 231 //
 232 // The value arctan(V/U) is delivered as two pieces (A_hi, A_lo)
 233 //
 234 //      (A_hi,A_lo) = (z_hi, z^3*(P_1 + ... + P_8) + z_lo).
 235 //
 236 //
 237 // For 2^(-3) <= (V/U) <= 1, we use a table-driven approach.
 238 // Consider
 239 //
 240 //      (V/U) = 2^k * 1.b_1 b_2 .... b_63 b_64 b_65 ....
 241 //
 242 // Define
 243 //
 244 //       z_hi = 2^k * 1.b_1 b_2 b_3 b_4 1
 245 //
 246 // then
 247 //                                            /                \
 248 //                                            |  (V/U) - z_hi  |
 249
 250 //      arctan(V/U) = arctan(z_hi) + acrtan| -------------- |
 251 //                                            | 1 + (V/U)*z_hi |
 252 //                                            \                /
 253 //
 254 //                                            /                \
 255 //                                            |   V - z_hi*U   |
 256
 257 //                  = arctan(z_hi) + acrtan| -------------- |
 258 //                                            |   U + V*z_hi   |
 259 //                                            \                /
 260 //
 261 //                  = arctan(z_hi) + acrtan( V' / U' )
 262 //
 263 //
 264 // where
 265 //
 266 //      V' = V - U*z_hi;   U' = U + V*z_hi.
 267 //
 268 // Let
 269 //
 270 //      w_hi + w_lo  = V'/U' to extra precision and
 271 //           w       = V'/U' to working precision
 272 //
 273 // then we can approximate arctan(V'/U') by
 274 //
 275 //      arctan(V'/U') = w_hi + w_lo
 276 //                     + w^3*(Q_1 + w^2*(Q_2 + w^2*(Q_3 + w^2*Q_4)))
 277 //
 278 //                       = w_hi + w_lo + poly
 279 //
 280 // Finally, arctan(z_hi) is calculated beforehand and stored in a table
 281 // as Tbl_hi, Tbl_lo. Thus,
 282 //
 283 //      (A_hi, A_lo) = (Tbl_hi, w_hi+(poly+(w_lo+Tbl_lo)))
 284 //
 285 // This completes the mathematical description.
 286 //
 287 //
 288 // Algorithm
 289 // -------------
 290 //
 291 // Step 0. Check for unsupported format.
 292 //
 293 //    If
 294 //       ( expo(Arg_X) not zero AND msb(Arg_X) = 0 ) OR
 295 //       ( expo(Arg_Y) not zero AND msb(Arg_Y) = 0 )
 296 //
 297 //    then one of the arguments is unsupported. Generate an
 298 //    invalid and return qNaN.
 299 //
 300 // Step 1. Initialize
 301 //
 302 //    Normalize Arg_X and Arg_Y and set the following
 303 //
 304 //    sign_X :=  sign_bit(Arg_X)
 305 //    s_Y    := (sign_bit(Arg_Y)==0? 1.0 : -1.0)
 306 //    swap   := (|Arg_X| >= |Arg_Y|?   0 :  1  )
 307 //    U      := max( |Arg_X|, |Arg_Y| )
 308 //    V      := min( |Arg_X|, |Arg_Y| )
 309 //
 310 //    execute: frcpa E, pred, V, U
 311 //    If pred is 0, go to Step 5 for special cases handling.
 312 //
 313 // Step 2. Decide on branch.
 314 //
 315 //    Q := E * V
 316 //    If Q < 2^(-3) go to Step 4 for simple polynomial case.
 317 //
 318 // Step 3. Table-driven algorithm.
 319 //
 320 //    Q is represented as
 321 //
 322 //      2^(-k) * 1.b_1 b_2 b_3 ... b_63; k = 0,-1,-2,-3
 323 //
 324 // and that if k = 0, b_1 = b_2 = b_3 = b_4 = 0.
 325 //
 326 // Define
 327 //
 328 //      z_hi := 2^(-k) * 1.b_1 b_2 b_3 b_4 1
 329 //
 330 // (note that there are 49 possible values of z_hi).
 331 //
 332 //      ...We now calculate V' and U'. While V' is representable
 333 //      ...as a 64-bit number because of cancellation, U' is
 334 //      ...not in general a 64-bit number. Obtaining U' accurately
 335 //      ...requires two working precision numbers
 336 //
 337 //      U_prime_hi := U + V * z_hi            ...WP approx. to U'
 338 //      U_prime_lo := ( U - U_prime_hi ) + V*z_hi ...observe order
 339 //      V_prime    := V - U * z_hi             ...this is exact
 340 //
 341 //         C_hi := frcpa (1.0, U_prime_hi)  ...C_hi approx 1/U'_hi
 342 //
 343 //      loop 3 times
 344 //         C_hi := C_hi + C_hi*(1.0 - C_hi*U_prime_hi)
 345 //
 346 //      ...at this point C_hi is (1/U_prime_hi) to roughly 64 bits
 347 //
 348 //      w_hi := V_prime * C_hi     ...w_hi is V_prime/U_prime to
 349 //                     ...roughly working precision
 350 //
 351 //         ...note that we want w_hi + w_lo to approximate
 352 //      ...V_prime/(U_prime_hi + U_prime_lo) to extra precision
 353 //         ...but for now, w_hi is good enough for the polynomial
 354 //      ...calculation.
 355 //
 356 //         wsq  := w_hi*w_hi
 357 //      poly := w_hi*wsq*(Q_1 + wsq*(Q_2 + wsq*(Q_3 + wsq*Q_4)))
 358 //
 359 //      Fetch
 360 //      (Tbl_hi, Tbl_lo) = atan(z_hi) indexed by (k,b_1,b_2,b_3,b_4)
 361 //      ...Tbl_hi is a double-precision number
 362 //      ...Tbl_lo is a single-precision number
 363 //
 364 //         (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo)
 365 //      ...as discussed previous. Again; the implementation can
 366 //      ...chose to fetch P_hi and P_lo from a table indexed by
 367 //      ...(sign_X, swap).
 368 //      ...P_hi is a double-precision number;
 369 //      ...P_lo is a single-precision number.
 370 //
 371 //      ...calculate w_lo so that w_hi + w_lo is V'/U' accurately
 372 //         w_lo := ((V_prime - w_hi*U_prime_hi) -
 373 //              w_hi*U_prime_lo) * C_hi     ...observe order
 374 //
 375 //
 376 //      ...Ready to deliver arctan(V'/U') as A_hi, A_lo
 377 //      A_hi := Tbl_hi
 378 //      A_lo := w_hi + (poly + (Tbl_lo + w_lo)) ...observe order
 379 //
 380 //      ...Deliver final Result
 381 //      ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
 382 //
 383 //      sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 )
 384 //      ...sigma can be obtained by a table lookup using
 385 //      ...(sign_X,swap) as index and stored as single precision
 386 //         ...sigma should be calculated earlier
 387 //
 388 //      P_hi := s_Y*P_hi
 389 //      A_hi := s_Y*A_hi
 390 //
 391 //      Res_hi := P_hi + sigma*A_hi     ...this is exact because
 392 //                          ...both P_hi and Tbl_hi
 393 //                          ...are double-precision
 394 //                          ...and |Tbl_hi| > 2^(-4)
 395 //                          ...P_hi is either 0 or
 396 //                          ...between (1,4)
 397 //
 398 //      Res_lo := sigma*A_lo + P_lo
 399 //
 400 //      Return Res_hi + s_Y*Res_lo in user-defined rounding control
 401 //
 402 // Step 4. Simple polynomial case.
 403 //
 404 //    ...E and Q are inherited from Step 2.
 405 //
 406 //    A_hi := Q     ...Q is inherited from Step 2 Q approx V/U
 407 //
 408 //    loop 3 times
 409 //       E := E + E2(1.0 - E*U1
 410 //    ...at this point E approximates 1/U to roughly working precision
 411 //
 412 //    z := V * E     ...z approximates V/U to roughly working precision
 413 //    zsq := z * z
 414 //    z4 := zsq * zsq; z8 := z4 * z4
 415 //
 416 //    poly1 := P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
 417 //    poly2 := zsq*(P_1 + zsq*(P_2 + zsq*P_3))
 418 //
 419 //    poly  := poly1 + z8*poly2
 420 //
 421 //    z_lo := (V - A_hi*U)*E
 422 //
 423 //    A_lo := z*poly + z_lo
 424 //    ...A_hi, A_lo approximate arctan(V/U) accurately
 425 //
 426 //    (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo)
 427 //    ...one can store the M(sign_X,swap) as single precision
 428 //    ...values
 429 //
 430 //    ...Deliver final Result
 431 //    ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
 432 //
 433 //    sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 )
 434 //    ...sigma can be obtained by a table lookup using
 435 //    ...(sign_X,swap) as index and stored as single precision
 436 //    ...sigma should be calculated earlier
 437 //
 438 //    P_hi := s_Y*P_hi
 439 //    A_hi := s_Y*A_hi
 440 //
 441 //    Res_hi := P_hi + sigma*A_hi          ...need to compute
 442 //                          ...P_hi + sigma*A_hi
 443 //                          ...exactly
 444 //
 445 //    tmp    := (P_hi - Res_hi) + sigma*A_hi
 446 //
 447 //    Res_lo := s_Y*(sigma*A_lo + P_lo) + tmp
 448 //
 449 //    Return Res_hi + Res_lo in user-defined rounding control
 450 //
 451 // Step 5. Special Cases
 452 //
 453 //    These are detected early in the function by fclass instructions.
 454 //
 455 //    We are in one of those special cases when X or Y is 0,+-inf or NaN
 456 //
 457 //    If one of X and Y is NaN, return X+Y (which will generate
 458 //    invalid in case one is a signaling NaN). Otherwise,
 459 //    return the Result as described in the table
 460 //
 461 //
 462 //
 463 //      \ Y |
 464 //     X \  |  +0  | -0  |  +inf |  -inf  |  finite non-zero
 465 //        \ |      |     |       |        |
 466 //    ______________________________________________________
 467 //          |            |       |        |
 468 //     +-0  |   Invalid/ |  pi/2 | -pi/2  |  sign(Y)*pi/2
 469 //          |    qNaN    |       |        |
 470 //  --------------------------------------------------------
 471 //          |      |     |       |        |
 472 //     +inf |  +0  | -0  |  pi/4 | -pi/4  |  sign(Y)*0
 473 //  --------------------------------------------------------
 474 //          |      |     |       |        |
 475 //     -inf |  +pi | -pi | 3pi/4 | -3pi/4 |  sign(Y)*pi
 476 //  --------------------------------------------------------
 477 //   finite |    X>0?    |  pi/2 | -pi/2  |
 478 //  non-zero| sign(Y)*0: |       |        |      N/A
 479 //       | sign(Y)*pi |       |        |
 480 //
 481 //
 482
 483 ArgY_orig   =   f8
 484 Result      =   f8
 485 FR_RESULT   =   f8
 486 ArgX_orig   =   f9
 487 ArgX        =   f10
 488 FR_X        =   f10
 489 ArgY        =   f11
 490 FR_Y        =   f11
 491 s_Y         =   f12
 492 U           =   f13
 493 V           =   f14
 494 E           =   f15
 495 Q           =   f32
 496 z_hi        =   f33
 497 U_prime_hi  =   f34
 498 U_prime_lo  =   f35
 499 V_prime     =   f36
 500 C_hi        =   f37
 501 w_hi        =   f38
 502 w_lo        =   f39
 503 wsq         =   f40
 504 poly        =   f41
 505 Tbl_hi      =   f42
 506 Tbl_lo      =   f43
 507 P_hi        =   f44
 508 P_lo        =   f45
 509 A_hi        =   f46
 510 A_lo        =   f47
 511 sigma       =   f48
 512 Res_hi      =   f49
 513 Res_lo      =   f50
 514 Z           =   f52
 515 zsq         =   f53
 516 z4          =   f54
 517 z8          =   f54
 518 poly1       =   f55
 519 poly2       =   f56
 520 z_lo        =   f57
 521 tmp         =   f58
 522 P_1         =   f59
 523 Q_1         =   f60
 524 P_2         =   f61
 525 Q_2         =   f62
 526 P_3         =   f63
 527 Q_3         =   f64
 528 P_4         =   f65
 529 Q_4         =   f66
 530 P_5         =   f67
 531 P_6         =   f68
 532 P_7         =   f69
 533 P_8         =   f70
 534 U_hold      =   f71
 535 TWO_TO_NEG3 =   f72
 536 C_hi_hold   =   f73
 537 E_hold      =   f74
 538 M           =   f75
 539 ArgX_abs    =   f76
 540 ArgY_abs    =   f77
 541 Result_lo   =   f78
 542 A_temp      =   f79
 543 FR_temp     =   f80
 544 Xsq         =   f81
 545 Ysq         =   f82
 546 tmp_small   =   f83
 547
 548 GR_SAVE_PFS   = r33
 549 GR_SAVE_B0    = r34
 550 GR_SAVE_GP    = r35
 551 sign_X        = r36
 552 sign_Y        = r37
 553 swap          = r38
 554 table_ptr1    = r39
 555 table_ptr2    = r40
 556 k             = r41
 557 lookup        = r42
 558 exp_ArgX      = r43
 559 exp_ArgY      = r44
 560 exponent_Q    = r45
 561 significand_Q = r46
 562 special       = r47
 563 sp_exp_Q      = r48
 564 sp_exp_4sig_Q = r49
 565 table_base    = r50
 566 int_temp      = r51
 567
 568 GR_Parameter_X      = r49
 569 GR_Parameter_Y      = r50
 570 GR_Parameter_RESULT = r51
 571 GR_Parameter_TAG    = r52
 572 GR_temp             = r52
 573
 574 RODATA
 575 .align 16
 576
 577 LOCAL_OBJECT_START(Constants_atan)
 578 //       double pi/2
 579 data8 0x3FF921FB54442D18
 580 //       single lo_pi/2, two**(-3)
 581 data4 0x248D3132, 0x3E000000
 582 data8 0xAAAAAAAAAAAAAAA3, 0xBFFD // P_1
 583 data8 0xCCCCCCCCCCCC54B2, 0x3FFC // P_2
 584 data8 0x9249249247E4D0C2, 0xBFFC // P_3
 585 data8 0xE38E38E058870889, 0x3FFB // P_4
 586 data8 0xBA2E895B290149F8, 0xBFFB // P_5
 587 data8 0x9D88E6D4250F733D, 0x3FFB // P_6
 588 data8 0x884E51FFFB8745A0, 0xBFFB // P_7
 589 data8 0xE1C7412B394396BD, 0x3FFA // P_8
 590 data8 0xAAAAAAAAAAAAA52F, 0xBFFD // Q_1
 591 data8 0xCCCCCCCCC75B60D3, 0x3FFC // Q_2
 592 data8 0x924923AD011F1940, 0xBFFC // Q_3
 593 data8 0xE36F716D2A5F89BD, 0x3FFB // Q_4
 594 //
 595 //    Entries Tbl_hi  (double precision)
 596 //    B = 1+Index/16+1/32  Index = 0
 597 //    Entries Tbl_lo (single precision)
 598 //    B = 1+Index/16+1/32  Index = 0
 599 //
 600 data8 0x3FE9A000A935BD8E
 601 data4 0x23ACA08F, 0x00000000
 602 //
 603 //    Entries Tbl_hi  (double precision) Index = 0,1,...,15
 604 //    B = 2^(-1)*(1+Index/16+1/32)
 605 //    Entries Tbl_lo (single precision)
 606 //    Index = 0,1,...,15  B = 2^(-1)*(1+Index/16+1/32)
 607 //
 608 data8 0x3FDE77EB7F175A34
 609 data4 0x238729EE, 0x00000000
 610 data8 0x3FE0039C73C1A40B
 611 data4 0x249334DB, 0x00000000
 612 data8 0x3FE0C6145B5B43DA
 613 data4 0x22CBA7D1, 0x00000000
 614 data8 0x3FE1835A88BE7C13
 615 data4 0x246310E7, 0x00000000
 616 data8 0x3FE23B71E2CC9E6A
 617 data4 0x236210E5, 0x00000000
 618 data8 0x3FE2EE628406CBCA
 619 data4 0x2462EAF5, 0x00000000
 620 data8 0x3FE39C391CD41719
 621 data4 0x24B73EF3, 0x00000000
 622 data8 0x3FE445065B795B55
 623 data4 0x24C11260, 0x00000000
 624 data8 0x3FE4E8DE5BB6EC04
 625 data4 0x242519EE, 0x00000000
 626 data8 0x3FE587D81F732FBA
 627 data4 0x24D4346C, 0x00000000
 628 data8 0x3FE6220D115D7B8D
 629 data4 0x24ED487B, 0x00000000
 630 data8 0x3FE6B798920B3D98
 631 data4 0x2495FF1E, 0x00000000
 632 data8 0x3FE748978FBA8E0F
 633 data4 0x223D9531, 0x00000000
 634 data8 0x3FE7D528289FA093
 635 data4 0x242B0411, 0x00000000
 636 data8 0x3FE85D69576CC2C5
 637 data4 0x2335B374, 0x00000000
 638 data8 0x3FE8E17AA99CC05D
 639 data4 0x24C27CFB, 0x00000000
 640 //
 641 //    Entries Tbl_hi  (double precision) Index = 0,1,...,15
 642 //    B = 2^(-2)*(1+Index/16+1/32)
 643 //    Entries Tbl_lo (single precision)
 644 //    Index = 0,1,...,15  B = 2^(-2)*(1+Index/16+1/32)
 645 //
 646 data8 0x3FD025FA510665B5
 647 data4 0x24263482, 0x00000000
 648 data8 0x3FD1151A362431C9
 649 data4 0x242C8DC9, 0x00000000
 650 data8 0x3FD2025567E47C95
 651 data4 0x245CF9BA, 0x00000000
 652 data8 0x3FD2ED987A823CFE
 653 data4 0x235C892C, 0x00000000
 654 data8 0x3FD3D6D129271134
 655 data4 0x2389BE52, 0x00000000
 656 data8 0x3FD4BDEE586890E6
 657 data4 0x24436471, 0x00000000
 658 data8 0x3FD5A2E0175E0F4E
 659 data4 0x2389DBD4, 0x00000000
 660 data8 0x3FD685979F5FA6FD
 661 data4 0x2476D43F, 0x00000000
 662 data8 0x3FD7660752817501
 663 data4 0x24711774, 0x00000000
 664 data8 0x3FD84422B8DF95D7
 665 data4 0x23EBB501, 0x00000000
 666 data8 0x3FD91FDE7CD0C662
 667 data4 0x23883A0C, 0x00000000
 668 data8 0x3FD9F93066168001
 669 data4 0x240DF63F, 0x00000000
 670 data8 0x3FDAD00F5422058B
 671 data4 0x23FE261A, 0x00000000
 672 data8 0x3FDBA473378624A5
 673 data4 0x23A8CD0E, 0x00000000
 674 data8 0x3FDC76550AAD71F8
 675 data4 0x2422D1D0, 0x00000000
 676 data8 0x3FDD45AEC9EC862B
 677 data4 0x2344A109, 0x00000000
 678 //
 679 //    Entries Tbl_hi  (double precision) Index = 0,1,...,15
 680 //    B = 2^(-3)*(1+Index/16+1/32)
 681 //    Entries Tbl_lo (single precision)
 682 //    Index = 0,1,...,15  B = 2^(-3)*(1+Index/16+1/32)
 683 //
 684 data8 0x3FC068D584212B3D
 685 data4 0x239874B6, 0x00000000
 686 data8 0x3FC1646541060850
 687 data4 0x2335E774, 0x00000000
 688 data8 0x3FC25F6E171A535C
 689 data4 0x233E36BE, 0x00000000
 690 data8 0x3FC359E8EDEB99A3
 691 data4 0x239680A3, 0x00000000
 692 data8 0x3FC453CEC6092A9E
 693 data4 0x230FB29E, 0x00000000
 694 data8 0x3FC54D18BA11570A
 695 data4 0x230C1418, 0x00000000
 696 data8 0x3FC645BFFFB3AA73
 697 data4 0x23F0564A, 0x00000000
 698 data8 0x3FC73DBDE8A7D201
 699 data4 0x23D4A5E1, 0x00000000
 700 data8 0x3FC8350BE398EBC7
 701 data4 0x23D4ADDA, 0x00000000
 702 data8 0x3FC92BA37D050271
 703 data4 0x23BCB085, 0x00000000
 704 data8 0x3FCA217E601081A5
 705 data4 0x23BC841D, 0x00000000
 706 data8 0x3FCB1696574D780B
 707 data4 0x23CF4A8E, 0x00000000
 708 data8 0x3FCC0AE54D768466
 709 data4 0x23BECC90, 0x00000000
 710 data8 0x3FCCFE654E1D5395
 711 data4 0x2323DCD2, 0x00000000
 712 data8 0x3FCDF110864C9D9D
 713 data4 0x23F53F3A, 0x00000000
 714 data8 0x3FCEE2E1451D980C
 715 data4 0x23CCB11F, 0x00000000
 716 //
 717 data8 0x400921FB54442D18, 0x3CA1A62633145C07 // PI two doubles
 718 data8 0x3FF921FB54442D18, 0x3C91A62633145C07 // PI_by_2 two dbles
 719 data8 0x3FE921FB54442D18, 0x3C81A62633145C07 // PI_by_4 two dbles
 720 data8 0x4002D97C7F3321D2, 0x3C9A79394C9E8A0A // 3PI_by_4 two dbles
 721 LOCAL_OBJECT_END(Constants_atan)
 722
 723
 724 .section .text
 725 GLOBAL_IEEE754_ENTRY(atanl)
 726
 727 // Use common code with atan2l after setting x=1.0
 728 { .mfi
 729       alloc r32 = ar.pfs, 0, 17, 4, 0
 730       fma.s1 Ysq = ArgY_orig, ArgY_orig, f0          // Form y*y
 731       nop.i 999
 732 }
 733 { .mfi
 734       addl table_ptr1 = @ltoff(Constants_atan#), gp  // Address of table pointer
 735       fma.s1 Xsq = f1, f1, f0                        // Form x*x
 736       nop.i 999
 737 }
 738 ;;
 739
 740 { .mfi
 741       ld8 table_ptr1 = [table_ptr1]                  // Get table pointer
 742       fnorm.s1 ArgY = ArgY_orig
 743       nop.i 999
 744 }
 745 { .mfi
 746       nop.m 999
 747       fnorm.s1 ArgX = f1
 748       nop.i 999
 749 }
 750 ;;
 751
 752 { .mfi
 753       getf.exp sign_X = f1               // Get signexp of x
 754       fmerge.s ArgX_abs = f0, f1         // Form |x|
 755       nop.i 999
 756 }
 757 { .mfi
 758       nop.m 999
 759       fnorm.s1 ArgX_orig = f1
 760       nop.i 999
 761 }
 762 ;;
 763
 764 { .mfi
 765       getf.exp sign_Y = ArgY_orig        // Get signexp of y
 766       fmerge.s ArgY_abs = f0, ArgY_orig  // Form |y|
 767       mov table_base = table_ptr1        // Save base pointer to tables
 768 }
 769 ;;
 770
 771 { .mfi
 772       ldfd P_hi = [table_ptr1],8         // Load double precision hi part of pi
 773       fclass.m p8,p0 = ArgY_orig, 0x1e7  // Test y natval, nan, inf, zero
 774       nop.i 999
 775 }
 776 ;;
 777
 778 { .mfi
 779       ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
 780       nop.f 999
 781       nop.i 999
 782 }
 783 { .mfi
 784       nop.m 999
 785       fma.s1 M = f1, f1, f0              // Set M = 1.0
 786       nop.i 999
 787 }
 788 ;;
 789
 790 //
 791 //     Check for everything - if false, then must be pseudo-zero
 792 //     or pseudo-nan (IA unsupporteds).
 793 //
 794 { .mfb
 795       nop.m 999
 796       fclass.m p0,p12 = f1, 0x1FF        // Test x unsupported
 797 (p8)  br.cond.spnt ATANL_Y_SPECIAL       // Branch if y natval, nan, inf, zero
 798 }
 799 ;;
 800
 801 //     U = max(ArgX_abs,ArgY_abs)
 802 //     V = min(ArgX_abs,ArgY_abs)
 803 { .mfi
 804       nop.m 999
 805       fcmp.ge.s1 p6,p7 = Xsq, Ysq        // Test for |x| >= |y| using squares
 806       nop.i 999
 807 }
 808 { .mfb
 809       nop.m 999
 810       fma.s1 V = ArgX_abs, f1, f0        // Set V assuming |x| < |y|
 811       br.cond.sptk ATANL_COMMON          // Branch to common code
 812 }
 813 ;;
 814
 815 GLOBAL_IEEE754_END(atanl)
 816
 817 GLOBAL_IEEE754_ENTRY(atan2l)
 818
 819 { .mfi
 820       alloc r32 = ar.pfs, 0, 17, 4, 0
 821       fma.s1 Ysq = ArgY_orig, ArgY_orig, f0          // Form y*y
 822       nop.i 999
 823 }
 824 { .mfi
 825       addl table_ptr1 = @ltoff(Constants_atan#), gp  // Address of table pointer
 826       fma.s1 Xsq = ArgX_orig, ArgX_orig, f0          // Form x*x
 827       nop.i 999
 828 }
 829 ;;
 830
 831 { .mfi
 832       ld8 table_ptr1 = [table_ptr1]                  // Get table pointer
 833       fnorm.s1 ArgY = ArgY_orig
 834       nop.i 999
 835 }
 836 { .mfi
 837       nop.m 999
 838       fnorm.s1 ArgX = ArgX_orig
 839       nop.i 999
 840 }
 841 ;;
 842
 843 { .mfi
 844       getf.exp sign_X = ArgX_orig        // Get signexp of x
 845       fmerge.s ArgX_abs = f0, ArgX_orig  // Form |x|
 846       nop.i 999
 847 }
 848 ;;
 849
 850 { .mfi
 851       getf.exp sign_Y = ArgY_orig        // Get signexp of y
 852       fmerge.s ArgY_abs = f0, ArgY_orig  // Form |y|
 853       mov table_base = table_ptr1        // Save base pointer to tables
 854 }
 855 ;;
 856
 857 { .mfi
 858       ldfd P_hi = [table_ptr1],8         // Load double precision hi part of pi
 859       fclass.m p8,p0 = ArgY_orig, 0x1e7  // Test y natval, nan, inf, zero
 860       nop.i 999
 861 }
 862 ;;
 863
 864 { .mfi
 865       ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
 866       fclass.m p9,p0 = ArgX_orig, 0x1e7  // Test x natval, nan, inf, zero
 867       nop.i 999
 868 }
 869 { .mfi
 870       nop.m 999
 871       fma.s1 M = f1, f1, f0              // Set M = 1.0
 872       nop.i 999
 873 }
 874 ;;
 875
 876 //
 877 //     Check for everything - if false, then must be pseudo-zero
 878 //     or pseudo-nan (IA unsupporteds).
 879 //
 880 { .mfb
 881       nop.m 999
 882       fclass.m p0,p12 = ArgX_orig, 0x1FF // Test x unsupported
 883 (p8)  br.cond.spnt ATANL_Y_SPECIAL       // Branch if y natval, nan, inf, zero
 884 }
 885 ;;
 886
 887 //     U = max(ArgX_abs,ArgY_abs)
 888 //     V = min(ArgX_abs,ArgY_abs)
 889 { .mfi
 890       nop.m 999
 891       fcmp.ge.s1 p6,p7 = Xsq, Ysq        // Test for |x| >= |y| using squares
 892       nop.i 999
 893 }
 894 { .mfb
 895       nop.m 999
 896       fma.s1 V = ArgX_abs, f1, f0        // Set V assuming |x| < |y|
 897 (p9)  br.cond.spnt ATANL_X_SPECIAL       // Branch if x natval, nan, inf, zero
 898 }
 899 ;;
 900
 901 // Now common code for atanl and atan2l
 902 ATANL_COMMON:
 903 { .mfi
 904       nop.m 999
 905       fclass.m p0,p13 = ArgY_orig, 0x1FF // Test y unsupported
 906       shr sign_X = sign_X, 17            // Get sign bit of x
 907 }
 908 { .mfi
 909       nop.m 999
 910       fma.s1 U = ArgY_abs, f1, f0        // Set U assuming |x| < |y|
 911       adds table_ptr1 = 176, table_ptr1  // Point to Q4
 912 }
 913 ;;
 914
 915 { .mfi
 916 (p6)  add swap = r0, r0                  // Set swap=0 if |x| >= |y|
 917 (p6)  frcpa.s1 E, p0 = ArgY_abs, ArgX_abs // Compute E if |x| >= |y|
 918       shr sign_Y = sign_Y, 17            // Get sign bit of y
 919 }
 920 { .mfb
 921       nop.m 999
 922 (p6)  fma.s1 V = ArgY_abs, f1, f0        // Set V if |x| >= |y|
 923 (p12) br.cond.spnt ATANL_UNSUPPORTED     // Branch if x unsupported
 924 }
 925 ;;
 926
 927 // Set p8 if y >=0
 928 // Set p9 if y < 0
 929 // Set p10 if |x| >= |y| and x >=0
 930 // Set p11 if |x| >= |y| and x < 0
 931 { .mfi
 932       cmp.eq p8, p9 = 0, sign_Y          // Test for y >= 0
 933 (p7)  frcpa.s1 E, p0 = ArgX_abs, ArgY_abs // Compute E if |x| < |y|
 934 (p7)  add swap = 1, r0                   // Set swap=1 if |x| < |y|
 935 }
 936 { .mfb
 937 (p6)  cmp.eq.unc p10, p11 = 0, sign_X    // If |x| >= |y|, test for x >= 0
 938 (p6)  fma.s1 U = ArgX_abs, f1, f0        // Set U if |x| >= |y|
 939 (p13) br.cond.spnt ATANL_UNSUPPORTED     // Branch if y unsupported
 940 }
 941 ;;
 942
 943 //
 944 //     if p8, s_Y = 1.0
 945 //     if p9, s_Y = -1.0
 946 //
 947 .pred.rel "mutex",p8,p9
 948 { .mfi
 949       nop.m 999
 950 (p8)  fadd.s1 s_Y = f0, f1               // If y >= 0 set s_Y = 1.0
 951       nop.i 999
 952 }
 953 { .mfi
 954       nop.m 999
 955 (p9)  fsub.s1 s_Y = f0, f1               // If y < 0 set s_Y = -1.0
 956       nop.i 999
 957 }
 958 ;;
 959
 960 .pred.rel "mutex",p10,p11
 961 { .mfi
 962       nop.m 999
 963 (p10) fsub.s1 M = M, f1                  // If |x| >= |y| and x >=0, set M=0
 964       nop.i 999
 965 }
 966 { .mfi
 967       nop.m 999
 968 (p11) fadd.s1 M = M, f1                  // If |x| >= |y| and x < 0, set M=2.0
 969       nop.i 999
 970 }
 971 ;;
 972
 973 { .mfi
 974       nop.m 999
 975       fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag
 976       nop.i 999
 977 }
 978 // *************************************************
 979 // ********************* STEP2 *********************
 980 // *************************************************
 981 //
 982 //     Q = E * V
 983 //
 984 { .mfi
 985       nop.m 999
 986       fmpy.s1 Q = E, V
 987       nop.i 999
 988 }
 989 ;;
 990
 991 { .mfi
 992       nop.m 999
 993       fnma.s1 E_hold = E, U, f1           // E_hold = 1.0 - E*U (1) if POLY path
 994       nop.i 999
 995 }
 996 ;;
 997
 998 // Create a single precision representation of the signexp of Q with the
 999 // 4 most significant bits of the significand followed by a 1 and then 18 0's
1000 { .mfi
1001       nop.m 999
1002       fmpy.s1 P_hi = M, P_hi
1003       dep.z special = 0x1, 18, 1           // Form 0x0000000000040000
1004 }
1005 { .mfi
1006       nop.m 999
1007       fmpy.s1 P_lo = M, P_lo
1008       add table_ptr2 = 32, table_ptr1
1009 }
1010 ;;
1011
1012 { .mfi
1013       nop.m 999
1014       fma.s1 A_temp = Q, f1, f0            // Set A_temp if POLY path
1015       nop.i 999
1016 }
1017 { .mfi
1018       nop.m 999
1019       fma.s1 E = E, E_hold, E              // E = E + E*E_hold (1) if POLY path
1020       nop.i 999
1021 }
1022 ;;
1023
1024 //
1025 //     Is Q < 2**(-3)?
1026 //     swap = xor(swap,sign_X)
1027 //
1028 { .mfi
1029       nop.m 999
1030       fcmp.lt.s1 p9, p0 = Q, TWO_TO_NEG3    // Test Q < 2^-3
1031       xor swap = sign_X, swap
1032 }
1033 ;;
1034
1035 //     P_hi = s_Y * P_hi
1036 { .mmf
1037       getf.exp exponent_Q =  Q              // Get signexp of Q
1038       cmp.eq.unc p7, p6 = 0x00000, swap
1039       fmpy.s1 P_hi = s_Y, P_hi
1040 }
1041 ;;
1042
1043 //
1044 //     if (PR_1) sigma = -1.0
1045 //     if (PR_2) sigma =  1.0
1046 //
1047 { .mfi
1048       getf.sig significand_Q = Q            // Get significand of Q
1049 (p6)  fsub.s1 sigma = f0, f1
1050       nop.i 999
1051 }
1052 { .mfb
1053 (p9)  add table_ptr1 = 128, table_base      // Point to P8 if POLY path
1054 (p7)  fadd.s1 sigma = f0, f1
1055 (p9)  br.cond.spnt ATANL_POLY               // Branch to POLY if 0 < Q < 2^-3
1056 }
1057 ;;
1058
1059 //
1060 // *************************************************
1061 // ******************** STEP3 **********************
1062 // *************************************************
1063 //
1064 //     lookup = b_1 b_2 b_3 B_4
1065 //
1066 { .mmi
1067       nop.m 999
1068       nop.m 999
1069       andcm k = 0x0003, exponent_Q  // k=0,1,2,3 for exp_Q=0,-1,-2,-3
1070 }
1071 ;;
1072
1073 //
1074 //  Generate sign_exp_Q b_1 b_2 b_3 b_4 1 0 0 0 ... 0  in single precision
1075 //  representation.  Note sign of Q is always 0.
1076 //
1077 { .mfi
1078       cmp.eq p8, p9 = 0x0000, k             // Test k=0
1079       nop.f 999
1080       extr.u lookup = significand_Q, 59, 4  // Extract b_1 b_2 b_3 b_4 for index
1081 }
1082 { .mfi
1083       sub sp_exp_Q = 0x7f, k                // Form single prec biased exp of Q
1084       nop.f 999
1085       sub k = k, r0, 1                      // Decrement k
1086 }
1087 ;;
1088
1089 //     Form pointer to B index table
1090 { .mfi
1091       ldfe Q_4 = [table_ptr1], -16          // Load Q_4
1092       nop.f 999
1093 (p9)  shl k = k, 8                          // k = 0, 256, or 512
1094 }
1095 { .mfi
1096 (p9)  shladd table_ptr2 = lookup, 4, table_ptr2
1097       nop.f 999
1098       shladd sp_exp_4sig_Q = sp_exp_Q, 4, lookup // Shift and add in 4 high bits
1099 }
1100 ;;
1101
1102 { .mmi
1103 (p8)  add table_ptr2 = -16, table_ptr2      // Pointer if original k was 0
1104 (p9)  add table_ptr2 = k, table_ptr2        // Pointer if k was 1, 2, 3
1105       dep special = sp_exp_4sig_Q, special, 19, 13 // Form z_hi as single prec
1106 }
1107 ;;
1108
1109 //     z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
1110 { .mmi
1111       ldfd Tbl_hi = [table_ptr2], 8         // Load Tbl_hi from index table
1112 ;;
1113       setf.s z_hi = special                 // Form z_hi
1114       nop.i 999
1115 }
1116 { .mmi
1117       ldfs Tbl_lo = [table_ptr2], 8         // Load Tbl_lo from index table
1118 ;;
1119       ldfe Q_3 = [table_ptr1], -16          // Load Q_3
1120       nop.i 999
1121 }
1122 ;;
1123
1124 { .mmi
1125       ldfe Q_2 = [table_ptr1], -16          // Load Q_2
1126       nop.m 999
1127       nop.i 999
1128 }
1129 ;;
1130
1131 { .mmf
1132       ldfe Q_1 = [table_ptr1], -16          // Load Q_1
1133       nop.m 999
1134       nop.f 999
1135 }
1136 ;;
1137
1138 { .mfi
1139       nop.m 999
1140       fma.s1 U_prime_hi = V, z_hi, U        // U_prime_hi = U + V * z_hi
1141       nop.i 999
1142 }
1143 { .mfi
1144       nop.m 999
1145       fnma.s1 V_prime = U, z_hi, V          // V_prime =  V - U * z_hi
1146       nop.i 999
1147 }
1148 ;;
1149
1150 { .mfi
1151       nop.m 999
1152       mov A_hi = Tbl_hi                     // Start with A_hi = Tbl_hi
1153       nop.i 999
1154 }
1155 ;;
1156
1157 { .mfi
1158       nop.m 999
1159       fsub.s1 U_hold = U, U_prime_hi        // U_hold = U - U_prime_hi
1160       nop.i 999
1161 }
1162 ;;
1163
1164 { .mfi
1165       nop.m 999
1166       frcpa.s1 C_hi, p0 = f1, U_prime_hi    // C_hi = frcpa(1,U_prime_hi)
1167       nop.i 999
1168 }
1169 ;;
1170
1171 { .mfi
1172       nop.m 999
1173       fmpy.s1 A_hi = s_Y, A_hi              // A_hi = s_Y * A_hi
1174       nop.i 999
1175 }
1176 ;;
1177
1178 { .mfi
1179       nop.m 999
1180       fma.s1 U_prime_lo = z_hi, V, U_hold   // U_prime_lo =  U_hold + V * z_hi
1181       nop.i 999
1182 }
1183 ;;
1184
1185 //     C_hi_hold = 1 - C_hi * U_prime_hi (1)
1186 { .mfi
1187       nop.m 999
1188       fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1189       nop.i 999
1190 }
1191 ;;
1192
1193 { .mfi
1194       nop.m 999
1195       fma.s1 Res_hi = sigma, A_hi, P_hi   // Res_hi = P_hi + sigma * A_hi
1196       nop.i 999
1197 }
1198 ;;
1199
1200 { .mfi
1201       nop.m 999
1202       fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (1)
1203       nop.i 999
1204 }
1205 ;;
1206
1207 //     C_hi_hold = 1 - C_hi * U_prime_hi (2)
1208 { .mfi
1209       nop.m 999
1210       fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1211       nop.i 999
1212 }
1213 ;;
1214
1215 { .mfi
1216       nop.m 999
1217       fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (2)
1218       nop.i 999
1219 }
1220 ;;
1221
1222 //     C_hi_hold = 1 - C_hi * U_prime_hi (3)
1223 { .mfi
1224       nop.m 999
1225       fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1226       nop.i 999
1227 }
1228 ;;
1229
1230 { .mfi
1231       nop.m 999
1232       fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (3)
1233       nop.i 999
1234 }
1235 ;;
1236
1237 { .mfi
1238       nop.m 999
1239       fmpy.s1 w_hi = V_prime, C_hi           // w_hi = V_prime * C_hi
1240       nop.i 999
1241 }
1242 ;;
1243
1244 { .mfi
1245       nop.m 999
1246       fmpy.s1 wsq = w_hi, w_hi               // wsq = w_hi * w_hi
1247       nop.i 999
1248 }
1249 { .mfi
1250       nop.m 999
1251       fnma.s1 w_lo = w_hi, U_prime_hi, V_prime // w_lo = V_prime-w_hi*U_prime_hi
1252       nop.i 999
1253 }
1254 ;;
1255
1256 { .mfi
1257       nop.m 999
1258       fma.s1 poly =  wsq, Q_4, Q_3           // poly = Q_3 + wsq * Q_4
1259       nop.i 999
1260 }
1261 { .mfi
1262       nop.m 999
1263       fnma.s1 w_lo = w_hi, U_prime_lo, w_lo  // w_lo = w_lo - w_hi * U_prime_lo
1264       nop.i 999
1265 }
1266 ;;
1267
1268 { .mfi
1269       nop.m 999
1270       fma.s1 poly = wsq, poly, Q_2           // poly = Q_2 + wsq * poly
1271       nop.i 999
1272 }
1273 { .mfi
1274       nop.m 999
1275       fmpy.s1 w_lo = C_hi, w_lo              // w_lo =  = w_lo * C_hi
1276       nop.i 999
1277 }
1278 ;;
1279
1280 { .mfi
1281       nop.m 999
1282       fma.s1 poly = wsq, poly, Q_1           // poly = Q_1 + wsq * poly
1283       nop.i 999
1284 }
1285 { .mfi
1286       nop.m 999
1287       fadd.s1 A_lo = Tbl_lo, w_lo            // A_lo = Tbl_lo + w_lo
1288       nop.i 999
1289 }
1290 ;;
1291
1292 { .mfi
1293       nop.m 999
1294       fmpy.s0 Q_1 =  Q_1, Q_1                // Dummy operation to raise inexact
1295       nop.i 999
1296 }
1297 ;;
1298
1299 { .mfi
1300       nop.m 999
1301       fmpy.s1 poly = wsq, poly               // poly = wsq * poly
1302       nop.i 999
1303 }
1304 ;;
1305
1306 { .mfi
1307       nop.m 999
1308       fmpy.s1 poly = w_hi, poly              // poly = w_hi * poly
1309       nop.i 999
1310 }
1311 ;;
1312
1313 { .mfi
1314       nop.m 999
1315       fadd.s1 A_lo = A_lo, poly              // A_lo = A_lo + poly
1316       nop.i 999
1317 }
1318 ;;
1319
1320 { .mfi
1321       nop.m 999
1322       fadd.s1 A_lo = A_lo, w_hi              // A_lo = A_lo + w_hi
1323       nop.i 999
1324 }
1325 ;;
1326
1327 { .mfi
1328       nop.m 999
1329       fma.s1 Res_lo = sigma, A_lo, P_lo      // Res_lo = P_lo + sigma * A_lo
1330       nop.i 999
1331 }
1332 ;;
1333
1334 //
1335 //     Result  =  Res_hi + Res_lo * s_Y  (User Supplied Rounding Mode)
1336 //
1337 { .mfb
1338       nop.m 999
1339       fma.s0 Result = Res_lo, s_Y, Res_hi
1340       br.ret.sptk   b0                        // Exit table path 2^-3 <= V/U < 1
1341 }
1342 ;;
1343
1344
1345 ATANL_POLY:
1346 // Here if 0 < V/U < 2^-3
1347 //
1348 // ***********************************************
1349 // ******************** STEP4 ********************
1350 // ***********************************************
1351
1352 //
1353 //     Following:
1354 //     Iterate 3 times E = E + E*(1.0 - E*U)
1355 //     Also load P_8, P_7, P_6, P_5, P_4
1356 //
1357 { .mfi
1358       ldfe P_8 = [table_ptr1], -16            // Load P_8
1359       fnma.s1 z_lo = A_temp, U, V             // z_lo = V - A_temp * U
1360       nop.i 999
1361 }
1362 { .mfi
1363       nop.m 999
1364       fnma.s1 E_hold = E, U, f1               // E_hold = 1.0 - E*U (2)
1365       nop.i 999
1366 }
1367 ;;
1368
1369 { .mmi
1370       ldfe P_7 = [table_ptr1], -16            // Load P_7
1371 ;;
1372       ldfe P_6 = [table_ptr1], -16            // Load P_6
1373       nop.i 999
1374 }
1375 ;;
1376
1377 { .mfi
1378       ldfe P_5 = [table_ptr1], -16            // Load P_5
1379       fma.s1 E = E, E_hold, E                 // E = E + E_hold*E (2)
1380       nop.i 999
1381 }
1382 ;;
1383
1384 { .mmi
1385       ldfe P_4 = [table_ptr1], -16            // Load P_4
1386 ;;
1387       ldfe P_3 = [table_ptr1], -16            // Load P_3
1388       nop.i 999
1389 }
1390 ;;
1391
1392 { .mfi
1393       ldfe P_2 = [table_ptr1], -16            // Load P_2
1394       fnma.s1 E_hold = E, U, f1               // E_hold = 1.0 - E*U (3)
1395       nop.i 999
1396 }
1397 { .mlx
1398       nop.m 999
1399       movl         int_temp = 0x24005         // Signexp for small neg number
1400 }
1401 ;;
1402
1403 { .mmf
1404       ldfe P_1 = [table_ptr1], -16            // Load P_1
1405       setf.exp     tmp_small = int_temp       // Form small neg number
1406       fma.s1 E = E, E_hold, E                 // E = E + E_hold*E (3)
1407 }
1408 ;;
1409
1410 //
1411 //
1412 // At this point E approximates 1/U to roughly working precision
1413 // Z = V*E approximates V/U
1414 //
1415 { .mfi
1416       nop.m 999
1417       fmpy.s1 Z = V, E                         // Z = V * E
1418       nop.i 999
1419 }
1420 { .mfi
1421       nop.m 999
1422       fmpy.s1 z_lo = z_lo, E                   // z_lo = z_lo * E
1423       nop.i 999
1424 }
1425 ;;
1426
1427 //
1428 //     Now what we want to do is
1429 //     poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
1430 //     poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3))
1431 //
1432 //
1433 //     Fixup added to force inexact later -
1434 //     A_hi = A_temp + z_lo
1435 //     z_lo = (A_temp - A_hi) + z_lo
1436 //
1437 { .mfi
1438       nop.m 999
1439       fmpy.s1 zsq = Z, Z                        // zsq = Z * Z
1440       nop.i 999
1441 }
1442 { .mfi
1443       nop.m 999
1444       fadd.s1 A_hi = A_temp, z_lo               // A_hi = A_temp + z_lo
1445       nop.i 999
1446 }
1447 ;;
1448
1449 { .mfi
1450       nop.m 999
1451       fma.s1 poly1 = zsq, P_8, P_7              // poly1 = P_7 + zsq * P_8
1452       nop.i 999
1453 }
1454 { .mfi
1455       nop.m 999
1456       fma.s1 poly2 = zsq, P_3, P_2              // poly2 = P_2 + zsq * P_3
1457       nop.i 999
1458 }
1459 ;;
1460
1461 { .mfi
1462       nop.m 999
1463       fmpy.s1 z4 = zsq, zsq                     // z4 = zsq * zsq
1464       nop.i 999
1465 }
1466 { .mfi
1467       nop.m 999
1468       fsub.s1 A_temp = A_temp, A_hi             // A_temp = A_temp - A_hi
1469       nop.i 999
1470 }
1471 ;;
1472
1473 { .mfi
1474       nop.m 999
1475       fmerge.s     tmp = A_hi, A_hi             // Copy tmp = A_hi
1476       nop.i 999
1477 }
1478 ;;
1479
1480 { .mfi
1481       nop.m 999
1482       fma.s1 poly1 = zsq, poly1, P_6            // poly1 = P_6 + zsq * poly1
1483       nop.i 999
1484 }
1485 { .mfi
1486       nop.m 999
1487       fma.s1 poly2 = zsq, poly2, P_1            // poly2 = P_2 + zsq * poly2
1488       nop.i 999
1489 }
1490 ;;
1491
1492 { .mfi
1493       nop.m 999
1494       fmpy.s1 z8 = z4, z4                       // z8 = z4 * z4
1495       nop.i 999
1496 }
1497 { .mfi
1498       nop.m 999
1499       fadd.s1 z_lo = A_temp, z_lo               // z_lo = (A_temp - A_hi) + z_lo
1500       nop.i 999
1501 }
1502 ;;
1503
1504 { .mfi
1505       nop.m 999
1506       fma.s1 poly1 = zsq, poly1, P_5            // poly1 = P_5 + zsq * poly1
1507       nop.i 999
1508 }
1509 { .mfi
1510       nop.m 999
1511       fmpy.s1 poly2 = poly2, zsq                // poly2 = zsq * poly2
1512       nop.i 999
1513 }
1514 ;;
1515
1516 //     Create small GR double in case need to raise underflow
1517 { .mfi
1518       nop.m 999
1519       fma.s1 poly1 = zsq, poly1, P_4            // poly1 = P_4 + zsq * poly1
1520       dep GR_temp = -1,r0,0,53
1521 }
1522 ;;
1523
1524 //     Create small double in case need to raise underflow
1525 { .mfi
1526       setf.d FR_temp = GR_temp
1527       fma.s1 poly = z8, poly1, poly2            // poly = poly2 + z8 * poly1
1528       nop.i 999
1529 }
1530 ;;
1531
1532 { .mfi
1533       nop.m 999
1534       fma.s1 A_lo = Z, poly, z_lo               // A_lo = z_lo + Z * poly
1535       nop.i 999
1536 }
1537 ;;
1538
1539 { .mfi
1540       nop.m 999
1541       fadd.s1      A_hi = tmp, A_lo             // A_hi = tmp + A_lo
1542       nop.i 999
1543 }
1544 ;;
1545
1546 { .mfi
1547       nop.m 999
1548       fsub.s1      tmp = tmp, A_hi              // tmp = tmp - A_hi
1549       nop.i 999
1550 }
1551 { .mfi
1552       nop.m 999
1553       fmpy.s1 A_hi = s_Y, A_hi                  // A_hi = s_Y * A_hi
1554       nop.i 999
1555 }
1556 ;;
1557
1558 { .mfi
1559       nop.m 999
1560       fadd.s1      A_lo = tmp, A_lo             // A_lo = tmp + A_lo
1561       nop.i 999
1562 }
1563 { .mfi
1564       nop.m 999
1565       fma.s1 Res_hi = sigma, A_hi, P_hi         // Res_hi = P_hi + sigma * A_hi
1566       nop.i 999
1567 }
1568 ;;
1569
1570 { .mfi
1571       nop.m 999
1572       fsub.s1 tmp =  P_hi, Res_hi               // tmp = P_hi - Res_hi
1573       nop.i 999
1574 }
1575 ;;
1576
1577 //
1578 //     Test if A_lo is zero
1579 //
1580 { .mfi
1581       nop.m 999
1582       fclass.m p6,p0 = A_lo, 0x007              // Test A_lo = 0
1583       nop.i 999
1584 }
1585 ;;
1586
1587 { .mfi
1588       nop.m 999
1589 (p6)  mov          A_lo = tmp_small             // If A_lo zero, make very small
1590       nop.i 999
1591 }
1592 ;;
1593
1594 { .mfi
1595       nop.m 999
1596       fma.s1 tmp = A_hi, sigma, tmp             // tmp = sigma * A_hi  + tmp
1597       nop.i 999
1598 }
1599 { .mfi
1600       nop.m 999
1601       fma.s1 sigma =  A_lo, sigma, P_lo         // sigma = A_lo * sigma  + P_lo
1602       nop.i 999
1603 }
1604 ;;
1605
1606 { .mfi
1607       nop.m 999
1608       fma.s1 Res_lo = s_Y, sigma, tmp           // Res_lo = s_Y * sigma + tmp
1609       nop.i 999
1610 }
1611 ;;
1612
1613 //
1614 //     Test if Res_lo is denormal
1615 //
1616 { .mfi
1617       nop.m 999
1618       fclass.m p14, p15 = Res_lo, 0x0b
1619       nop.i 999
1620 }
1621 ;;
1622
1623 //
1624 //     Compute Result = Res_lo + Res_hi.  Use s3 if Res_lo is denormal.
1625 //
1626 { .mfi
1627       nop.m 999
1628 (p14) fadd.s3 Result = Res_lo, Res_hi     // Result for Res_lo denormal
1629       nop.i 999
1630 }
1631 { .mfi
1632       nop.m 999
1633 (p15) fadd.s0 Result = Res_lo, Res_hi     // Result for Res_lo normal
1634       nop.i 999
1635 }
1636 ;;
1637
1638 //
1639 //     If Res_lo is denormal test if Result equals zero
1640 //
1641 { .mfi
1642       nop.m 999
1643 (p14) fclass.m.unc p14, p0 = Result, 0x07
1644       nop.i 999
1645 }
1646 ;;
1647
1648 //
1649 //     If Res_lo is denormal and Result equals zero, raise inexact, underflow
1650 //     by squaring small double
1651 //
1652 { .mfb
1653       nop.m 999
1654 (p14) fmpy.d.s0 FR_temp = FR_temp, FR_temp
1655       br.ret.sptk   b0                     // Exit POLY path, 0 < Q < 2^-3
1656 }
1657 ;;
1658
1659
1660 ATANL_UNSUPPORTED:
1661 { .mfb
1662       nop.m 999
1663       fmpy.s0 Result = ArgX,ArgY
1664       br.ret.sptk   b0
1665 }
1666 ;;
1667
1668 // Here if y natval, nan, inf, zero
1669 ATANL_Y_SPECIAL:
1670 // Here if x natval, nan, inf, zero
1671 ATANL_X_SPECIAL:
1672 { .mfi
1673       nop.m 999
1674       fclass.m p13,p12 = ArgY_orig, 0x0c3  // Test y nan
1675       nop.i 999
1676 }
1677 ;;
1678
1679 { .mfi
1680       nop.m 999
1681       fclass.m p15,p14 = ArgY_orig, 0x103  // Test y natval
1682       nop.i 999
1683 }
1684 ;;
1685
1686 { .mfi
1687       nop.m 999
1688 (p12) fclass.m p13,p0 = ArgX_orig, 0x0c3  // Test x nan
1689       nop.i 999
1690 }
1691 ;;
1692
1693 { .mfi
1694       nop.m 999
1695 (p14) fclass.m p15,p0 = ArgX_orig, 0x103  // Test x natval
1696       nop.i 999
1697 }
1698 ;;
1699
1700 { .mfb
1701       nop.m 999
1702 (p13) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result nan if x or y nan
1703 (p13) br.ret.spnt b0                      // Exit if x or y nan
1704 }
1705 ;;
1706
1707 { .mfb
1708       nop.m 999
1709 (p15) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result natval if x or y natval
1710 (p15) br.ret.spnt b0                      // Exit if x or y natval
1711 }
1712 ;;
1713
1714
1715 // Here if x or y inf or zero
1716 ATANL_SPECIAL_HANDLING:
1717 { .mfi
1718       nop.m 999
1719       fclass.m p6, p7 = ArgY_orig, 0x007        // Test y zero
1720       mov special = 992                         // Offset to table
1721 }
1722 ;;
1723
1724 { .mfb
1725       add table_ptr1 = table_base, special      // Point to 3pi/4
1726       fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig  // Dummy to set denormal flag
1727 (p7)  br.cond.spnt ATANL_ArgY_Not_ZERO          // Branch if y not zero
1728 }
1729 ;;
1730
1731 // Here if y zero
1732 { .mmf
1733       ldfd  Result = [table_ptr1], 8            // Get pi high
1734       nop.m 999
1735       fclass.m p14, p0 = ArgX, 0x035            // Test for x>=+0
1736 }
1737 ;;
1738
1739 { .mmf
1740       nop.m 999
1741       ldfd  Result_lo = [table_ptr1], -8        // Get pi lo
1742       fclass.m p15, p0 = ArgX, 0x036            // Test for x<=-0
1743 }
1744 ;;
1745
1746 //
1747 //     Return sign_Y * 0 when  ArgX > +0
1748 //
1749 { .mfi
1750       nop.m 999
1751 (p14) fmerge.s Result = ArgY, f0               // If x>=+0, y=0, hi sgn(y)*0
1752       nop.i 999
1753 }
1754 ;;
1755
1756 { .mfi
1757       nop.m 999
1758       fclass.m p13, p0 = ArgX, 0x007           // Test for x=0
1759       nop.i 999
1760 }
1761 ;;
1762
1763 { .mfi
1764       nop.m 999
1765 (p14) fmerge.s Result_lo = ArgY, f0            // If x>=+0, y=0, lo sgn(y)*0
1766       nop.i 999
1767 }
1768 ;;
1769
1770 { .mfi
1771 (p13) mov GR_Parameter_TAG = 36                // Error tag for x=0, y=0
1772       nop.f 999
1773       nop.i 999
1774 }
1775 ;;
1776
1777 //
1778 //     Return sign_Y * pi when  ArgX < -0
1779 //
1780 { .mfi
1781       nop.m 999
1782 (p15) fmerge.s Result = ArgY, Result           // If x<0, y=0, hi=sgn(y)*pi
1783       nop.i 999
1784 }
1785 ;;
1786
1787 { .mfi
1788       nop.m 999
1789 (p15) fmerge.s Result_lo = ArgY, Result_lo     // If x<0, y=0, lo=sgn(y)*pi
1790       nop.i 999
1791 }
1792 ;;
1793
1794 //
1795 //     Call error support function for atan(0,0)
1796 //
1797 { .mfb
1798       nop.m 999
1799       fadd.s0 Result = Result, Result_lo
1800 (p13) br.cond.spnt __libm_error_region         // Branch if atan(0,0)
1801 }
1802 ;;
1803
1804 { .mib
1805       nop.m 999
1806       nop.i 999
1807       br.ret.sptk   b0                         // Exit for y=0, x not 0
1808 }
1809 ;;
1810
1811 // Here if y not zero
1812 ATANL_ArgY_Not_ZERO:
1813 { .mfi
1814       nop.m 999
1815       fclass.m p0, p10 = ArgY, 0x023           // Test y inf
1816       nop.i 999
1817 }
1818 ;;
1819
1820 { .mfb
1821       nop.m 999
1822       fclass.m p6, p0 = ArgX, 0x017            // Test for 0 <= |x| < inf
1823 (p10) br.cond.spnt  ATANL_ArgY_Not_INF         // Branch if 0 < |y| < inf
1824 }
1825 ;;
1826
1827 // Here if y=inf
1828 //
1829 //     Return +PI/2 when ArgY = +Inf and ArgX = +/-0 or normal
1830 //     Return -PI/2 when ArgY = -Inf and ArgX = +/-0 or normal
1831 //     Return +PI/4 when ArgY = +Inf and ArgX = +Inf
1832 //     Return -PI/4 when ArgY = -Inf and ArgX = +Inf
1833 //     Return +3PI/4 when ArgY = +Inf and ArgX = -Inf
1834 //     Return -3PI/4 when ArgY = -Inf and ArgX = -Inf
1835 //
1836 { .mfi
1837       nop.m 999
1838       fclass.m p7, p0 = ArgX, 0x021            // Test for x=+inf
1839       nop.i 999
1840 }
1841 ;;
1842
1843 { .mfi
1844 (p6)  add table_ptr1 =  16, table_ptr1         // Point to pi/2, if x finite
1845       fclass.m p8, p0 = ArgX, 0x022            // Test for x=-inf
1846       nop.i 999
1847 }
1848 ;;
1849
1850 { .mmi
1851 (p7)  add table_ptr1 =  32, table_ptr1         // Point to pi/4 if x=+inf
1852 ;;
1853 (p8)  add table_ptr1 =  48, table_ptr1         // Point to 3pi/4 if x=-inf
1854
1855       nop.i 999
1856 }
1857 ;;
1858
1859 { .mmi
1860       ldfd Result = [table_ptr1], 8            // Load pi/2, pi/4, or 3pi/4 hi
1861 ;;
1862       ldfd Result_lo = [table_ptr1], -8        // Load pi/2, pi/4, or 3pi/4 lo
1863       nop.i 999
1864 }
1865 ;;
1866
1867 { .mfi
1868       nop.m 999
1869       fmerge.s Result = ArgY, Result           // Merge sgn(y) in hi
1870       nop.i 999
1871 }
1872 ;;
1873
1874 { .mfi
1875       nop.m 999
1876       fmerge.s Result_lo = ArgY, Result_lo     // Merge sgn(y) in lo
1877       nop.i 999
1878 }
1879 ;;
1880
1881 { .mfb
1882       nop.m 999
1883       fadd.s0 Result = Result, Result_lo       // Compute complete result
1884       br.ret.sptk   b0                         // Exit for y=inf
1885 }
1886 ;;
1887
1888 // Here if y not INF, and x=0 or INF
1889 ATANL_ArgY_Not_INF:
1890 //
1891 //     Return +PI/2 when ArgY NOT Inf, ArgY > 0 and ArgX = +/-0
1892 //     Return -PI/2 when ArgY NOT Inf, ArgY < 0 and ArgX = +/-0
1893 //     Return +0    when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf
1894 //     Return -0    when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf
1895 //     Return +PI   when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf
1896 //     Return -PI   when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf
1897 //
1898 { .mfi
1899       nop.m 999
1900       fclass.m p7, p9 = ArgX, 0x021            // Test for x=+inf
1901       nop.i 999
1902 }
1903 ;;
1904
1905 { .mfi
1906       nop.m 999
1907       fclass.m p6, p0 = ArgX, 0x007            // Test for x=0
1908       nop.i 999
1909 }
1910 ;;
1911
1912 { .mfi
1913 (p6)  add table_ptr1 = 16, table_ptr1          // Point to pi/2
1914       fclass.m p8, p0 = ArgX, 0x022            // Test for x=-inf
1915       nop.i 999
1916 }
1917 ;;
1918
1919 .pred.rel "mutex",p7,p9
1920 { .mfi
1921 (p9)  ldfd Result = [table_ptr1], 8           // Load pi or pi/2 hi
1922 (p7)  fmerge.s Result = ArgY, f0              // If y not inf, x=+inf, sgn(y)*0
1923       nop.i 999
1924 }
1925 ;;
1926
1927 { .mfi
1928 (p9)  ldfd Result_lo = [table_ptr1], -8       // Load pi or pi/2 lo
1929 (p7)  fnorm.s0 Result = Result                // If y not inf, x=+inf normalize
1930       nop.i 999
1931 }
1932 ;;
1933
1934 { .mfi
1935       nop.m 999
1936 (p9)  fmerge.s Result = ArgY, Result          // Merge sgn(y) in hi
1937       nop.i 999
1938 }
1939 ;;
1940
1941 { .mfi
1942       nop.m 999
1943 (p9)  fmerge.s Result_lo = ArgY, Result_lo    // Merge sgn(y) in lo
1944       nop.i 999
1945 }
1946 ;;
1947
1948 { .mfb
1949       nop.m 999
1950 (p9)  fadd.s0 Result = Result, Result_lo      // Compute complete result
1951       br.ret.spnt   b0                        // Exit for y not inf, x=0,inf
1952 }
1953 ;;
1954
1955 GLOBAL_IEEE754_END(atan2l)
1956
1957 LOCAL_LIBM_ENTRY(__libm_error_region)
1958 .prologue
1959 { .mfi
1960         add   GR_Parameter_Y=-32,sp             // Parameter 2 value
1961         nop.f 0
1962 .save   ar.pfs,GR_SAVE_PFS
1963         mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
1964 }
1965 { .mfi
1966 .fframe 64
1967         add sp=-64,sp                           // Create new stack
1968         nop.f 0
1969         mov GR_SAVE_GP=gp                       // Save gp
1970 };;
1971 { .mmi
1972         stfe [GR_Parameter_Y] = FR_Y,16         // Save Parameter 2 on stack
1973         add GR_Parameter_X = 16,sp              // Parameter 1 address
1974 .save   b0, GR_SAVE_B0
1975         mov GR_SAVE_B0=b0                       // Save b0
1976 };;
1977 .body
1978 { .mib
1979         stfe [GR_Parameter_X] = FR_X            // Store Parameter 1 on stack
1980         add   GR_Parameter_RESULT = 0,GR_Parameter_Y
1981         nop.b 0                                 // Parameter 3 address
1982 }
1983 { .mib
1984         stfe [GR_Parameter_Y] = FR_RESULT      // Store Parameter 3 on stack
1985         add   GR_Parameter_Y = -16,GR_Parameter_Y
1986         br.call.sptk b0=__libm_error_support#  // Call error handling function
1987 };;
1988 { .mmi
1989         nop.m 0
1990         nop.m 0
1991         add   GR_Parameter_RESULT = 48,sp
1992 };;
1993 { .mmi
1994         ldfe  f8 = [GR_Parameter_RESULT]       // Get return result off stack
1995 .restore sp
1996         add   sp = 64,sp                       // Restore stack pointer
1997         mov   b0 = GR_SAVE_B0                  // Restore return address
1998 };;
1999 { .mib
2000         mov   gp = GR_SAVE_GP                  // Restore gp
2001         mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
2002         br.ret.sptk     b0                     // Return
2003 };;
2004
2005 LOCAL_LIBM_END(__libm_error_region#)
2006 .type   __libm_error_support#,@function
2007 .global __libm_error_support#