sysdeps/ia64/fpu/s_tanl.S

   1 .file "tanl.s"
   2
   3 // Copyright (C) 2000, 2001, Intel Corporation
   4 // All rights reserved.
   5 //
   6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
   7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
   8 //
   9 // Redistribution and use in source and binary forms, with or without
  10 // modification, are permitted provided that the following conditions are
  11 // met:
  12 //
  13 // * Redistributions of source code must retain the above copyright
  14 // notice, this list of conditions and the following disclaimer.
  15 //
  16 // * Redistributions in binary form must reproduce the above copyright
  17 // notice, this list of conditions and the following disclaimer in the
  18 // documentation and/or other materials provided with the distribution.
  19 //
  20 // * The name of Intel Corporation may not be used to endorse or promote
  21 // products derived from this software without specific prior written
  22 // permission.
  23 //
  24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35 //
  36 // Intel Corporation is the author of this code, and requests that all
  37 // problem reports or change requests be submitted to it directly at
  38 // http://developer.intel.com/opensource.
  39 //
  40 // *********************************************************************
  41 //
  42 // History:
  43 //
  44 // 2/02/2000 (hand-optimized)
  45 // 4/04/00  Unwind support added
  46 // 12/28/00 Fixed false invalid flags
  47 //
  48 // *********************************************************************
  49 //
  50 // Function:   tanl(x) = tangent(x), for double-extended precision x values
  51 //
  52 // *********************************************************************
  53 //
  54 // Resources Used:
  55 //
  56 //    Floating-Point Registers: f8 (Input and Return Value)
  57 //                              f9-f15
  58 //                              f32-f112
  59 //
  60 //    General Purpose Registers:
  61 //      r32-r48
  62 //      r49-r50 (Used to pass arguments to pi_by_2 reduce routine)
  63 //
  64 //    Predicate Registers:      p6-p15
  65 //
  66 // *********************************************************************
  67 //
  68 // IEEE Special Conditions:
  69 //
  70 //    Denormal  fault raised on denormal inputs
  71 //    Overflow exceptions do not occur
  72 //    Underflow exceptions raised when appropriate for tan
  73 //    (No specialized error handling for this routine)
  74 //    Inexact raised when appropriate by algorithm
  75 //
  76 //    tan(SNaN) = QNaN
  77 //    tan(QNaN) = QNaN
  78 //    tan(inf) = QNaN
  79 //    tan(+/-0) = +/-0
  80 //
  81 // *********************************************************************
  82 //
  83 // Mathematical Description
  84 //
  85 // We consider the computation of FPTANL of Arg. Now, given
  86 //
  87 //      Arg = N pi/2  + alpha,          |alpha| <= pi/4,
  88 //
  89 // basic mathematical relationship shows that
  90 //
  91 //      tan( Arg ) =  tan( alpha )     if N is even;
  92 //                 = -cot( alpha )      otherwise.
  93 //
  94 // The value of alpha is obtained by argument reduction and
  95 // represented by two working precision numbers r and c where
  96 //
  97 //      alpha =  r  +  c     accurately.
  98 //
  99 // The reduction method is described in a previous write up.
 100 // The argument reduction scheme identifies 4 cases. For Cases 2
 101 // and 4, because |alpha| is small, tan(r+c) and -cot(r+c) can be
 102 // computed very easily by 2 or 3 terms of the Taylor series
 103 // expansion as follows:
 104 //
 105 // Case 2:
 106 // -------
 107 //
 108 //      tan(r + c) = r + c + r^3/3          ...accurately
 109 //        -cot(r + c) = -1/(r+c) + r/3          ...accurately
 110 //
 111 // Case 4:
 112 // -------
 113 //
 114 //      tan(r + c) = r + c + r^3/3 + 2r^5/15     ...accurately
 115 //        -cot(r + c) = -1/(r+c) + r/3 + r^3/45     ...accurately
 116 //
 117 //
 118 // The only cases left are Cases 1 and 3 of the argument reduction
 119 // procedure. These two cases will be merged since after the
 120 // argument is reduced in either cases, we have the reduced argument
 121 // represented as r + c and that the magnitude |r + c| is not small
 122 // enough to allow the usage of a very short approximation.
 123 //
 124 // The greatest challenge of this task is that the second terms of
 125 // the Taylor series for tan(r) and -cot(r)
 126 //
 127 //      r + r^3/3 + 2 r^5/15 + ...
 128 //
 129 // and
 130 //
 131 //      -1/r + r/3 + r^3/45 + ...
 132 //
 133 // are not very small when |r| is close to pi/4 and the rounding
 134 // errors will be a concern if simple polynomial accumulation is
 135 // used. When |r| < 2^(-2), however, the second terms will be small
 136 // enough (5 bits or so of right shift) that a normal Horner
 137 // recurrence suffices. Hence there are two cases that we consider
 138 // in the accurate computation of tan(r) and cot(r), |r| <= pi/4.
 139 //
 140 // Case small_r: |r| < 2^(-2)
 141 // --------------------------
 142 //
 143 // Since Arg = N pi/4 + r + c accurately, we have
 144 //
 145 //      tan(Arg) =  tan(r+c)            for N even,
 146 //            = -cot(r+c)          otherwise.
 147 //
 148 // Here for this case, both tan(r) and -cot(r) can be approximated
 149 // by simple polynomials:
 150 //
 151 //      tan(r) =    r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 152 //        -cot(r) = -1/r + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 153 //
 154 // accurately. Since |r| is relatively small, tan(r+c) and
 155 // -cot(r+c) can be accurately approximated by replacing r with
 156 // r+c only in the first two terms of the corresponding polynomials.
 157 //
 158 // Note that P1_1 (and Q1_1 for that matter) approximates 1/3 to
 159 // almost 64 sig. bits, thus
 160 //
 161 //      P1_1 (r+c)^3 =  P1_1 r^3 + c * r^2     accurately.
 162 //
 163 // Hence,
 164 //
 165 //      tan(r+c) =    r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 166 //                     + c*(1 + r^2)
 167 //
 168 //        -cot(r+c) = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 169 //               + Q1_1*c
 170 //
 171 //
 172 // Case normal_r: 2^(-2) <= |r| <= pi/4
 173 // ------------------------------------
 174 //
 175 // This case is more likely than the previous one if one considers
 176 // r to be uniformly distributed in [-pi/4 pi/4].
 177 //
 178 // The required calculation is either
 179 //
 180 //      tan(r + c)  =  tan(r)  +  correction,  or
 181 //        -cot(r + c)  = -cot(r)  +  correction.
 182 //
 183 // Specifically,
 184 //
 185 //      tan(r + c) =  tan(r) + c tan'(r)  + O(c^2)
 186 //              =  tan(r) + c sec^2(r) + O(c^2)
 187 //              =  tan(r) + c SEC_sq     ...accurately
 188 //                as long as SEC_sq approximates sec^2(r)
 189 //                to, say, 5 bits or so.
 190 //
 191 // Similarly,
 192 //
 193 //        -cot(r + c) = -cot(r) - c cot'(r)  + O(c^2)
 194 //              = -cot(r) + c csc^2(r) + O(c^2)
 195 //              = -cot(r) + c CSC_sq     ...accurately
 196 //                as long as CSC_sq approximates csc^2(r)
 197 //                to, say, 5 bits or so.
 198 //
 199 // We therefore concentrate on accurately calculating tan(r) and
 200 // cot(r) for a working-precision number r, |r| <= pi/4 to within
 201 // 0.1% or so.
 202 //
 203 // We will employ a table-driven approach. Let
 204 //
 205 //      r = sgn_r * 2^k * 1.b_1 b_2 ... b_5 ... b_63
 206 //        = sgn_r * ( B + x )
 207 //
 208 // where
 209 //
 210 //      B = 2^k * 1.b_1 b_2 ... b_5 1
 211 //         x = |r| - B
 212 //
 213 // Now,
 214 //                   tan(B)  +   tan(x)
 215 //      tan( B + x ) =  ------------------------
 216 //                   1 -  tan(B)*tan(x)
 217 //
 218 //               /                         \
 219 //               |   tan(B)  +   tan(x)          |
 220
 221 //      = tan(B) +  | ------------------------ - tan(B) |
 222 //               |     1 -  tan(B)*tan(x)          |
 223 //               \                         /
 224 //
 225 //                 sec^2(B) * tan(x)
 226 //      = tan(B) + ------------------------
 227 //                 1 -  tan(B)*tan(x)
 228 //
 229 //                (1/[sin(B)*cos(B)]) * tan(x)
 230 //      = tan(B) + --------------------------------
 231 //                      cot(B)  -  tan(x)
 232 //
 233 //
 234 // Clearly, the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
 235 // calculated beforehand and stored in a table. Since
 236 //
 237 //      |x| <= 2^k * 2^(-6)  <= 2^(-7)  (because k = -1, -2)
 238 //
 239 // a very short polynomial will be sufficient to approximate tan(x)
 240 // accurately. The details involved in computing the last expression
 241 // will be given in the next section on algorithm description.
 242 //
 243 //
 244 // Now, we turn to the case where cot( B + x ) is needed.
 245 //
 246 //
 247 //                   1 - tan(B)*tan(x)
 248 //      cot( B + x ) =  ------------------------
 249 //                   tan(B)  +  tan(x)
 250 //
 251 //               /                           \
 252 //               |   1 - tan(B)*tan(x)              |
 253
 254 //      = cot(B) +  | ----------------------- - cot(B) |
 255 //               |     tan(B)  +  tan(x)            |
 256 //               \                           /
 257 //
 258 //               [tan(B) + cot(B)] * tan(x)
 259 //      = cot(B) - ----------------------------
 260 //                   tan(B)  +  tan(x)
 261 //
 262 //                (1/[sin(B)*cos(B)]) * tan(x)
 263 //      = cot(B) - --------------------------------
 264 //                      tan(B)  +  tan(x)
 265 //
 266 //
 267 // Note that the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) that
 268 // are needed are the same set of values needed in the previous
 269 // case.
 270 //
 271 // Finally, we can put all the ingredients together as follows:
 272 //
 273 //      Arg = N * pi/2 +  r + c          ...accurately
 274 //
 275 //      tan(Arg) =  tan(r) + correction    if N is even;
 276 //            = -cot(r) + correction    otherwise.
 277 //
 278 // For Cases 2 and 4,
 279 //
 280 //     Case 2:
 281 //     tan(Arg) =  tan(r + c) = r + c + r^3/3           N even
 282 //              = -cot(r + c) = -1/(r+c) + r/3           N odd
 283 //     Case 4:
 284 //     tan(Arg) =  tan(r + c) = r + c + r^3/3 + 2r^5/15  N even
 285 //              = -cot(r + c) = -1/(r+c) + r/3 + r^3/45  N odd
 286 //
 287 //
 288 // For Cases 1 and 3,
 289 //
 290 //     Case small_r: |r| < 2^(-2)
 291 //
 292 //      tan(Arg) =  r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 293 //                     + c*(1 + r^2)               N even
 294 //
 295 //                  = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 296 //               + Q1_1*c                    N odd
 297 //
 298 //     Case normal_r: 2^(-2) <= |r| <= pi/4
 299 //
 300 //      tan(Arg) =  tan(r) + c * sec^2(r)     N even
 301 //               = -cot(r) + c * csc^2(r)     otherwise
 302 //
 303 //     For N even,
 304 //
 305 //      tan(Arg) = tan(r) + c*sec^2(r)
 306 //               = tan( sgn_r * (B+x) ) + c * sec^2(|r|)
 307 //                  = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(|r|) )
 308 //                  = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(B) )
 309 //
 310 // since B approximates |r| to 2^(-6) in relative accuracy.
 311 //
 312 //                 /            (1/[sin(B)*cos(B)]) * tan(x)
 313 //    tan(Arg) = sgn_r * | tan(B) + --------------------------------
 314 //                 \                     cot(B)  -  tan(x)
 315 //                                        \
 316 //                       + CORR  |
 317
 318 //                                     /
 319 // where
 320 //
 321 //    CORR = sgn_r*c*tan(B)*SC_inv(B);  SC_inv(B) = 1/(sin(B)*cos(B)).
 322 //
 323 // For N odd,
 324 //
 325 //      tan(Arg) = -cot(r) + c*csc^2(r)
 326 //               = -cot( sgn_r * (B+x) ) + c * csc^2(|r|)
 327 //                  = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(|r|) )
 328 //                  = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(B) )
 329 //
 330 // since B approximates |r| to 2^(-6) in relative accuracy.
 331 //
 332 //                 /            (1/[sin(B)*cos(B)]) * tan(x)
 333 //    tan(Arg) = sgn_r * | -cot(B) + --------------------------------
 334 //                 \                     tan(B)  +  tan(x)
 335 //                                        \
 336 //                       + CORR  |
 337
 338 //                                     /
 339 // where
 340 //
 341 //    CORR = sgn_r*c*cot(B)*SC_inv(B);  SC_inv(B) = 1/(sin(B)*cos(B)).
 342 //
 343 //
 344 // The actual algorithm prescribes how all the mathematical formulas
 345 // are calculated.
 346 //
 347 //
 348 // 2. Algorithmic Description
 349 // ==========================
 350 //
 351 // 2.1 Computation for Cases 2 and 4.
 352 // ----------------------------------
 353 //
 354 // For Case 2, we use two-term polynomials.
 355 //
 356 //    For N even,
 357 //
 358 //    rsq := r * r
 359 //    Result := c + r * rsq * P1_1
 360 //    Result := r + Result          ...in user-defined rounding
 361 //
 362 //    For N odd,
 363 //    S_hi  := -frcpa(r)               ...8 bits
 364 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
 365 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
 366 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
 367 //    S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
 368 //    ...S_hi + S_lo is -1/(r+c) to extra precision
 369 //    S_lo  := S_lo + Q1_1*r
 370 //
 371 //    Result := S_hi + S_lo     ...in user-defined rounding
 372 //
 373 // For Case 4, we use three-term polynomials
 374 //
 375 //    For N even,
 376 //
 377 //    rsq := r * r
 378 //    Result := c + r * rsq * (P1_1 + rsq * P1_2)
 379 //    Result := r + Result          ...in user-defined rounding
 380 //
 381 //    For N odd,
 382 //    S_hi  := -frcpa(r)               ...8 bits
 383 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
 384 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
 385 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
 386 //    S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
 387 //    ...S_hi + S_lo is -1/(r+c) to extra precision
 388 //    rsq   := r * r
 389 //    P      := Q1_1 + rsq*Q1_2
 390 //    S_lo  := S_lo + r*P
 391 //
 392 //    Result := S_hi + S_lo     ...in user-defined rounding
 393 //
 394 //
 395 // Note that the coefficients P1_1, P1_2, Q1_1, and Q1_2 are
 396 // the same as those used in the small_r case of Cases 1 and 3
 397 // below.
 398 //
 399 //
 400 // 2.2 Computation for Cases 1 and 3.
 401 // ----------------------------------
 402 // This is further divided into the case of small_r,
 403 // where |r| < 2^(-2), and the case of normal_r, where |r| lies between
 404 // 2^(-2) and pi/4.
 405 //
 406 // Algorithm for the case of small_r
 407 // ---------------------------------
 408 //
 409 // For N even,
 410 //      rsq   := r * r
 411 //      Poly1 := rsq*(P1_1 + rsq*(P1_2 + rsq*P1_3))
 412 //      r_to_the_8    := rsq * rsq
 413 //      r_to_the_8    := r_to_the_8 * r_to_the_8
 414 //      Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... rsq*P1_9))
 415 //      CORR  := c * ( 1 + rsq )
 416 //      Poly  := Poly1 + r_to_the_8*Poly2
 417 //      Result := r*Poly + CORR
 418 //      Result := r + Result     ...in user-defined rounding
 419 //      ...note that Poly1 and r_to_the_8 can be computed in parallel
 420 //      ...with Poly2 (Poly1 is intentionally set to be much
 421 //      ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden)
 422 //
 423 // For N odd,
 424 //      S_hi  := -frcpa(r)               ...8 bits
 425 //      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
 426 //      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
 427 //      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
 428 //      S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
 429 //      ...S_hi + S_lo is -1/(r+c) to extra precision
 430 //      S_lo  := S_lo + Q1_1*c
 431 //
 432 //      ...S_hi and S_lo are computed in parallel with
 433 //      ...the following
 434 //      rsq := r*r
 435 //      P   := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7))
 436 //
 437 //      Result :=  r*P + S_lo
 438 //      Result :=  S_hi  +  Result      ...in user-defined rounding
 439 //
 440 //
 441 // Algorithm for the case of normal_r
 442 // ----------------------------------
 443 //
 444 // Here, we first consider the computation of tan( r + c ). As
 445 // presented in the previous section,
 446 //
 447 //      tan( r + c )  =  tan(r) + c * sec^2(r)
 448 //                 =  sgn_r * [ tan(B+x) + CORR ]
 449 //      CORR = sgn_r * c * tan(B) * 1/[sin(B)*cos(B)]
 450 //
 451 // because sec^2(r) = sec^(|r|), and B approximate |r| to 6.5 bits.
 452 //
 453 //      tan( r + c ) =
 454 //           /           (1/[sin(B)*cos(B)]) * tan(x)
 455 //      sgn_r * | tan(B) + --------------------------------  +
 456 //           \                     cot(B)  -  tan(x)
 457 //                                \
 458 //                          CORR  |
 459
 460 //                                /
 461 //
 462 // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
 463 // calculated beforehand and stored in a table. Specifically,
 464 // the table values are
 465 //
 466 //      tan(B)                as  T_hi  +  T_lo;
 467 //      cot(B)             as  C_hi  +  C_lo;
 468 //      1/[sin(B)*cos(B)]  as  SC_inv
 469 //
 470 // T_hi, C_hi are in  double-precision  memory format;
 471 // T_lo, C_lo are in  single-precision  memory format;
 472 // SC_inv     is  in extended-precision memory format.
 473 //
 474 // The value of tan(x) will be approximated by a short polynomial of
 475 // the form
 476 //
 477 //      tan(x)  as  x  +  x * P, where
 478 //           P  =   x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
 479 //
 480 // Because |x| <= 2^(-7), cot(B) - x approximates cot(B) - tan(x)
 481 // to a relative accuracy better than 2^(-20). Thus, a good
 482 // initial guess of 1/( cot(B) - tan(x) ) to initiate the iterative
 483 // division is:
 484 //
 485 //      1/(cot(B) - tan(x))      is approximately
 486 //      1/(cot(B) -   x)         is
 487 //      tan(B)/(1 - x*tan(B))    is approximately
 488 //      T_hi / ( 1 - T_hi * x )  is approximately
 489 //
 490 //      T_hi * [ 1 + (Thi * x) + (T_hi * x)^2 ]
 491 //
 492 // The calculation of tan(r+c) therefore proceed as follows:
 493 //
 494 //      Tx     := T_hi * x
 495 //      xsq     := x * x
 496 //
 497 //      V_hi     := T_hi*(1 + Tx*(1 + Tx))
 498 //      P     := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
 499 //      ...V_hi serves as an initial guess of 1/(cot(B) - tan(x))
 500 //         ...good to about 20 bits of accuracy
 501 //
 502 //      tanx     := x + x*P
 503 //      D     := C_hi - tanx
 504 //      ...D is a double precision denominator: cot(B) - tan(x)
 505 //
 506 //      V_hi     := V_hi + V_hi*(1 - V_hi*D)
 507 //      ....V_hi approximates 1/(cot(B)-tan(x)) to 40 bits
 508 //
 509 //      V_lo     := V_hi * ( [ (1 - V_hi*C_hi) + V_hi*tanx ]
 510 //                           - V_hi*C_lo )   ...observe all order
 511 //         ...V_hi + V_lo approximates 1/(cot(B) - tan(x))
 512 //      ...to extra accuracy
 513 //
 514 //      ...               SC_inv(B) * (x + x*P)
 515 //      ...   tan(B) +      ------------------------- + CORR
 516 //         ...                cot(B) - (x + x*P)
 517 //      ...
 518 //      ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 519 //      ...
 520 //
 521 //      Sx     := SC_inv * x
 522 //      CORR     := sgn_r * c * SC_inv * T_hi
 523 //
 524 //      ...put the ingredients together to compute
 525 //      ...               SC_inv(B) * (x + x*P)
 526 //      ...   tan(B) +      ------------------------- + CORR
 527 //         ...                cot(B) - (x + x*P)
 528 //      ...
 529 //      ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 530 //      ...
 531 //      ... = T_hi + T_lo + CORR +
 532 //      ...    Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
 533 //
 534 //      CORR := CORR + T_lo
 535 //      tail := V_lo + P*(V_hi + V_lo)
 536 //         tail := Sx * tail  +  CORR
 537 //      tail := Sx * V_hi  +  tail
 538 //         T_hi := sgn_r * T_hi
 539 //
 540 //         ...T_hi + sgn_r*tail  now approximate
 541 //      ...sgn_r*(tan(B+x) + CORR) accurately
 542 //
 543 //      Result :=  T_hi + sgn_r*tail  ...in user-defined
 544 //                           ...rounding control
 545 //      ...It is crucial that independent paths be fully
 546 //      ...exploited for performance's sake.
 547 //
 548 //
 549 // Next, we consider the computation of -cot( r + c ). As
 550 // presented in the previous section,
 551 //
 552 //        -cot( r + c )  =  -cot(r) + c * csc^2(r)
 553 //                 =  sgn_r * [ -cot(B+x) + CORR ]
 554 //      CORR = sgn_r * c * cot(B) * 1/[sin(B)*cos(B)]
 555 //
 556 // because csc^2(r) = csc^(|r|), and B approximate |r| to 6.5 bits.
 557 //
 558 //        -cot( r + c ) =
 559 //           /             (1/[sin(B)*cos(B)]) * tan(x)
 560 //      sgn_r * | -cot(B) + --------------------------------  +
 561 //           \                     tan(B)  +  tan(x)
 562 //                                \
 563 //                          CORR  |
 564
 565 //                                /
 566 //
 567 // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
 568 // calculated beforehand and stored in a table. Specifically,
 569 // the table values are
 570 //
 571 //      tan(B)                as  T_hi  +  T_lo;
 572 //      cot(B)             as  C_hi  +  C_lo;
 573 //      1/[sin(B)*cos(B)]  as  SC_inv
 574 //
 575 // T_hi, C_hi are in  double-precision  memory format;
 576 // T_lo, C_lo are in  single-precision  memory format;
 577 // SC_inv     is  in extended-precision memory format.
 578 //
 579 // The value of tan(x) will be approximated by a short polynomial of
 580 // the form
 581 //
 582 //      tan(x)  as  x  +  x * P, where
 583 //           P  =   x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
 584 //
 585 // Because |x| <= 2^(-7), tan(B) + x approximates tan(B) + tan(x)
 586 // to a relative accuracy better than 2^(-18). Thus, a good
 587 // initial guess of 1/( tan(B) + tan(x) ) to initiate the iterative
 588 // division is:
 589 //
 590 //      1/(tan(B) + tan(x))      is approximately
 591 //      1/(tan(B) +   x)         is
 592 //      cot(B)/(1 + x*cot(B))    is approximately
 593 //      C_hi / ( 1 + C_hi * x )  is approximately
 594 //
 595 //      C_hi * [ 1 - (C_hi * x) + (C_hi * x)^2 ]
 596 //
 597 // The calculation of -cot(r+c) therefore proceed as follows:
 598 //
 599 //      Cx     := C_hi * x
 600 //      xsq     := x * x
 601 //
 602 //      V_hi     := C_hi*(1 - Cx*(1 - Cx))
 603 //      P     := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
 604 //      ...V_hi serves as an initial guess of 1/(tan(B) + tan(x))
 605 //         ...good to about 18 bits of accuracy
 606 //
 607 //      tanx     := x + x*P
 608 //      D     := T_hi + tanx
 609 //      ...D is a double precision denominator: tan(B) + tan(x)
 610 //
 611 //      V_hi     := V_hi + V_hi*(1 - V_hi*D)
 612 //      ....V_hi approximates 1/(tan(B)+tan(x)) to 40 bits
 613 //
 614 //      V_lo     := V_hi * ( [ (1 - V_hi*T_hi) - V_hi*tanx ]
 615 //                           - V_hi*T_lo )   ...observe all order
 616 //         ...V_hi + V_lo approximates 1/(tan(B) + tan(x))
 617 //      ...to extra accuracy
 618 //
 619 //      ...               SC_inv(B) * (x + x*P)
 620 //      ...  -cot(B) +      ------------------------- + CORR
 621 //         ...                tan(B) + (x + x*P)
 622 //      ...
 623 //      ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 624 //      ...
 625 //
 626 //      Sx     := SC_inv * x
 627 //      CORR     := sgn_r * c * SC_inv * C_hi
 628 //
 629 //      ...put the ingredients together to compute
 630 //      ...               SC_inv(B) * (x + x*P)
 631 //      ...  -cot(B) +      ------------------------- + CORR
 632 //         ...                tan(B) + (x + x*P)
 633 //      ...
 634 //      ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 635 //      ...
 636 //      ... =-C_hi - C_lo + CORR +
 637 //      ...    Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
 638 //
 639 //      CORR := CORR - C_lo
 640 //      tail := V_lo + P*(V_hi + V_lo)
 641 //         tail := Sx * tail  +  CORR
 642 //      tail := Sx * V_hi  +  tail
 643 //         C_hi := -sgn_r * C_hi
 644 //
 645 //         ...C_hi + sgn_r*tail now approximates
 646 //      ...sgn_r*(-cot(B+x) + CORR) accurately
 647 //
 648 //      Result :=  C_hi + sgn_r*tail   in user-defined rounding control
 649 //      ...It is crucial that independent paths be fully
 650 //      ...exploited for performance's sake.
 651 //
 652 // 3. Implementation Notes
 653 // =======================
 654 //
 655 //   Table entries T_hi, T_lo; C_hi, C_lo; SC_inv
 656 //
 657 //   Recall that 2^(-2) <= |r| <= pi/4;
 658 //
 659 //      r = sgn_r * 2^k * 1.b_1 b_2 ... b_63
 660 //
 661 //   and
 662 //
 663 //        B = 2^k * 1.b_1 b_2 b_3 b_4 b_5 1
 664 //
 665 //   Thus, for k = -2, possible values of B are
 666 //
 667 //          B = 2^(-2) * ( 1 + index/32  +  1/64 ),
 668 //      index ranges from 0 to 31
 669 //
 670 //   For k = -1, however, since |r| <= pi/4 = 0.78...
 671 //   possible values of B are
 672 //
 673 //        B = 2^(-1) * ( 1 + index/32  +  1/64 )
 674 //      index ranges from 0 to 19.
 675 //
 676 //
 677
 678 #include "libm_support.h"
 679
 680 #ifdef _LIBC
 681 .rodata
 682 #else
 683 .data
 684 #endif
 685 .align 128
 686
 687 TANL_BASE_CONSTANTS:
 688 ASM_TYPE_DIRECTIVE(TANL_BASE_CONSTANTS,@object)
 689 data4    0x4B800000, 0xCB800000, 0x38800000, 0xB8800000 // two**24, -two**24
 690                                                         // two**-14, -two**-14
 691 data4    0x4E44152A, 0xA2F9836E, 0x00003FFE, 0x00000000 // two_by_pi
 692 data4    0xCE81B9F1, 0xC84D32B0, 0x00004016, 0x00000000 // P_0
 693 data4    0x2168C235, 0xC90FDAA2, 0x00003FFF, 0x00000000 // P_1
 694 data4    0xFC8F8CBB, 0xECE675D1, 0x0000BFBD, 0x00000000 // P_2
 695 data4    0xACC19C60, 0xB7ED8FBB, 0x0000BF7C, 0x00000000 // P_3
 696 data4    0x5F000000, 0xDF000000, 0x00000000, 0x00000000 // two_to_63, -two_to_63
 697 data4    0x6EC6B45A, 0xA397E504, 0x00003FE7, 0x00000000 // Inv_P_0
 698 data4    0xDBD171A1, 0x8D848E89, 0x0000BFBF, 0x00000000 // d_1
 699 data4    0x18A66F8E, 0xD5394C36, 0x0000BF7C, 0x00000000 // d_2
 700 data4    0x2168C234, 0xC90FDAA2, 0x00003FFE, 0x00000000 // PI_BY_4
 701 data4    0x2168C234, 0xC90FDAA2, 0x0000BFFE, 0x00000000 // MPI_BY_4
 702 data4    0x3E800000, 0xBE800000, 0x00000000, 0x00000000 // two**-2, -two**-2
 703 data4    0x2F000000, 0xAF000000, 0x00000000, 0x00000000 // two**-33, -two**-33
 704 data4    0xAAAAAABD, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P1_1
 705 data4    0x88882E6A, 0x88888888, 0x00003FFC, 0x00000000 // P1_2
 706 data4    0x0F0177B6, 0xDD0DD0DD, 0x00003FFA, 0x00000000 // P1_3
 707 data4    0x646B8C6D, 0xB327A440, 0x00003FF9, 0x00000000 // P1_4
 708 data4    0x1D5F7D20, 0x91371B25, 0x00003FF8, 0x00000000 // P1_5
 709 data4    0x61C67914, 0xEB69A5F1, 0x00003FF6, 0x00000000 // P1_6
 710 data4    0x019318D2, 0xBEDD37BE, 0x00003FF5, 0x00000000 // P1_7
 711 data4    0x3C794015, 0x9979B146, 0x00003FF4, 0x00000000 // P1_8
 712 data4    0x8C6EB58A, 0x8EBD21A3, 0x00003FF3, 0x00000000 // P1_9
 713 data4    0xAAAAAAB4, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // Q1_1
 714 data4    0x0B5FC93E, 0xB60B60B6, 0x00003FF9, 0x00000000 // Q1_2
 715 data4    0x0C9BBFBF, 0x8AB355E0, 0x00003FF6, 0x00000000 // Q1_3
 716 data4    0xCBEE3D4C, 0xDDEBBC89, 0x00003FF2, 0x00000000 // Q1_4
 717 data4    0x5F80BBB6, 0xB3548A68, 0x00003FEF, 0x00000000 // Q1_5
 718 data4    0x4CED5BF1, 0x91362560, 0x00003FEC, 0x00000000 // Q1_6
 719 data4    0x8EE92A83, 0xF189D95A, 0x00003FE8, 0x00000000 // Q1_7
 720 data4    0xAAAB362F, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P2_1
 721 data4    0xE97A6097, 0x88888886, 0x00003FFC, 0x00000000 // P2_2
 722 data4    0x25E716A1, 0xDD108EE0, 0x00003FFA, 0x00000000 // P2_3
 723 //
 724 //  Entries T_hi   double-precision memory format
 725 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 726 //  Entries T_lo  single-precision memory format
 727 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 728 //
 729 data4    0x62400794, 0x3FD09BC3, 0x23A05C32, 0x00000000
 730 data4    0xDFFBC074, 0x3FD124A9, 0x240078B2, 0x00000000
 731 data4    0x5BD4920F, 0x3FD1AE23, 0x23826B8E, 0x00000000
 732 data4    0x15E2701D, 0x3FD23835, 0x22D31154, 0x00000000
 733 data4    0x63739C2D, 0x3FD2C2E4, 0x2265C9E2, 0x00000000
 734 data4    0xAFEEA48B, 0x3FD34E36, 0x245C05EB, 0x00000000
 735 data4    0x7DBB35D1, 0x3FD3DA31, 0x24749F2D, 0x00000000
 736 data4    0x67321619, 0x3FD466DA, 0x2462CECE, 0x00000000
 737 data4    0x1F94A4D5, 0x3FD4F437, 0x246D0DF1, 0x00000000
 738 data4    0x740C3E6D, 0x3FD5824D, 0x240A85B5, 0x00000000
 739 data4    0x4CB1E73D, 0x3FD61123, 0x23F96E33, 0x00000000
 740 data4    0xAD9EA64B, 0x3FD6A0BE, 0x247C5393, 0x00000000
 741 data4    0xB804FD01, 0x3FD73125, 0x241F3B29, 0x00000000
 742 data4    0xAB53EE83, 0x3FD7C25E, 0x2479989B, 0x00000000
 743 data4    0xE6640EED, 0x3FD8546F, 0x23B343BC, 0x00000000
 744 data4    0xE8AF1892, 0x3FD8E75F, 0x241454D1, 0x00000000
 745 data4    0x53928BDA, 0x3FD97B35, 0x238613D9, 0x00000000
 746 data4    0xEB9DE4DE, 0x3FDA0FF6, 0x22859FA7, 0x00000000
 747 data4    0x99ECF92D, 0x3FDAA5AB, 0x237A6D06, 0x00000000
 748 data4    0x6D8F1796, 0x3FDB3C5A, 0x23952F6C, 0x00000000
 749 data4    0x9CFB8BE4, 0x3FDBD40A, 0x2280FC95, 0x00000000
 750 data4    0x87943100, 0x3FDC6CC3, 0x245D2EC0, 0x00000000
 751 data4    0xB736C500, 0x3FDD068C, 0x23C4AD7D, 0x00000000
 752 data4    0xE1DDBC31, 0x3FDDA16D, 0x23D076E6, 0x00000000
 753 data4    0xEB515A93, 0x3FDE3D6E, 0x244809A6, 0x00000000
 754 data4    0xE6E9E5F1, 0x3FDEDA97, 0x220856C8, 0x00000000
 755 data4    0x1963CE69, 0x3FDF78F1, 0x244BE993, 0x00000000
 756 data4    0x7D635BCE, 0x3FE00C41, 0x23D21799, 0x00000000
 757 data4    0x1C302CD3, 0x3FE05CAB, 0x248A1B1D, 0x00000000
 758 data4    0xDB6A1FA0, 0x3FE0ADB9, 0x23D53E33, 0x00000000
 759 data4    0x4A20BA81, 0x3FE0FF72, 0x24DB9ED5, 0x00000000
 760 data4    0x153FA6F5, 0x3FE151D9, 0x24E9E451, 0x00000000
 761 //
 762 //  Entries T_hi   double-precision memory format
 763 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 764 //  Entries T_lo  single-precision memory format
 765 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 766 //
 767 data4    0xBA1BE39E, 0x3FE1CEC4, 0x24B60F9E, 0x00000000
 768 data4    0x5ABD9B2D, 0x3FE277E4, 0x248C2474, 0x00000000
 769 data4    0x0272B110, 0x3FE32418, 0x247B8311, 0x00000000
 770 data4    0x890E2DF0, 0x3FE3D38B, 0x24C55751, 0x00000000
 771 data4    0x46236871, 0x3FE4866D, 0x24E5BC34, 0x00000000
 772 data4    0x45E044B0, 0x3FE53CEE, 0x24001BA4, 0x00000000
 773 data4    0x82EC06E4, 0x3FE5F742, 0x24B973DC, 0x00000000
 774 data4    0x25DF43F9, 0x3FE6B5A1, 0x24895440, 0x00000000
 775 data4    0xCAFD348C, 0x3FE77844, 0x240021CA, 0x00000000
 776 data4    0xCEED6B92, 0x3FE83F6B, 0x24C45372, 0x00000000
 777 data4    0xA34F3665, 0x3FE90B58, 0x240DAD33, 0x00000000
 778 data4    0x2C1E56B4, 0x3FE9DC52, 0x24F846CE, 0x00000000
 779 data4    0x27041578, 0x3FEAB2A4, 0x2323FB6E, 0x00000000
 780 data4    0x9DD8C373, 0x3FEB8E9F, 0x24B3090B, 0x00000000
 781 data4    0x65C9AA7B, 0x3FEC709B, 0x2449F611, 0x00000000
 782 data4    0xACCF8435, 0x3FED58F4, 0x23616A7E, 0x00000000
 783 data4    0x97635082, 0x3FEE480F, 0x24C2FEAE, 0x00000000
 784 data4    0xF0ACC544, 0x3FEF3E57, 0x242CE964, 0x00000000
 785 data4    0xF7E06E4B, 0x3FF01E20, 0x2480D3EE, 0x00000000
 786 data4    0x8A798A69, 0x3FF0A125, 0x24DB8967, 0x00000000
 787 //
 788 //  Entries C_hi   double-precision memory format
 789 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 790 //  Entries C_lo  single-precision memory format
 791 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 792 //
 793 data4    0xE63EFBD0, 0x400ED3E2, 0x259D94D4, 0x00000000
 794 data4    0xC515DAB5, 0x400DDDB4, 0x245F0537, 0x00000000
 795 data4    0xBE19A79F, 0x400CF57A, 0x25D4EA9F, 0x00000000
 796 data4    0xD15298ED, 0x400C1A06, 0x24AE40A0, 0x00000000
 797 data4    0x164B2708, 0x400B4A4C, 0x25A5AAB6, 0x00000000
 798 data4    0x5285B068, 0x400A855A, 0x25524F18, 0x00000000
 799 data4    0x3FFA549F, 0x4009CA5A, 0x24C999C0, 0x00000000
 800 data4    0x646AF623, 0x4009188A, 0x254FD801, 0x00000000
 801 data4    0x6084D0E7, 0x40086F3C, 0x2560F5FD, 0x00000000
 802 data4    0xA29A76EE, 0x4007CDD2, 0x255B9D19, 0x00000000
 803 data4    0x6C8ECA95, 0x400733BE, 0x25CB021B, 0x00000000
 804 data4    0x1F8DDC52, 0x4006A07E, 0x24AB4722, 0x00000000
 805 data4    0xC298AD58, 0x4006139B, 0x252764E2, 0x00000000
 806 data4    0xBAD7164B, 0x40058CAB, 0x24DAF5DB, 0x00000000
 807 data4    0xAE31A5D3, 0x40050B4B, 0x25EA20F4, 0x00000000
 808 data4    0x89F85A8A, 0x40048F21, 0x2583A3E8, 0x00000000
 809 data4    0xA862380D, 0x400417DA, 0x25DCC4CC, 0x00000000
 810 data4    0x1088FCFE, 0x4003A52B, 0x2430A492, 0x00000000
 811 data4    0xCD3527D5, 0x400336CC, 0x255F77CF, 0x00000000
 812 data4    0x5760766D, 0x4002CC7F, 0x25DA0BDA, 0x00000000
 813 data4    0x11CE02E3, 0x40026607, 0x256FF4A2, 0x00000000
 814 data4    0xD37BBE04, 0x4002032C, 0x25208AED, 0x00000000
 815 data4    0x7F050775, 0x4001A3BD, 0x24B72DD6, 0x00000000
 816 data4    0xA554848A, 0x40014789, 0x24AB4DAA, 0x00000000
 817 data4    0x323E81B7, 0x4000EE65, 0x2584C440, 0x00000000
 818 data4    0x21CF1293, 0x40009827, 0x25C9428D, 0x00000000
 819 data4    0x3D415EEB, 0x400044A9, 0x25DC8482, 0x00000000
 820 data4    0xBD72C577, 0x3FFFE78F, 0x257F5070, 0x00000000
 821 data4    0x75EFD28E, 0x3FFF4AC3, 0x23EBBF7A, 0x00000000
 822 data4    0x60B52DDE, 0x3FFEB2AF, 0x22EECA07, 0x00000000
 823 data4    0x35204180, 0x3FFE1F19, 0x24191079, 0x00000000
 824 data4    0x54F7E60A, 0x3FFD8FCA, 0x248D3058, 0x00000000
 825 //
 826 //  Entries C_hi   double-precision memory format
 827 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 828 //  Entries C_lo  single-precision memory format
 829 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 830 //
 831 data4    0x79F6FADE, 0x3FFCC06A, 0x239C7886, 0x00000000
 832 data4    0x891662A6, 0x3FFBB91F, 0x250BD191, 0x00000000
 833 data4    0x529F155D, 0x3FFABFB6, 0x256CC3E6, 0x00000000
 834 data4    0x2E964AE9, 0x3FF9D300, 0x250843E3, 0x00000000
 835 data4    0x89DCB383, 0x3FF8F1EF, 0x2277C87E, 0x00000000
 836 data4    0x7C87DBD6, 0x3FF81B93, 0x256DA6CF, 0x00000000
 837 data4    0x1042EDE4, 0x3FF74F14, 0x2573D28A, 0x00000000
 838 data4    0x1784B360, 0x3FF68BAF, 0x242E489A, 0x00000000
 839 data4    0x7C923C4C, 0x3FF5D0B5, 0x2532D940, 0x00000000
 840 data4    0xF418EF20, 0x3FF51D88, 0x253C7DD6, 0x00000000
 841 data4    0x02F88DAE, 0x3FF4719A, 0x23DB59BF, 0x00000000
 842 data4    0x49DA0788, 0x3FF3CC66, 0x252B4756, 0x00000000
 843 data4    0x0B980DB8, 0x3FF32D77, 0x23FE585F, 0x00000000
 844 data4    0xE56C987A, 0x3FF2945F, 0x25378A63, 0x00000000
 845 data4    0xB16523F6, 0x3FF200BD, 0x247BB2E0, 0x00000000
 846 data4    0x8CE27778, 0x3FF17235, 0x24446538, 0x00000000
 847 data4    0xFDEFE692, 0x3FF0E873, 0x2514638F, 0x00000000
 848 data4    0x33154062, 0x3FF0632C, 0x24A7FC27, 0x00000000
 849 data4    0xB3EF115F, 0x3FEFC42E, 0x248FD0FE, 0x00000000
 850 data4    0x135D26F6, 0x3FEEC9E8, 0x2385C719, 0x00000000
 851 //
 852 //  Entries SC_inv in Swapped IEEE format (extended)
 853 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 854 //
 855 data4    0x1BF30C9E, 0x839D6D4A, 0x00004001, 0x00000000
 856 data4    0x554B0EB0, 0x80092804, 0x00004001, 0x00000000
 857 data4    0xA1CF0DE9, 0xF959F94C, 0x00004000, 0x00000000
 858 data4    0x77378677, 0xF3086BA0, 0x00004000, 0x00000000
 859 data4    0xCCD4723C, 0xED154515, 0x00004000, 0x00000000
 860 data4    0x1C27CF25, 0xE7790944, 0x00004000, 0x00000000
 861 data4    0x8DDACB88, 0xE22D037D, 0x00004000, 0x00000000
 862 data4    0x89C73522, 0xDD2B2D8A, 0x00004000, 0x00000000
 863 data4    0xBB2C1171, 0xD86E1A23, 0x00004000, 0x00000000
 864 data4    0xDFF5E0F9, 0xD3F0E288, 0x00004000, 0x00000000
 865 data4    0x283BEBD5, 0xCFAF16B1, 0x00004000, 0x00000000
 866 data4    0x0D88DD53, 0xCBA4AFAA, 0x00004000, 0x00000000
 867 data4    0xCA67C43D, 0xC7CE03CC, 0x00004000, 0x00000000
 868 data4    0x0CA0DDB0, 0xC427BC82, 0x00004000, 0x00000000
 869 data4    0xF13D8CAB, 0xC0AECD57, 0x00004000, 0x00000000
 870 data4    0x71ECE6B1, 0xBD606C38, 0x00004000, 0x00000000
 871 data4    0xA44C4929, 0xBA3A0A96, 0x00004000, 0x00000000
 872 data4    0xE5CCCEC1, 0xB7394F6F, 0x00004000, 0x00000000
 873 data4    0x9637D8BC, 0xB45C1203, 0x00004000, 0x00000000
 874 data4    0x92CB051B, 0xB1A05528, 0x00004000, 0x00000000
 875 data4    0x6BA2FFD0, 0xAF04432B, 0x00004000, 0x00000000
 876 data4    0x7221235F, 0xAC862A23, 0x00004000, 0x00000000
 877 data4    0x5F00A9D1, 0xAA2478AF, 0x00004000, 0x00000000
 878 data4    0x81E082BF, 0xA7DDBB0C, 0x00004000, 0x00000000
 879 data4    0x45684FEE, 0xA5B0987D, 0x00004000, 0x00000000
 880 data4    0x627A8F53, 0xA39BD0F5, 0x00004000, 0x00000000
 881 data4    0x6EC5C8B0, 0xA19E3B03, 0x00004000, 0x00000000
 882 data4    0x91CD7C66, 0x9FB6C1F0, 0x00004000, 0x00000000
 883 data4    0x1FA3DF8A, 0x9DE46410, 0x00004000, 0x00000000
 884 data4    0xA8F6B888, 0x9C263139, 0x00004000, 0x00000000
 885 data4    0xC27B0450, 0x9A7B4968, 0x00004000, 0x00000000
 886 data4    0x5EE614EE, 0x98E2DB7E, 0x00004000, 0x00000000
 887 //
 888 //  Entries SC_inv in Swapped IEEE format (extended)
 889 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 890 //
 891 data4    0x13B2B5BA, 0x969F335C, 0x00004000, 0x00000000
 892 data4    0xD4C0F548, 0x93D446D9, 0x00004000, 0x00000000
 893 data4    0x61B798AF, 0x9147094F, 0x00004000, 0x00000000
 894 data4    0x758787AC, 0x8EF317CC, 0x00004000, 0x00000000
 895 data4    0xB99EEFDB, 0x8CD498B3, 0x00004000, 0x00000000
 896 data4    0xDFF8BC37, 0x8AE82A7D, 0x00004000, 0x00000000
 897 data4    0xE3C55D42, 0x892AD546, 0x00004000, 0x00000000
 898 data4    0xD15573C1, 0x8799FEA9, 0x00004000, 0x00000000
 899 data4    0x435A4B4C, 0x86335F88, 0x00004000, 0x00000000
 900 data4    0x3E93A87B, 0x84F4FB6E, 0x00004000, 0x00000000
 901 data4    0x80A382FB, 0x83DD1952, 0x00004000, 0x00000000
 902 data4    0xA4CB8C9E, 0x82EA3D7F, 0x00004000, 0x00000000
 903 data4    0x6861D0A8, 0x821B247C, 0x00004000, 0x00000000
 904 data4    0x63E8D244, 0x816EBED1, 0x00004000, 0x00000000
 905 data4    0x27E4CFC6, 0x80E42D91, 0x00004000, 0x00000000
 906 data4    0x28E64AFD, 0x807ABF8D, 0x00004000, 0x00000000
 907 data4    0x863B4FD8, 0x8031EF26, 0x00004000, 0x00000000
 908 data4    0xAE8C11FD, 0x800960AD, 0x00004000, 0x00000000
 909 data4    0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000
 910 data4    0xA07791FA, 0x80186650, 0x00004000, 0x00000000
 911 ASM_SIZE_DIRECTIVE(TANL_BASE_CONSTANTS)
 912
 913 Arg                 = f8
 914 Result              = f8
 915 fp_tmp              = f9
 916 U_2                 = f10
 917 rsq                =  f11
 918 C_hi                = f12
 919 C_lo                = f13
 920 T_hi                = f14
 921 T_lo                = f15
 922
 923 N_0                 = f32
 924 d_1                 = f33
 925 MPI_BY_4            = f34
 926 tail                = f35
 927 tanx                = f36
 928 Cx                  = f37
 929 Sx                  = f38
 930 sgn_r               = f39
 931 CORR                = f40
 932 P                   = f41
 933 D                   = f42
 934 ArgPrime            = f43
 935 P_0                 = f44
 936
 937 P2_1                = f45
 938 P2_2                = f46
 939 P2_3                = f47
 940
 941 P1_1                = f45
 942 P1_2                = f46
 943 P1_3                = f47
 944
 945 P1_4                = f48
 946 P1_5                = f49
 947 P1_6                = f50
 948 P1_7                = f51
 949 P1_8                = f52
 950 P1_9                = f53
 951
 952 TWO_TO_63           = f54
 953 NEGTWO_TO_63        = f55
 954 x                   = f56
 955 xsq                 = f57
 956 Tx                  = f58
 957 Tx1                 = f59
 958 Set                 = f60
 959 poly1               = f61
 960 poly2               = f62
 961 Poly                = f63
 962 Poly1               = f64
 963 Poly2               = f65
 964 r_to_the_8          = f66
 965 B                   = f67
 966 SC_inv              = f68
 967 Pos_r               = f69
 968 N_0_fix             = f70
 969 PI_BY_4             = f71
 970 NEGTWO_TO_NEG2      = f72
 971 TWO_TO_24           = f73
 972 TWO_TO_NEG14        = f74
 973 TWO_TO_NEG33        = f75
 974 NEGTWO_TO_24        = f76
 975 NEGTWO_TO_NEG14     = f76
 976 NEGTWO_TO_NEG33     = f77
 977 two_by_PI           = f78
 978 N                   = f79
 979 N_fix               = f80
 980 P_1                 = f81
 981 P_2                 = f82
 982 P_3                 = f83
 983 s_val               = f84
 984 w                   = f85
 985 c                   = f86
 986 r                   = f87
 987 A                   = f89
 988 a                   = f90
 989 t                   = f91
 990 U_1                 = f92
 991 d_2                 = f93
 992 TWO_TO_NEG2         = f94
 993 Q1_1                = f95
 994 Q1_2                = f96
 995 Q1_3                = f97
 996 Q1_4                = f98
 997 Q1_5                = f99
 998 Q1_6                = f100
 999 Q1_7                = f101
1000 Q1_8                = f102
1001 S_hi                = f103
1002 S_lo                = f104
1003 V_hi                = f105
1004 V_lo                = f106
1005 U_hi                = f107
1006 U_lo                = f108
1007 U_hiabs             = f109
1008 V_hiabs             = f110
1009 V                   = f111
1010 Inv_P_0             = f112
1011
1012 GR_SAVE_B0     = r33
1013 GR_SAVE_GP     = r34
1014 GR_SAVE_PFS    = r35
1015 delta1         = r36
1016 table_ptr1     = r37
1017 table_ptr2     = r38
1018 i_0            = r39
1019 i_1            = r40
1020 N_fix_gr       = r41
1021 N_inc          = r42
1022 exp_Arg        = r43
1023 exp_r          = r44
1024 sig_r          = r45
1025 lookup         = r46
1026 table_offset   = r47
1027 Create_B       = r48
1028 gr_tmp         = r49
1029
1030 .section .text
1031 .global tanl
1032 .proc tanl
1033 tanl:
1034 #ifdef _LIBC
1035 .global __tanl
1036 .proc __tanl
1037 __tanl:
1038 #endif
1039 { .mfi
1040 alloc r32 = ar.pfs, 0,17,2,0
1041 (p0)   fclass.m.unc  p6,p0 = Arg, 0x1E7
1042       addl gr_tmp = -1,r0
1043 }
1044 { .mfi
1045   nop.m 0
1046 (p0)   fclass.nm.unc  p7,p0 = Arg, 0x1FF
1047   nop.i 0
1048 };;
1049
1050 { .mfi
1051 (p0)  addl           table_ptr1   = @ltoff(TANL_BASE_CONSTANTS), gp
1052         nop.f 999
1053       nop.i 0
1054 }
1055 ;;
1056 { .mmi
1057 (p0)  ld8 table_ptr1 = [table_ptr1]
1058       setf.sig fp_tmp = gr_tmp   // Make a constant so fmpy produces inexact
1059       nop.i 999
1060 }
1061 ;;
1062
1063 //
1064 //     Check for NatVals, Infs , NaNs, and Zeros
1065 //     Check for everything - if false, then must be pseudo-zero
1066 //     or pseudo-nan.
1067 //     Local table pointer
1068 //
1069 { .mbb
1070 (p0)   add table_ptr2 = 96, table_ptr1
1071 (p6)   br.cond.spnt L(TANL_SPECIAL)
1072 (p7)   br.cond.spnt L(TANL_SPECIAL) ;;
1073 }
1074 //
1075 //     Point to Inv_P_0
1076 //     Branch out to deal with unsupporteds and special values.
1077 //
1078 { .mmf
1079 (p0)   ldfs TWO_TO_24 = [table_ptr1],4
1080 (p0)   ldfs TWO_TO_63 = [table_ptr2],4
1081 //
1082 //     Load -2**24, load -2**63.
1083 //
1084 (p0)   fcmp.eq.s0 p0, p6 = Arg, f1 ;;
1085 }
1086 { .mfi
1087 (p0)   ldfs NEGTWO_TO_63 = [table_ptr2],12
1088 (p0)   fnorm.s1     Arg = Arg
1089         nop.i 999
1090 }
1091 //
1092 //     Load 2**24, Load 2**63.
1093 //
1094 { .mmi
1095 (p0)   ldfs NEGTWO_TO_24 = [table_ptr1],12 ;;
1096 //
1097 //     Do fcmp to generate Denormal exception
1098 //     - can't do FNORM (will generate Underflow when U is unmasked!)
1099 //     Normalize input argument.
1100 //
1101 (p0)   ldfe two_by_PI = [table_ptr1],16
1102         nop.i 999
1103 }
1104 { .mmi
1105 (p0)   ldfe Inv_P_0 = [table_ptr2],16 ;;
1106 (p0)   ldfe d_1 = [table_ptr2],16
1107         nop.i 999
1108 }
1109 //
1110 //     Decide about the paths to take:
1111 //     PR_1 and PR_3 set if -2**24 < Arg < 2**24 - CASE 1 OR 2
1112 //     OTHERWISE - CASE 3 OR 4
1113 //     Load inverse of P_0 .
1114 //     Set PR_6 if Arg <= -2**63
1115 //     Are there any Infs, NaNs, or zeros?
1116 //
1117 { .mmi
1118 (p0)   ldfe P_0 = [table_ptr1],16 ;;
1119 (p0)   ldfe d_2 = [table_ptr2],16
1120         nop.i 999
1121 }
1122 //
1123 //     Set PR_8 if Arg <= -2**24
1124 //     Set PR_6 if Arg >=  2**63
1125 //
1126 { .mmi
1127 (p0)   ldfe P_1 = [table_ptr1],16 ;;
1128 (p0)   ldfe PI_BY_4 = [table_ptr2],16
1129         nop.i 999
1130 }
1131 //
1132 //     Set PR_8 if Arg >= 2**24
1133 //
1134 { .mmi
1135 (p0)   ldfe P_2 = [table_ptr1],16 ;;
1136 (p0)   ldfe   MPI_BY_4 = [table_ptr2],16
1137         nop.i 999
1138 }
1139 //
1140 //     Load  P_2 and PI_BY_4
1141 //
1142 { .mfi
1143 (p0)   ldfe   P_3 = [table_ptr1],16
1144         nop.f 999
1145         nop.i 999 ;;
1146 }
1147 { .mfi
1148         nop.m 999
1149 (p0)   fcmp.le.unc.s1 p6,p7 = Arg,NEGTWO_TO_63
1150         nop.i 999
1151 }
1152 { .mfi
1153         nop.m 999
1154 (p0)   fcmp.le.unc.s1 p8,p9 = Arg,NEGTWO_TO_24
1155         nop.i 999 ;;
1156 }
1157 { .mfi
1158         nop.m 999
1159 (p7)   fcmp.ge.s1 p6,p0 = Arg,TWO_TO_63
1160         nop.i 999
1161 }
1162 { .mfi
1163         nop.m 999
1164 (p9)   fcmp.ge.s1 p8,p0 = Arg,TWO_TO_24
1165         nop.i 999 ;;
1166 }
1167 { .mib
1168         nop.m 999
1169         nop.i 999
1170 //
1171 //     Load  P_3 and -PI_BY_4
1172 //
1173 (p6)   br.cond.spnt L(TANL_ARG_TOO_LARGE) ;;
1174 }
1175 { .mib
1176         nop.m 999
1177         nop.i 999
1178 //
1179 //     Load 2**(-2).
1180 //     Load -2**(-2).
1181 //     Branch out if we have a special argument.
1182 //     Branch out if the magnitude of the input argument is too large
1183 //     - do this branch before the next.
1184 //
1185 (p8)   br.cond.spnt L(TANL_LARGER_ARG) ;;
1186 }
1187 //
1188 //     Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
1189 //
1190 { .mfi
1191 (p0)   ldfs TWO_TO_NEG2 = [table_ptr2],4
1192 //     ARGUMENT REDUCTION CODE - CASE 1 and 2
1193 //     Load 2**(-2).
1194 //     Load -2**(-2).
1195 (p0)   fmpy.s1 N = Arg,two_by_PI
1196         nop.i 999 ;;
1197 }
1198 { .mfi
1199 (p0)   ldfs NEGTWO_TO_NEG2 = [table_ptr2],12
1200 //
1201 //     N = Arg * 2/pi
1202 //
1203 (p0)   fcmp.lt.unc.s1 p8,p9= Arg,PI_BY_4
1204         nop.i 999 ;;
1205 }
1206 { .mfi
1207         nop.m 999
1208 //
1209 //     if Arg < pi/4,  set PR_8.
1210 //
1211 (p8)   fcmp.gt.s1 p8,p9= Arg,MPI_BY_4
1212         nop.i 999 ;;
1213 }
1214 //
1215 //     Case 1: Is |r| < 2**(-2).
1216 //     Arg is the same as r in this case.
1217 //     r = Arg
1218 //     c = 0
1219 //
1220 { .mfi
1221 (p8)   mov N_fix_gr = r0
1222 //
1223 //     if Arg > -pi/4, reset PR_8.
1224 //     Select the case when |Arg| < pi/4 - set PR[8] = true.
1225 //     Else Select the case when |Arg| >= pi/4 - set PR[9] = true.
1226 //
1227 (p0)   fcvt.fx.s1 N_fix = N
1228         nop.i 999 ;;
1229 }
1230 { .mfi
1231         nop.m 999
1232 //
1233 //     Grab the integer part of N .
1234 //
1235 (p8)   mov r = Arg
1236         nop.i 999
1237 }
1238 { .mfi
1239         nop.m 999
1240 (p8)   mov c = f0
1241         nop.i 999 ;;
1242 }
1243 { .mfi
1244         nop.m 999
1245 (p8)   fcmp.lt.unc.s1 p10, p11 = Arg, TWO_TO_NEG2
1246         nop.i 999 ;;
1247 }
1248 { .mfi
1249         nop.m 999
1250 (p10)  fcmp.gt.s1 p10,p0 = Arg, NEGTWO_TO_NEG2
1251         nop.i 999 ;;
1252 }
1253 { .mfi
1254         nop.m 999
1255 //
1256 //     Case 2: Place integer part of N in GP register.
1257 //
1258 (p9)   fcvt.xf N = N_fix
1259         nop.i 999 ;;
1260 }
1261 { .mib
1262 (p9)   getf.sig N_fix_gr = N_fix
1263         nop.i 999
1264 //
1265 //     Case 2: Convert integer N_fix back to normalized floating-point value.
1266 //
1267 (p10)  br.cond.spnt L(TANL_SMALL_R) ;;
1268 }
1269 { .mib
1270         nop.m 999
1271         nop.i 999
1272 (p8)   br.cond.sptk L(TANL_NORMAL_R) ;;
1273 }
1274 //
1275 //     Case 1: PR_3 is only affected  when PR_1 is set.
1276 //
1277 { .mmi
1278 (p9)   ldfs TWO_TO_NEG33 = [table_ptr2], 4 ;;
1279 //
1280 //     Case 2: Load 2**(-33).
1281 //
1282 (p9)   ldfs NEGTWO_TO_NEG33 = [table_ptr2], 4
1283         nop.i 999 ;;
1284 }
1285 { .mfi
1286         nop.m 999
1287 //
1288 //     Case 2: Load -2**(-33).
1289 //
1290 (p9)   fnma.s1 s_val = N, P_1, Arg
1291         nop.i 999
1292 }
1293 { .mfi
1294         nop.m 999
1295 (p9)   fmpy.s1 w = N, P_2
1296         nop.i 999 ;;
1297 }
1298 { .mfi
1299         nop.m 999
1300 //
1301 //     Case 2: w = N * P_2
1302 //     Case 2: s_val = -N * P_1  + Arg
1303 //
1304 (p0)   fcmp.lt.unc.s1 p9,p8 = s_val, TWO_TO_NEG33
1305         nop.i 999 ;;
1306 }
1307 { .mfi
1308         nop.m 999
1309 //
1310 //     Decide between case_1 and case_2 reduce:
1311 //
1312 (p9)   fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
1313         nop.i 999 ;;
1314 }
1315 { .mfi
1316         nop.m 999
1317 //
1318 //     Case 1_reduce:  s <= -2**(-33) or s >= 2**(-33)
1319 //     Case 2_reduce: -2**(-33) < s < 2**(-33)
1320 //
1321 (p8)   fsub.s1 r = s_val, w
1322         nop.i 999
1323 }
1324 { .mfi
1325         nop.m 999
1326 (p9)   fmpy.s1 w = N, P_3
1327         nop.i 999 ;;
1328 }
1329 { .mfi
1330         nop.m 999
1331 (p9)   fma.s1  U_1 = N, P_2, w
1332         nop.i 999
1333 }
1334 { .mfi
1335         nop.m 999
1336 //
1337 //     Case 1_reduce: Is |r| < 2**(-2), if so set PR_10
1338 //     else set PR_11.
1339 //
1340 (p8)   fsub.s1 c = s_val, r
1341         nop.i 999 ;;
1342 }
1343 { .mfi
1344         nop.m 999
1345 //
1346 //     Case 1_reduce: r = s + w (change sign)
1347 //     Case 2_reduce: w = N * P_3 (change sign)
1348 //
1349 (p8)   fcmp.lt.unc.s1 p10, p11 = r, TWO_TO_NEG2
1350         nop.i 999 ;;
1351 }
1352 { .mfi
1353         nop.m 999
1354 (p10)  fcmp.gt.s1 p10, p11 = r, NEGTWO_TO_NEG2
1355         nop.i 999 ;;
1356 }
1357 { .mfi
1358         nop.m 999
1359 (p9)   fsub.s1 r = s_val, U_1
1360         nop.i 999
1361 }
1362 { .mfi
1363         nop.m 999
1364 //
1365 //     Case 1_reduce: c is complete here.
1366 //     c = c + w (w has not been negated.)
1367 //     Case 2_reduce: r is complete here - continue to calculate c .
1368 //     r = s - U_1
1369 //
1370 (p9)   fms.s1 U_2 = N, P_2, U_1
1371         nop.i 999 ;;
1372 }
1373 { .mfi
1374         nop.m 999
1375 //
1376 //     Case 1_reduce: c = s - r
1377 //     Case 2_reduce: U_1 = N * P_2 + w
1378 //
1379 (p8)   fsub.s1 c = c, w
1380         nop.i 999 ;;
1381 }
1382 { .mfi
1383         nop.m 999
1384 (p9)   fsub.s1 s_val = s_val, r
1385         nop.i 999
1386 }
1387 { .mfb
1388         nop.m 999
1389 //
1390 //     Case 2_reduce:
1391 //     U_2 = N * P_2 - U_1
1392 //     Not needed until later.
1393 //
1394 (p9)   fadd.s1 U_2 = U_2, w
1395 //
1396 //     Case 2_reduce:
1397 //     s = s - r
1398 //     U_2 = U_2 + w
1399 //
1400 (p10)  br.cond.spnt L(TANL_SMALL_R) ;;
1401 }
1402 { .mib
1403         nop.m 999
1404         nop.i 999
1405 (p11)  br.cond.sptk L(TANL_NORMAL_R) ;;
1406 }
1407 { .mii
1408         nop.m 999
1409 //
1410 //     Case 2_reduce:
1411 //     c = c - U_2
1412 //     c is complete here
1413 //     Argument reduction ends here.
1414 //
1415 (p9)   extr.u i_1 = N_fix_gr, 0, 1 ;;
1416 (p9)   cmp.eq.unc p11, p12 = 0x0000,i_1 ;;
1417 }
1418 { .mfi
1419         nop.m 999
1420 //
1421 //     Is i_1  even or odd?
1422 //     if i_1 == 0, set p11, else set p12.
1423 //
1424 (p11)  fmpy.s1 rsq = r, r
1425         nop.i 999 ;;
1426 }
1427 { .mfi
1428         nop.m 999
1429 (p12)  frcpa.s1 S_hi,p0 = f1, r
1430         nop.i 999
1431 }
1432
1433
1434
1435 //
1436 //     Case 1: Branch to SMALL_R or NORMAL_R.
1437 //     Case 1 is done now.
1438 //
1439
1440 { .mfi
1441 (p9)   addl           table_ptr1   = @ltoff(TANL_BASE_CONSTANTS), gp
1442 (p9)   fsub.s1 c = s_val, U_1
1443        nop.i 999 ;;
1444 }
1445 ;;
1446
1447 { .mmi
1448 (p9)  ld8 table_ptr1 = [table_ptr1]
1449       nop.m 999
1450       nop.i 999
1451 }
1452 ;;
1453
1454
1455 { .mmi
1456 (p9)   add table_ptr1 = 224, table_ptr1 ;;
1457 (p9)   ldfe P1_1 = [table_ptr1],144
1458         nop.i 999 ;;
1459 }
1460 //
1461 //     Get [i_1] -  lsb of N_fix_gr .
1462 //     Load P1_1 and point to Q1_1 .
1463 //
1464 { .mfi
1465 (p9)   ldfe Q1_1 = [table_ptr1] , 0
1466 //
1467 //     N even: rsq = r * Z
1468 //     N odd:  S_hi = frcpa(r)
1469 //
1470 (p12)  fmerge.ns S_hi = S_hi, S_hi
1471         nop.i 999
1472 }
1473 { .mfi
1474         nop.m 999
1475 //
1476 //     Case 2_reduce:
1477 //     c = s - U_1
1478 //
1479 (p9)   fsub.s1 c = c, U_2
1480         nop.i 999 ;;
1481 }
1482 { .mfi
1483         nop.m 999
1484 (p12)  fma.s1  poly1 = S_hi, r, f1
1485         nop.i 999 ;;
1486 }
1487 { .mfi
1488         nop.m 999
1489 //
1490 //     N odd:  Change sign of S_hi
1491 //
1492 (p11)  fmpy.s1 rsq = rsq, P1_1
1493         nop.i 999 ;;
1494 }
1495 { .mfi
1496         nop.m 999
1497 (p12)  fma.s1 S_hi = S_hi, poly1, S_hi
1498         nop.i 999 ;;
1499 }
1500 { .mfi
1501         nop.m 999
1502 //
1503 //     N even: rsq = rsq * P1_1
1504 //     N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
1505 //
1506 (p11)  fma.s1 Result = r, rsq, c
1507         nop.i 999 ;;
1508 }
1509 { .mfi
1510         nop.m 999
1511 //
1512 //     N even: Result = c  + r * rsq
1513 //     N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
1514 //
1515 (p12)  fma.s1 poly1 = S_hi, r, f1
1516         nop.i 999 ;;
1517 }
1518 { .mfi
1519         nop.m 999
1520 //
1521 //     N even: Result = Result + r
1522 //     N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
1523 //
1524 (p11)  fadd.s0 Result = r, Result
1525         nop.i 999 ;;
1526 }
1527 { .mfi
1528         nop.m 999
1529 (p12)  fma.s1  S_hi = S_hi, poly1, S_hi
1530         nop.i 999 ;;
1531 }
1532 { .mfi
1533         nop.m 999
1534 //
1535 //     N even: Result1 = Result + r
1536 //     N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
1537 //
1538 (p12)  fma.s1 poly1 = S_hi, r, f1
1539         nop.i 999 ;;
1540 }
1541 { .mfi
1542         nop.m 999
1543 //
1544 //     N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
1545 //
1546 (p12)  fma.s1 S_hi = S_hi, poly1, S_hi
1547         nop.i 999 ;;
1548 }
1549 { .mfi
1550         nop.m 999
1551 //
1552 //     N odd:  poly1  =  S_hi * poly + 1.0    64 bits
1553 //
1554 (p12)  fma.s1 poly1 = S_hi, r, f1
1555         nop.i 999 ;;
1556 }
1557 { .mfi
1558         nop.m 999
1559 //
1560 //     N odd:  poly1  =  S_hi * r + 1.0
1561 //
1562 (p12)  fma.s1 poly1 = S_hi, c, poly1
1563         nop.i 999 ;;
1564 }
1565 { .mfi
1566         nop.m 999
1567 //
1568 //     N odd:  poly1  =  S_hi * c + poly1
1569 //
1570 (p12)  fmpy.s1 S_lo = S_hi, poly1
1571         nop.i 999 ;;
1572 }
1573 { .mfi
1574         nop.m 999
1575 //
1576 //     N odd:  S_lo  =  S_hi *  poly1
1577 //
1578 (p12)  fma.s1 S_lo = Q1_1, r, S_lo
1579         nop.i 999
1580 }
1581 { .mfi
1582         nop.m 999
1583 //
1584 //     N odd:  Result =  S_hi + S_lo
1585 //
1586 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
1587         nop.i 999 ;;
1588 }
1589 { .mfb
1590         nop.m 999
1591 //
1592 //     N odd:  S_lo  =  S_lo + Q1_1 * r
1593 //
1594 (p12)  fadd.s0 Result = S_hi, S_lo
1595 (p0)   br.ret.sptk b0 ;;
1596 }
1597
1598
1599 L(TANL_LARGER_ARG):
1600
1601 //
1602 // ARGUMENT REDUCTION CODE - CASE 3 and 4
1603 //
1604
1605 { .mfi
1606 (p0)  addl           table_ptr1   = @ltoff(TANL_BASE_CONSTANTS), gp
1607 (p0)  fmpy.s1 N_0 = Arg, Inv_P_0
1608         nop.i 999
1609 }
1610 ;;
1611
1612 { .mmi
1613 (p0)  ld8 table_ptr1 = [table_ptr1]
1614       nop.m 999
1615       nop.i 999
1616 }
1617 ;;
1618
1619
1620 //
1621 //    Adjust table_ptr1 to beginning of table.
1622 //    N_0 = Arg * Inv_P_0
1623 //
1624 { .mmi
1625 (p0)  add table_ptr1 = 8, table_ptr1 ;;
1626 //
1627 //    Point to  2*-14
1628 //
1629 (p0)  ldfs TWO_TO_NEG14 = [table_ptr1], 4
1630         nop.i 999 ;;
1631 }
1632 //
1633 //    Load 2**(-14).
1634 //
1635 { .mmi
1636 (p0)  ldfs NEGTWO_TO_NEG14 = [table_ptr1], 180 ;;
1637 //
1638 //    N_0_fix  = integer part of N_0 .
1639 //    Adjust table_ptr1 to beginning of table.
1640 //
1641 (p0)  ldfs TWO_TO_NEG2 = [table_ptr1], 4
1642         nop.i 999 ;;
1643 }
1644 //
1645 //    Make N_0 the integer part.
1646 //
1647 { .mfi
1648 (p0)  ldfs NEGTWO_TO_NEG2 = [table_ptr1]
1649 //
1650 //    Load -2**(-14).
1651 //
1652 (p0)  fcvt.fx.s1 N_0_fix = N_0
1653         nop.i 999 ;;
1654 }
1655 { .mfi
1656         nop.m 999
1657 (p0)  fcvt.xf N_0 = N_0_fix
1658         nop.i 999 ;;
1659 }
1660 { .mfi
1661         nop.m 999
1662 (p0)  fnma.s1 ArgPrime = N_0, P_0, Arg
1663         nop.i 999
1664 }
1665 { .mfi
1666         nop.m 999
1667 (p0)  fmpy.s1 w = N_0, d_1
1668         nop.i 999 ;;
1669 }
1670 { .mfi
1671         nop.m 999
1672 //
1673 //    ArgPrime = -N_0 * P_0 + Arg
1674 //    w  = N_0 * d_1
1675 //
1676 (p0)  fmpy.s1 N = ArgPrime, two_by_PI
1677         nop.i 999 ;;
1678 }
1679 { .mfi
1680         nop.m 999
1681 //
1682 //    N = ArgPrime * 2/pi
1683 //
1684 (p0)  fcvt.fx.s1 N_fix = N
1685         nop.i 999 ;;
1686 }
1687 { .mfi
1688         nop.m 999
1689 //
1690 //    N_fix is the integer part.
1691 //
1692 (p0)  fcvt.xf N = N_fix
1693         nop.i 999 ;;
1694 }
1695 { .mfi
1696 (p0)  getf.sig N_fix_gr = N_fix
1697         nop.f 999
1698         nop.i 999 ;;
1699 }
1700 { .mfi
1701         nop.m 999
1702 //
1703 //    N is the integer part of the reduced-reduced argument.
1704 //    Put the integer in a GP register.
1705 //
1706 (p0)  fnma.s1 s_val = N, P_1, ArgPrime
1707         nop.i 999
1708 }
1709 { .mfi
1710         nop.m 999
1711 (p0)  fnma.s1 w = N, P_2, w
1712         nop.i 999 ;;
1713 }
1714 { .mfi
1715         nop.m 999
1716 //
1717 //    s_val = -N*P_1 + ArgPrime
1718 //    w = -N*P_2 + w
1719 //
1720 (p0)  fcmp.lt.unc.s1 p11, p10 = s_val, TWO_TO_NEG14
1721         nop.i 999 ;;
1722 }
1723 { .mfi
1724         nop.m 999
1725 (p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
1726         nop.i 999 ;;
1727 }
1728 { .mfi
1729         nop.m 999
1730 //
1731 //    Case 3: r = s_val + w (Z complete)
1732 //    Case 4: U_hi = N_0 * d_1
1733 //
1734 (p10) fmpy.s1 V_hi = N, P_2
1735         nop.i 999
1736 }
1737 { .mfi
1738         nop.m 999
1739 (p11) fmpy.s1 U_hi = N_0, d_1
1740         nop.i 999 ;;
1741 }
1742 { .mfi
1743         nop.m 999
1744 //
1745 //    Case 3: r = s_val + w (Z complete)
1746 //    Case 4: U_hi = N_0 * d_1
1747 //
1748 (p11) fmpy.s1 V_hi = N, P_2
1749         nop.i 999
1750 }
1751 { .mfi
1752         nop.m 999
1753 (p11) fmpy.s1 U_hi = N_0, d_1
1754         nop.i 999 ;;
1755 }
1756 { .mfi
1757         nop.m 999
1758 //
1759 //    Decide between case 3 and 4:
1760 //    Case 3:  s <= -2**(-14) or s >= 2**(-14)
1761 //    Case 4: -2**(-14) < s < 2**(-14)
1762 //
1763 (p10) fadd.s1 r = s_val, w
1764         nop.i 999
1765 }
1766 { .mfi
1767         nop.m 999
1768 (p11) fmpy.s1 w = N, P_3
1769         nop.i 999 ;;
1770 }
1771 { .mfi
1772         nop.m 999
1773 //
1774 //    Case 4: We need abs of both U_hi and V_hi - dont
1775 //    worry about switched sign of V_hi .
1776 //
1777 (p11) fsub.s1 A = U_hi, V_hi
1778         nop.i 999
1779 }
1780 { .mfi
1781         nop.m 999
1782 //
1783 //    Case 4: A =  U_hi + V_hi
1784 //    Note: Worry about switched sign of V_hi, so subtract instead of add.
1785 //
1786 (p11) fnma.s1 V_lo = N, P_2, V_hi
1787         nop.i 999 ;;
1788 }
1789 { .mfi
1790         nop.m 999
1791 (p11) fms.s1 U_lo = N_0, d_1, U_hi
1792         nop.i 999 ;;
1793 }
1794 { .mfi
1795         nop.m 999
1796 (p11) fabs V_hiabs = V_hi
1797         nop.i 999
1798 }
1799 { .mfi
1800         nop.m 999
1801 //
1802 //    Case 4: V_hi = N * P_2
1803 //            w = N * P_3
1804 //    Note the product does not include the (-) as in the writeup
1805 //    so (-) missing for V_hi and w .
1806 (p10) fadd.s1 r = s_val, w
1807         nop.i 999 ;;
1808 }
1809 { .mfi
1810         nop.m 999
1811 //
1812 //    Case 3: c = s_val - r
1813 //    Case 4: U_lo = N_0 * d_1 - U_hi
1814 //
1815 (p11) fabs U_hiabs = U_hi
1816         nop.i 999
1817 }
1818 { .mfi
1819         nop.m 999
1820 (p11) fmpy.s1 w = N, P_3
1821         nop.i 999 ;;
1822 }
1823 { .mfi
1824         nop.m 999
1825 //
1826 //    Case 4: Set P_12 if U_hiabs >= V_hiabs
1827 //
1828 (p11) fadd.s1 C_hi = s_val, A
1829         nop.i 999 ;;
1830 }
1831 { .mfi
1832         nop.m 999
1833 //
1834 //    Case 4: C_hi = s_val + A
1835 //
1836 (p11) fadd.s1 t = U_lo, V_lo
1837         nop.i 999 ;;
1838 }
1839 { .mfi
1840         nop.m 999
1841 //
1842 //    Case 3: Is |r| < 2**(-2), if so set PR_7
1843 //    else set PR_8.
1844 //    Case 3: If PR_7 is set, prepare to branch to Small_R.
1845 //    Case 3: If PR_8 is set, prepare to branch to Normal_R.
1846 //
1847 (p10) fsub.s1 c = s_val, r
1848         nop.i 999 ;;
1849 }
1850 { .mfi
1851         nop.m 999
1852 //
1853 //    Case 3: c = (s - r) + w (c complete)
1854 //
1855 (p11) fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs
1856         nop.i 999
1857 }
1858 { .mfi
1859         nop.m 999
1860 (p11) fms.s1 w = N_0, d_2, w
1861         nop.i 999 ;;
1862 }
1863 { .mfi
1864         nop.m 999
1865 //
1866 //    Case 4: V_hi = N * P_2
1867 //            w = N * P_3
1868 //    Note the product does not include the (-) as in the writeup
1869 //    so (-) missing for V_hi and w .
1870 //
1871 (p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2
1872         nop.i 999 ;;
1873 }
1874 { .mfi
1875         nop.m 999
1876 (p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2
1877         nop.i 999 ;;
1878 }
1879 { .mfb
1880         nop.m 999
1881 //
1882 //    Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
1883 //    Note: the (-) is still missing for V_hi .
1884 //    Case 4: w = w + N_0 * d_2
1885 //    Note: the (-) is now incorporated in w .
1886 //
1887 (p10) fadd.s1 c = c, w
1888 //
1889 //    Case 4: t = U_lo + V_lo
1890 //    Note: remember V_lo should be (-), subtract instead of add. NO
1891 //
1892 (p14) br.cond.spnt L(TANL_SMALL_R) ;;
1893 }
1894 { .mib
1895         nop.m 999
1896         nop.i 999
1897 (p15) br.cond.spnt L(TANL_NORMAL_R) ;;
1898 }
1899 { .mfi
1900         nop.m 999
1901 //
1902 //    Case 3: Vector off when |r| < 2**(-2).  Recall that PR_3 will be true.
1903 //    The remaining stuff is for Case 4.
1904 //
1905 (p12) fsub.s1 a = U_hi, A
1906 (p11) extr.u i_1 = N_fix_gr, 0, 1 ;;
1907 }
1908 { .mfi
1909         nop.m 999
1910 //
1911 //    Case 4: C_lo = s_val - C_hi
1912 //
1913 (p11) fadd.s1 t = t, w
1914         nop.i 999
1915 }
1916 { .mfi
1917         nop.m 999
1918 (p13) fadd.s1 a = V_hi, A
1919         nop.i 999 ;;
1920 }
1921
1922
1923
1924 //
1925 //    Case 4: a = U_hi - A
1926 //            a = V_hi - A (do an add to account for missing (-) on V_hi
1927 //
1928
1929 { .mfi
1930 (p11)  addl           table_ptr1   = @ltoff(TANL_BASE_CONSTANTS), gp
1931 (p11) fsub.s1 C_lo = s_val, C_hi
1932         nop.i 999
1933 }
1934 ;;
1935
1936
1937
1938 //
1939 //    Case 4: a = (U_hi - A)  + V_hi
1940 //            a = (V_hi - A)  + U_hi
1941 //    In each case account for negative missing form V_hi .
1942 //
1943
1944
1945 { .mmi
1946 (p11)  ld8 table_ptr1 = [table_ptr1]
1947       nop.m 999
1948       nop.i 999
1949 }
1950 ;;
1951
1952
1953 //
1954 //    Case 4: C_lo = (s_val - C_hi) + A
1955 //
1956 { .mmi
1957 (p11) add table_ptr1 = 224, table_ptr1 ;;
1958 (p11) ldfe P1_1 = [table_ptr1], 16
1959         nop.i 999 ;;
1960 }
1961 { .mfi
1962 (p11) ldfe P1_2 = [table_ptr1], 128
1963 //
1964 //    Case 4: w = U_lo + V_lo  + w
1965 //
1966 (p12) fsub.s1 a = a, V_hi
1967         nop.i 999 ;;
1968 }
1969 //
1970 //    Case 4: r = C_hi + C_lo
1971 //
1972 { .mfi
1973 (p11) ldfe Q1_1 = [table_ptr1], 16
1974 (p11) fadd.s1 C_lo = C_lo, A
1975         nop.i 999 ;;
1976 }
1977 //
1978 //    Case 4: c = C_hi - r
1979 //    Get [i_1] - lsb of N_fix_gr.
1980 //
1981 { .mfi
1982 (p11) ldfe Q1_2 = [table_ptr1], 16
1983         nop.f 999
1984         nop.i 999 ;;
1985 }
1986 { .mfi
1987         nop.m 999
1988 (p13) fsub.s1 a = U_hi, a
1989         nop.i 999 ;;
1990 }
1991 { .mfi
1992         nop.m 999
1993 (p11) fadd.s1 t = t, a
1994         nop.i 999 ;;
1995 }
1996 { .mfi
1997         nop.m 999
1998 //
1999 //    Case 4: t = t + a
2000 //
2001 (p11) fadd.s1 C_lo = C_lo, t
2002         nop.i 999 ;;
2003 }
2004 { .mfi
2005         nop.m 999
2006 //
2007 //    Case 4: C_lo = C_lo + t
2008 //
2009 (p11) fadd.s1 r = C_hi, C_lo
2010         nop.i 999 ;;
2011 }
2012 { .mfi
2013         nop.m 999
2014 (p11) fsub.s1 c = C_hi, r
2015         nop.i 999
2016 }
2017 { .mfi
2018         nop.m 999
2019 //
2020 //    Case 4: c = c + C_lo  finished.
2021 //    Is i_1  even or odd?
2022 //    if i_1 == 0, set PR_4, else set PR_5.
2023 //
2024 // r and c have been computed.
2025 // We known whether this is the sine or cosine routine.
2026 // Make sure ftz mode is set - should be automatic when using wre
2027 (p0)  fmpy.s1 rsq = r, r
2028         nop.i 999 ;;
2029 }
2030 { .mfi
2031         nop.m 999
2032 (p11) fadd.s1 c = c , C_lo
2033 (p11) cmp.eq.unc p11, p12 =  0x0000, i_1 ;;
2034 }
2035 { .mfi
2036         nop.m 999
2037 (p12) frcpa.s1 S_hi, p0 = f1, r
2038         nop.i 999
2039 }
2040 { .mfi
2041         nop.m 999
2042 //
2043 //    N odd: Change sign of S_hi
2044 //
2045 (p11) fma.s1 Result = rsq, P1_2, P1_1
2046         nop.i 999 ;;
2047 }
2048 { .mfi
2049         nop.m 999
2050 (p12) fma.s1 P = rsq, Q1_2, Q1_1
2051         nop.i 999
2052 }
2053 { .mfi
2054         nop.m 999
2055 //
2056 //    N odd:  Result  =  S_hi + S_lo      (User supplied rounding mode for C1)
2057 //
2058 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2059         nop.i 999 ;;
2060 }
2061 { .mfi
2062         nop.m 999
2063 //
2064 //    N even: rsq = r * r
2065 //    N odd:  S_hi = frcpa(r)
2066 //
2067 (p12) fmerge.ns S_hi = S_hi, S_hi
2068         nop.i 999
2069 }
2070 { .mfi
2071         nop.m 999
2072 //
2073 //    N even: rsq = rsq * P1_2 + P1_1
2074 //    N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
2075 //
2076 (p11) fmpy.s1 Result = rsq, Result
2077         nop.i 999 ;;
2078 }
2079 { .mfi
2080         nop.m 999
2081 (p12) fma.s1 poly1 = S_hi, r,f1
2082         nop.i 999
2083 }
2084 { .mfi
2085         nop.m 999
2086 //
2087 //    N even: Result =  Result * rsq
2088 //    N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
2089 //
2090 (p11) fma.s1 Result = r, Result, c
2091         nop.i 999 ;;
2092 }
2093 { .mfi
2094         nop.m 999
2095 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2096         nop.i 999
2097 }
2098 { .mfi
2099         nop.m 999
2100 //
2101 //    N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
2102 //
2103 (p11) fadd.s0 Result= r, Result
2104         nop.i 999 ;;
2105 }
2106 { .mfi
2107         nop.m 999
2108 (p12) fma.s1 poly1 =  S_hi, r, f1
2109         nop.i 999 ;;
2110 }
2111 { .mfi
2112         nop.m 999
2113 //
2114 //    N even: Result = Result * r + c
2115 //    N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
2116 //
2117 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2118         nop.i 999 ;;
2119 }
2120 { .mfi
2121         nop.m 999
2122 (p12) fma.s1 poly1 = S_hi, r, f1
2123         nop.i 999 ;;
2124 }
2125 { .mfi
2126         nop.m 999
2127 //
2128 //    N even: Result1 = Result + r  (Rounding mode S0)
2129 //    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
2130 //
2131 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2132         nop.i 999 ;;
2133 }
2134 { .mfi
2135         nop.m 999
2136 //
2137 //    N odd:  poly1  =  S_hi * poly + S_hi    64 bits
2138 //
2139 (p12) fma.s1 poly1 = S_hi, r, f1
2140         nop.i 999 ;;
2141 }
2142 { .mfi
2143         nop.m 999
2144 //
2145 //    N odd:  poly1  =  S_hi * r + 1.0
2146 //
2147 (p12) fma.s1 poly1 = S_hi, c, poly1
2148         nop.i 999 ;;
2149 }
2150 { .mfi
2151         nop.m 999
2152 //
2153 //    N odd:  poly1  =  S_hi * c + poly1
2154 //
2155 (p12) fmpy.s1 S_lo = S_hi, poly1
2156         nop.i 999 ;;
2157 }
2158 { .mfi
2159         nop.m 999
2160 //
2161 //    N odd:  S_lo  =  S_hi *  poly1
2162 //
2163 (p12) fma.s1 S_lo = P, r, S_lo
2164         nop.i 999 ;;
2165 }
2166 { .mfb
2167         nop.m 999
2168 //
2169 //    N odd:  S_lo  =  S_lo + r * P
2170 //
2171 (p12) fadd.s0 Result = S_hi, S_lo
2172 (p0)   br.ret.sptk b0 ;;
2173 }
2174
2175
2176 L(TANL_SMALL_R):
2177 { .mii
2178         nop.m 999
2179 (p0)  extr.u i_1 = N_fix_gr, 0, 1 ;;
2180 (p0)  cmp.eq.unc p11, p12 = 0x0000, i_1
2181 }
2182 { .mfi
2183         nop.m 999
2184 (p0)  fmpy.s1 rsq = r, r
2185         nop.i 999 ;;
2186 }
2187 { .mfi
2188 (p0)  addl           table_ptr1   = @ltoff(TANL_BASE_CONSTANTS), gp
2189 (p12) frcpa.s1 S_hi, p0 = f1, r
2190         nop.i 999
2191 }
2192 ;;
2193
2194
2195 { .mmi
2196 (p0)  ld8 table_ptr1 = [table_ptr1]
2197       nop.m 999
2198       nop.i 999
2199 }
2200 ;;
2201
2202 // *****************************************************************
2203 // *****************************************************************
2204 // *****************************************************************
2205
2206
2207 { .mmi
2208 (p0)  add table_ptr1 = 224, table_ptr1 ;;
2209 (p0)  ldfe P1_1 = [table_ptr1], 16
2210         nop.i 999 ;;
2211 }
2212 //    r and c have been computed.
2213 //    We known whether this is the sine or cosine routine.
2214 //    Make sure ftz mode is set - should be automatic when using wre
2215 //    |r| < 2**(-2)
2216 { .mfi
2217 (p0)  ldfe P1_2 = [table_ptr1], 16
2218 (p11) fmpy.s1 r_to_the_8 = rsq, rsq
2219         nop.i 999 ;;
2220 }
2221 //
2222 //    Set table_ptr1 to beginning of constant table.
2223 //    Get [i_1] - lsb of N_fix_gr.
2224 //
2225 { .mfi
2226 (p0)  ldfe P1_3 = [table_ptr1], 96
2227 //
2228 //    N even: rsq = r * r
2229 //    N odd:  S_hi = frcpa(r)
2230 //
2231 (p12) fmerge.ns S_hi = S_hi, S_hi
2232         nop.i 999 ;;
2233 }
2234 //
2235 //    Is i_1  even or odd?
2236 //    if i_1 == 0, set PR_11.
2237 //    if i_1 != 0, set PR_12.
2238 //
2239 { .mfi
2240 (p11) ldfe P1_9 = [table_ptr1], -16
2241 //
2242 //    N even: Poly2 = P1_7 + Poly2 * rsq
2243 //    N odd:  poly2 = Q1_5 + poly2 * rsq
2244 //
2245 (p11) fadd.s1 CORR = rsq, f1
2246         nop.i 999 ;;
2247 }
2248 { .mmi
2249 (p11) ldfe P1_8 = [table_ptr1], -16 ;;
2250 //
2251 //    N even: Poly1 = P1_2 + P1_3 * rsq
2252 //    N odd:  poly1 =  1.0 +  S_hi * r
2253 //    16 bits partial  account for necessary (-1)
2254 //
2255 (p11) ldfe P1_7 = [table_ptr1], -16
2256         nop.i 999 ;;
2257 }
2258 //
2259 //    N even: Poly1 = P1_1 + Poly1 * rsq
2260 //    N odd:  S_hi  =  S_hi + S_hi * poly1)     16 bits account for necessary
2261 //
2262 { .mfi
2263 (p11) ldfe P1_6 = [table_ptr1], -16
2264 //
2265 //    N even: Poly2 = P1_5 + Poly2 * rsq
2266 //    N odd:  poly2 = Q1_3 + poly2 * rsq
2267 //
2268 (p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8
2269         nop.i 999 ;;
2270 }
2271 //
2272 //    N even: Poly1 =  Poly1 * rsq
2273 //    N odd:  poly1  = 1.0 + S_hi * r         32 bits partial
2274 //
2275 { .mfi
2276 (p11) ldfe P1_5 = [table_ptr1], -16
2277 (p12) fma.s1 poly1 =  S_hi, r, f1
2278         nop.i 999 ;;
2279 }
2280
2281 //
2282 //    N even: CORR =  CORR * c
2283 //    N odd:  S_hi  =  S_hi * poly1 + S_hi    32 bits
2284 //
2285
2286 //
2287 //    N even: Poly2 = P1_6 + Poly2 * rsq
2288 //    N odd:  poly2 = Q1_4 + poly2 * rsq
2289 //
2290
2291 { .mmf
2292 (p11) ldfe P1_4 = [table_ptr1], -16
2293 (p0)  addl           table_ptr2   = @ltoff(TANL_BASE_CONSTANTS), gp
2294 (p11) fmpy.s1 CORR =  CORR, c
2295 }
2296 ;;
2297
2298
2299 { .mmi
2300 (p0)  ld8 table_ptr2 = [table_ptr2]
2301       nop.m 999
2302       nop.i 999
2303 }
2304 ;;
2305
2306
2307 { .mii
2308 (p0)  add table_ptr2 = 464, table_ptr2
2309         nop.i 999 ;;
2310         nop.i 999
2311 }
2312 { .mfi
2313         nop.m 999
2314 (p11) fma.s1 Poly1 = P1_3, rsq, P1_2
2315         nop.i 999 ;;
2316 }
2317 { .mfi
2318 (p0)  ldfe Q1_7 = [table_ptr2], -16
2319 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2320         nop.i 999 ;;
2321 }
2322 { .mfi
2323 (p0)  ldfe Q1_6 = [table_ptr2], -16
2324 (p11) fma.s1 Poly2 = P1_9, rsq, P1_8
2325         nop.i 999 ;;
2326 }
2327 { .mmi
2328 (p0)  ldfe Q1_5 = [table_ptr2], -16 ;;
2329 (p12) ldfe Q1_4 = [table_ptr2], -16
2330         nop.i 999 ;;
2331 }
2332 { .mfi
2333 (p12) ldfe Q1_3 = [table_ptr2], -16
2334 //
2335 //    N even: Poly2 = P1_8 + P1_9 * rsq
2336 //    N odd:  poly2 = Q1_6 + Q1_7 * rsq
2337 //
2338 (p11) fma.s1 Poly1 = Poly1, rsq, P1_1
2339         nop.i 999 ;;
2340 }
2341 { .mfi
2342 (p12) ldfe Q1_2 = [table_ptr2], -16
2343 (p12) fma.s1 poly1 = S_hi, r, f1
2344         nop.i 999 ;;
2345 }
2346 { .mfi
2347 (p12) ldfe Q1_1 = [table_ptr2], -16
2348 (p11) fma.s1 Poly2 = Poly2, rsq, P1_7
2349         nop.i 999 ;;
2350 }
2351 { .mfi
2352         nop.m 999
2353 //
2354 //    N even: CORR =  rsq + 1
2355 //    N even: r_to_the_8 =  rsq * rsq
2356 //
2357 (p11) fmpy.s1 Poly1 = Poly1, rsq
2358         nop.i 999 ;;
2359 }
2360 { .mfi
2361         nop.m 999
2362 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2363         nop.i 999
2364 }
2365 { .mfi
2366         nop.m 999
2367 (p12) fma.s1 poly2 = Q1_7, rsq, Q1_6
2368         nop.i 999 ;;
2369 }
2370 { .mfi
2371         nop.m 999
2372 (p11) fma.s1 Poly2 = Poly2, rsq, P1_6
2373         nop.i 999 ;;
2374 }
2375 { .mfi
2376         nop.m 999
2377 (p12) fma.s1 poly1 = S_hi, r, f1
2378         nop.i 999
2379 }
2380 { .mfi
2381         nop.m 999
2382 (p12) fma.s1 poly2 = poly2, rsq, Q1_5
2383         nop.i 999 ;;
2384 }
2385 { .mfi
2386         nop.m 999
2387 (p11) fma.s1 Poly2= Poly2, rsq, P1_5
2388         nop.i 999 ;;
2389 }
2390 { .mfi
2391         nop.m 999
2392 (p12) fma.s1 S_hi =  S_hi, poly1, S_hi
2393         nop.i 999
2394 }
2395 { .mfi
2396         nop.m 999
2397 (p12) fma.s1 poly2 = poly2, rsq, Q1_4
2398         nop.i 999 ;;
2399 }
2400 { .mfi
2401         nop.m 999
2402 //
2403 //    N even: r_to_the_8 = r_to_the_8 * r_to_the_8
2404 //    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
2405 //
2406 (p11) fma.s1 Poly2 = Poly2, rsq, P1_4
2407         nop.i 999 ;;
2408 }
2409 { .mfi
2410         nop.m 999
2411 //
2412 //    N even: Result = CORR + Poly * r
2413 //    N odd:  P = Q1_1 + poly2 * rsq
2414 //
2415 (p12) fma.s1 poly1 = S_hi, r, f1
2416         nop.i 999
2417 }
2418 { .mfi
2419         nop.m 999
2420 (p12) fma.s1 poly2 = poly2, rsq, Q1_3
2421         nop.i 999 ;;
2422 }
2423 { .mfi
2424         nop.m 999
2425 //
2426 //    N even: Poly2 = P1_4 + Poly2 * rsq
2427 //    N odd:  poly2 = Q1_2 + poly2 * rsq
2428 //
2429 (p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1
2430         nop.i 999 ;;
2431 }
2432 { .mfi
2433         nop.m 999
2434 (p12) fma.s1 poly1 = S_hi, c, poly1
2435         nop.i 999
2436 }
2437 { .mfi
2438         nop.m 999
2439 (p12) fma.s1 poly2 = poly2, rsq, Q1_2
2440         nop.i 999 ;;
2441 }
2442
2443 { .mfi
2444         nop.m 999
2445 //
2446 //    N even: Poly = Poly1 + Poly2 * r_to_the_8
2447 //    N odd:  S_hi =  S_hi * poly1 + S_hi    64 bits
2448 //
2449 (p11) fma.s1 Result = Poly, r, CORR
2450         nop.i 999 ;;
2451 }
2452 { .mfi
2453         nop.m 999
2454 //
2455 //    N even: Result =  r + Result  (User supplied rounding mode)
2456 //    N odd:  poly1  =  S_hi * c + poly1
2457 //
2458 (p12) fmpy.s1 S_lo = S_hi, poly1
2459         nop.i 999
2460 }
2461 { .mfi
2462         nop.m 999
2463 (p12) fma.s1 P = poly2, rsq, Q1_1
2464         nop.i 999 ;;
2465 }
2466 { .mfi
2467         nop.m 999
2468 //
2469 //    N odd:  poly1  =  S_hi * r + 1.0
2470 //
2471 //
2472 //    N odd:  S_lo  =  S_hi *  poly1
2473 //
2474 (p11) fadd.s0 Result = Result, r
2475         nop.i 999 ;;
2476 }
2477 { .mfi
2478         nop.m 999
2479 //
2480 //    N odd:  S_lo  =  Q1_1 * c + S_lo
2481 //
2482 (p12) fma.s1 S_lo = Q1_1, c, S_lo
2483         nop.i 999
2484 }
2485 { .mfi
2486         nop.m 999
2487 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2488         nop.i 999 ;;
2489 }
2490 { .mfi
2491         nop.m 999
2492 //
2493 //    N odd:  Result =  S_lo + r * P
2494 //
2495 (p12) fma.s1 Result = P, r, S_lo
2496         nop.i 999 ;;
2497 }
2498 { .mfb
2499         nop.m 999
2500 //
2501 //    N odd:  Result = Result + S_hi  (user supplied rounding mode)
2502 //
2503 (p12) fadd.s0 Result = Result, S_hi
2504 (p0)   br.ret.sptk b0 ;;
2505 }
2506
2507
2508 L(TANL_NORMAL_R):
2509 { .mfi
2510 (p0)  getf.sig sig_r = r
2511 // *******************************************************************
2512 // *******************************************************************
2513 // *******************************************************************
2514 //
2515 //    r and c have been computed.
2516 //    Make sure ftz mode is set - should be automatic when using wre
2517 //
2518 //
2519 //    Get [i_1] -  lsb of N_fix_gr alone.
2520 //
2521 (p0)  fmerge.s  Pos_r = f1, r
2522 (p0)  extr.u i_1 = N_fix_gr, 0, 1 ;;
2523 }
2524 { .mfi
2525         nop.m 999
2526 (p0)  fmerge.s  sgn_r =  r, f1
2527 (p0)  cmp.eq.unc p11, p12 = 0x0000, i_1 ;;
2528 }
2529 { .mfi
2530         nop.m 999
2531         nop.f 999
2532 (p0)  extr.u lookup = sig_r, 58, 5
2533 }
2534 { .mlx
2535         nop.m 999
2536 (p0)  movl Create_B = 0x8200000000000000 ;;
2537 }
2538 { .mfi
2539 (p0)  addl           table_ptr1   = @ltoff(TANL_BASE_CONSTANTS), gp
2540         nop.f 999
2541 (p0)  dep Create_B = lookup, Create_B, 58, 5
2542 }
2543 ;;
2544
2545
2546 //
2547 //    Get [i_1] -  lsb of N_fix_gr alone.
2548 //    Pos_r = abs (r)
2549 //
2550
2551
2552 { .mmi
2553 (p0)  ld8 table_ptr1 = [table_ptr1]
2554       nop.m 999
2555       nop.i 999
2556 }
2557 ;;
2558
2559
2560 { .mmi
2561         nop.m 999
2562 (p0)  setf.sig B = Create_B
2563 //
2564 //    Set table_ptr1 and table_ptr2 to base address of
2565 //    constant table.
2566 //
2567 (p0)  add table_ptr1 = 480, table_ptr1 ;;
2568 }
2569 { .mmb
2570         nop.m 999
2571 //
2572 //    Is i_1 or i_0  == 0 ?
2573 //    Create the constant  1 00000 1000000000000000000000...
2574 //
2575 (p0)  ldfe P2_1 = [table_ptr1], 16
2576         nop.b 999
2577 }
2578 { .mmi
2579         nop.m 999 ;;
2580 (p0)  getf.exp exp_r = Pos_r
2581         nop.i 999
2582 }
2583 //
2584 //    Get r's exponent
2585 //    Get r's significand
2586 //
2587 { .mmi
2588 (p0)  ldfe P2_2 = [table_ptr1], 16 ;;
2589 //
2590 //    Get the 5 bits or r for the lookup.   1.xxxxx ....
2591 //    from sig_r.
2592 //    Grab  lsb of exp of B
2593 //
2594 (p0)  ldfe P2_3 = [table_ptr1], 16
2595         nop.i 999 ;;
2596 }
2597 { .mii
2598         nop.m 999
2599 (p0)  andcm table_offset = 0x0001, exp_r ;;
2600 (p0)  shl table_offset = table_offset, 9 ;;
2601 }
2602 { .mii
2603         nop.m 999
2604 //
2605 //    Deposit   0 00000 1000000000000000000000... on
2606 //              1 xxxxx yyyyyyyyyyyyyyyyyyyyyy...,
2607 //    getting rid of the ys.
2608 //    Is  B = 2** -2 or  B= 2** -1? If 2**-1, then
2609 //    we want an offset of 512 for table addressing.
2610 //
2611 (p0)  shladd table_offset = lookup, 4, table_offset ;;
2612 //
2613 //    B =  ........ 1xxxxx 1000000000000000000...
2614 //
2615 (p0)  add table_ptr1 = table_ptr1, table_offset ;;
2616 }
2617 { .mmb
2618         nop.m 999
2619 //
2620 //   B =  ........ 1xxxxx 1000000000000000000...
2621 //   Convert B so it has the same exponent as Pos_r
2622 //
2623 (p0)  ldfd T_hi = [table_ptr1], 8
2624         nop.b 999 ;;
2625 }
2626
2627
2628
2629 //
2630 //    x = |r| - B
2631 //    Load T_hi.
2632 //    Load C_hi.
2633 //
2634
2635 { .mmf
2636 (p0)  addl           table_ptr2   = @ltoff(TANL_BASE_CONSTANTS), gp
2637 (p0)  ldfs T_lo = [table_ptr1]
2638 (p0)  fmerge.se B = Pos_r, B
2639 }
2640 ;;
2641
2642
2643 { .mmi
2644 (p0)  ld8 table_ptr2 = [table_ptr2]
2645       nop.m 999
2646       nop.i 999
2647 }
2648 ;;
2649
2650
2651 { .mii
2652 (p0)  add table_ptr2 = 1360, table_ptr2
2653         nop.i 999 ;;
2654 (p0)  add table_ptr2 = table_ptr2, table_offset ;;
2655 }
2656 { .mfi
2657 (p0)  ldfd C_hi = [table_ptr2], 8
2658 (p0)  fsub.s1 x = Pos_r, B
2659         nop.i 999 ;;
2660 }
2661 { .mii
2662 (p0)  ldfs C_lo = [table_ptr2],255
2663         nop.i 999 ;;
2664 //
2665 //    xsq = x * x
2666 //    N even: Tx = T_hi * x
2667 //    Load T_lo.
2668 //    Load C_lo - increment pointer to get SC_inv
2669 //    - cant get all the way, do an add later.
2670 //
2671 (p0)  add table_ptr2 = 569, table_ptr2 ;;
2672 }
2673 //
2674 //    N even: Tx1 = Tx + 1
2675 //    N odd:  Cx1 = 1 - Cx
2676 //
2677 { .mfi
2678 (p0)  ldfe SC_inv = [table_ptr2], 0
2679         nop.f 999
2680         nop.i 999 ;;
2681 }
2682 { .mfi
2683         nop.m 999
2684 (p0)  fmpy.s1 xsq = x, x
2685         nop.i 999
2686 }
2687 { .mfi
2688         nop.m 999
2689 (p11) fmpy.s1 Tx = T_hi, x
2690         nop.i 999 ;;
2691 }
2692 { .mfi
2693         nop.m 999
2694 (p12) fmpy.s1 Cx = C_hi, x
2695         nop.i 999 ;;
2696 }
2697 { .mfi
2698         nop.m 999
2699 //
2700 //    N odd: Cx = C_hi * x
2701 //
2702 (p0)  fma.s1 P = P2_3, xsq, P2_2
2703         nop.i 999
2704 }
2705 { .mfi
2706         nop.m 999
2707 //
2708 //    N even and odd: P = P2_3 + P2_2 * xsq
2709 //
2710 (p11) fadd.s1 Tx1 = Tx, f1
2711         nop.i 999 ;;
2712 }
2713 { .mfi
2714         nop.m 999
2715 //
2716 //    N even: D = C_hi - tanx
2717 //    N odd: D = T_hi + tanx
2718 //
2719 (p11) fmpy.s1 CORR = SC_inv, T_hi
2720         nop.i 999
2721 }
2722 { .mfi
2723         nop.m 999
2724 (p0)  fmpy.s1 Sx = SC_inv, x
2725         nop.i 999 ;;
2726 }
2727 { .mfi
2728         nop.m 999
2729 (p12) fmpy.s1 CORR = SC_inv, C_hi
2730         nop.i 999 ;;
2731 }
2732 { .mfi
2733         nop.m 999
2734 (p12) fsub.s1 V_hi = f1, Cx
2735         nop.i 999 ;;
2736 }
2737 { .mfi
2738         nop.m 999
2739 (p0)  fma.s1 P = P, xsq, P2_1
2740         nop.i 999
2741 }
2742 { .mfi
2743         nop.m 999
2744 //
2745 //    N even and odd: P = P2_1 + P * xsq
2746 //
2747 (p11) fma.s1 V_hi = Tx, Tx1, f1
2748         nop.i 999 ;;
2749 }
2750 { .mfi
2751         nop.m 999
2752 //
2753 //    N even: Result  = sgn_r * tail + T_hi (user rounding mode for C1)
2754 //    N odd:  Result  = sgn_r * tail + C_hi (user rounding mode for C1)
2755 //
2756 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2757         nop.i 999 ;;
2758 }
2759 { .mfi
2760         nop.m 999
2761 (p0)  fmpy.s1 CORR = CORR, c
2762         nop.i 999 ;;
2763 }
2764 { .mfi
2765         nop.m 999
2766 (p12) fnma.s1 V_hi = Cx,V_hi,f1
2767         nop.i 999 ;;
2768 }
2769 { .mfi
2770         nop.m 999
2771 //
2772 //    N even: V_hi = Tx * Tx1 + 1
2773 //    N odd: Cx1 = 1 - Cx * Cx1
2774 //
2775 (p0)  fmpy.s1 P = P, xsq
2776         nop.i 999
2777 }
2778 { .mfi
2779         nop.m 999
2780 //
2781 //    N even and odd: P = P * xsq
2782 //
2783 (p11) fmpy.s1 V_hi = V_hi, T_hi
2784         nop.i 999 ;;
2785 }
2786 { .mfi
2787         nop.m 999
2788 //
2789 //    N even and odd: tail = P * tail + V_lo
2790 //
2791 (p11) fmpy.s1 T_hi = sgn_r, T_hi
2792         nop.i 999 ;;
2793 }
2794 { .mfi
2795         nop.m 999
2796 (p0)  fmpy.s1 CORR = CORR, sgn_r
2797         nop.i 999 ;;
2798 }
2799 { .mfi
2800         nop.m 999
2801 (p12) fmpy.s1 V_hi = V_hi,C_hi
2802         nop.i 999 ;;
2803 }
2804 { .mfi
2805         nop.m 999
2806 //
2807 //    N even: V_hi = T_hi * V_hi
2808 //    N odd: V_hi  = C_hi * V_hi
2809 //
2810 (p0)  fma.s1 tanx = P, x, x
2811         nop.i 999
2812 }
2813 { .mfi
2814         nop.m 999
2815 (p12) fnmpy.s1 C_hi = sgn_r, C_hi
2816         nop.i 999 ;;
2817 }
2818 { .mfi
2819         nop.m 999
2820 //
2821 //    N even: V_lo = 1 - V_hi + C_hi
2822 //    N odd: V_lo = 1 - V_hi + T_hi
2823 //
2824 (p11) fadd.s1 CORR = CORR, T_lo
2825         nop.i 999
2826 }
2827 { .mfi
2828         nop.m 999
2829 (p12) fsub.s1 CORR = CORR, C_lo
2830         nop.i 999 ;;
2831 }
2832 { .mfi
2833         nop.m 999
2834 //
2835 //    N even and odd: tanx = x + x * P
2836 //    N even and odd: Sx = SC_inv * x
2837 //
2838 (p11) fsub.s1 D = C_hi, tanx
2839         nop.i 999
2840 }
2841 { .mfi
2842         nop.m 999
2843 (p12) fadd.s1 D = T_hi, tanx
2844         nop.i 999 ;;
2845 }
2846 { .mfi
2847         nop.m 999
2848 //
2849 //    N odd: CORR = SC_inv * C_hi
2850 //    N even: CORR = SC_inv * T_hi
2851 //
2852 (p0)  fnma.s1 D = V_hi, D, f1
2853         nop.i 999 ;;
2854 }
2855 { .mfi
2856         nop.m 999
2857 //
2858 //    N even and odd: D = 1 - V_hi * D
2859 //    N even and odd: CORR = CORR * c
2860 //
2861 (p0)  fma.s1 V_hi = V_hi, D, V_hi
2862         nop.i 999 ;;
2863 }
2864 { .mfi
2865         nop.m 999
2866 //
2867 //    N even and odd: V_hi = V_hi + V_hi * D
2868 //    N even and odd: CORR = sgn_r * CORR
2869 //
2870 (p11) fnma.s1 V_lo = V_hi, C_hi, f1
2871         nop.i 999
2872 }
2873 { .mfi
2874         nop.m 999
2875 (p12) fnma.s1 V_lo = V_hi, T_hi, f1
2876         nop.i 999 ;;
2877 }
2878 { .mfi
2879         nop.m 999
2880 //
2881 //    N even: CORR = COOR + T_lo
2882 //    N odd: CORR = CORR - C_lo
2883 //
2884 (p11) fma.s1 V_lo = tanx, V_hi, V_lo
2885         nop.i 999
2886 }
2887 { .mfi
2888         nop.m 999
2889 (p12) fnma.s1 V_lo = tanx, V_hi, V_lo
2890         nop.i 999 ;;
2891 }
2892 { .mfi
2893         nop.m 999
2894 //
2895 //    N even: V_lo = V_lo + V_hi * tanx
2896 //    N odd: V_lo = V_lo - V_hi * tanx
2897 //
2898 (p11) fnma.s1 V_lo = C_lo, V_hi, V_lo
2899         nop.i 999
2900 }
2901 { .mfi
2902         nop.m 999
2903 (p12) fnma.s1 V_lo = T_lo, V_hi, V_lo
2904         nop.i 999 ;;
2905 }
2906 { .mfi
2907         nop.m 999
2908 //
2909 //    N  even: V_lo = V_lo - V_hi * C_lo
2910 //    N  odd: V_lo = V_lo - V_hi * T_lo
2911 //
2912 (p0)  fmpy.s1 V_lo = V_hi, V_lo
2913         nop.i 999 ;;
2914 }
2915 { .mfi
2916         nop.m 999
2917 //
2918 //    N even and odd: V_lo = V_lo * V_hi
2919 //
2920 (p0)  fadd.s1 tail = V_hi, V_lo
2921         nop.i 999 ;;
2922 }
2923 { .mfi
2924         nop.m 999
2925 //
2926 //    N even and odd: tail = V_hi + V_lo
2927 //
2928 (p0)  fma.s1 tail = tail, P, V_lo
2929         nop.i 999 ;;
2930 }
2931 { .mfi
2932         nop.m 999
2933 //
2934 //    N even: T_hi = sgn_r * T_hi
2935 //    N odd : C_hi = -sgn_r * C_hi
2936 //
2937 (p0)  fma.s1 tail = tail, Sx, CORR
2938         nop.i 999 ;;
2939 }
2940 { .mfi
2941         nop.m 999
2942 //
2943 //    N even and odd: tail = Sx * tail + CORR
2944 //
2945 (p0)  fma.s1 tail = V_hi, Sx, tail
2946         nop.i 999 ;;
2947 }
2948 { .mfi
2949         nop.m 999
2950 //
2951 //    N even an odd: tail = Sx * V_hi + tail
2952 //
2953 (p11) fma.s0 Result = sgn_r, tail, T_hi
2954         nop.i 999
2955 }
2956 { .mfb
2957         nop.m 999
2958 (p12) fma.s0 Result = sgn_r, tail, C_hi
2959 (p0)   br.ret.sptk b0 ;;
2960 }
2961
2962 L(TANL_SPECIAL):
2963 { .mfb
2964         nop.m 999
2965 (p0)   fmpy.s0 Arg = Arg, f0
2966 (p0)   br.ret.sptk b0 ;;
2967 }
2968 //
2969 //     Code for NaNs, Unsupporteds, Infs, or +/- zero ?
2970 //     Invalid raised for Infs and SNaNs.
2971 //
2972
2973 .endp  tanl
2974 ASM_SIZE_DIRECTIVE(tanl)
2975
2976 // *******************************************************************
2977 // *******************************************************************
2978 // *******************************************************************
2979 //
2980 //     Special Code to handle very large argument case.
2981 //     Call int pi_by_2_reduce(&x,&r,&c)
2982 //     for |arguments| >= 2**63
2983 //     (Arg or x) is in f8
2984 //     Address to save r and c as double
2985 // *******************************************************************
2986 // *******************************************************************
2987 // *******************************************************************
2988
2989 .proc __libm_callout
2990 __libm_callout:
2991 L(TANL_ARG_TOO_LARGE):
2992 .prologue
2993 { .mfi
2994         add   r50=-32,sp                        // Parameter: r address
2995         nop.f 0
2996 .save   ar.pfs,GR_SAVE_PFS
2997         mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
2998 }
2999 { .mfi
3000 .fframe 64
3001         add sp=-64,sp                           // Create new stack
3002         nop.f 0
3003         mov GR_SAVE_GP=gp                       // Save gp
3004 };;
3005 { .mmi
3006         stfe [r50] = f0,16                      // Clear Parameter r on stack
3007         add  r49 = 16,sp                        // Parameter x address
3008 .save   b0, GR_SAVE_B0
3009         mov GR_SAVE_B0=b0                       // Save b0
3010 };;
3011 .body
3012 { .mib
3013         stfe [r50] = f0,-16                     // Clear Parameter c on stack
3014         nop.i 0
3015         nop.b 0
3016 }
3017 { .mib
3018         stfe [r49] = Arg                        // Store Parameter x on stack
3019         nop.i 0
3020 (p0)    br.call.sptk b0=__libm_pi_by_2_reduce# ;;
3021 };;
3022 //
3023 //     Load 2^-2
3024 //
3025 { .mmi
3026 (p0)   ldfe  Arg =[r49],16
3027 //
3028 //     Call argument reduction
3029 //
3030 (p0)   ldfs  TWO_TO_NEG2 = [table_ptr2],4
3031 //     Get Arg off stack
3032 //     Get r off stack - hi order part
3033 //     Get c off stack - lo order part
3034 (p0)   mov   N_fix_gr = r8 ;;
3035 }
3036 { .mmb
3037 (p0)   ldfe  r =[r50],16
3038 (p0)   ldfs  NEGTWO_TO_NEG2 = [table_ptr2],4
3039         nop.b 999 ;;
3040 }
3041 { .mfi
3042 (p0)   ldfe  c =[r50],-32
3043         nop.f 999
3044         nop.i 999 ;;
3045 }
3046 { .mfi
3047 .restore sp
3048        add   sp = 64,sp                       // Restore stack pointer
3049 //
3050 //     Is |r| < 2**(-2)
3051 //
3052 (p0)   fcmp.lt.unc.s1  p6, p0 = r, TWO_TO_NEG2
3053 mov    b0 = GR_SAVE_B0                        // Restore return address
3054 };;
3055 { .mfi
3056        mov   gp = GR_SAVE_GP                  // Restore gp
3057 (p6)   fcmp.gt.unc.s1  p6, p0 = r, NEGTWO_TO_NEG2
3058        mov   ar.pfs = GR_SAVE_PFS             // Restore gp
3059 };;
3060 { .mbb
3061         nop.m 999
3062 (p6)   br.cond.spnt L(TANL_SMALL_R)
3063 (p0)   br.cond.sptk L(TANL_NORMAL_R) ;;
3064 }
3065
3066 .endp __libm_callout
3067 ASM_SIZE_DIRECTIVE(__libm_callout)
3068
3069 .type __libm_pi_by_2_reduce#,@function
3070 .global __libm_pi_by_2_reduce#