sysdeps/ia64/fpu/libm_tan.S

   1 .file "libm_tan.s"
   2
   3 // Copyright (c) 2000, 2001, Intel Corporation
   4 // All rights reserved.
   5 //
   6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
   7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
   8 //
   9 // WARRANTY DISCLAIMER
  10 //
  11 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  12 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  13 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  14 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  15 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  16 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  17 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  18 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  19 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  20 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  21 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  22 //
  23 // Intel Corporation is the author of this code, and requests that all
  24 // problem reports or change requests be submitted to it directly at
  25 // http://developer.intel.com/opensource.
  26 //
  27 // *********************************************************************
  28 //
  29 // History:
  30 // 02/02/00 Initial Version
  31 // 4/04/00  Unwind support added
  32 // 12/28/00 Fixed false invalid flags
  33 //
  34 // *********************************************************************
  35 //
  36 // Function:   tan(x) = tangent(x), for double precision x values
  37 //
  38 // *********************************************************************
  39 //
  40 // Accuracy:       Very accurate for double-precision values
  41 //
  42 // *********************************************************************
  43 //
  44 // Resources Used:
  45 //
  46 //    Floating-Point Registers: f8 (Input and Return Value)
  47 //                              f9-f15
  48 //                              f32-f112
  49 //
  50 //    General Purpose Registers:
  51 //      r32-r48
  52 //      r49-r50 (Used to pass arguments to pi_by_2 reduce routine)
  53 //
  54 //    Predicate Registers:      p6-p15
  55 //
  56 // *********************************************************************
  57 //
  58 // IEEE Special Conditions:
  59 //
  60 //    Denormal  fault raised on denormal inputs
  61 //    Overflow exceptions do not occur
  62 //    Underflow exceptions raised when appropriate for tan
  63 //    (No specialized error handling for this routine)
  64 //    Inexact raised when appropriate by algorithm
  65 //
  66 //    tan(SNaN) = QNaN
  67 //    tan(QNaN) = QNaN
  68 //    tan(inf) = QNaN
  69 //    tan(+/-0) = +/-0
  70 //
  71 // *********************************************************************
  72 //
  73 // Mathematical Description
  74 //
  75 // We consider the computation of FPTAN of Arg. Now, given
  76 //
  77 //      Arg = N pi/2  + alpha,          |alpha| <= pi/4,
  78 //
  79 // basic mathematical relationship shows that
  80 //
  81 //      tan( Arg ) =  tan( alpha )     if N is even;
  82 //                 = -cot( alpha )      otherwise.
  83 //
  84 // The value of alpha is obtained by argument reduction and
  85 // represented by two working precision numbers r and c where
  86 //
  87 //      alpha =  r  +  c     accurately.
  88 //
  89 // The reduction method is described in a previous write up.
  90 // The argument reduction scheme identifies 4 cases. For Cases 2
  91 // and 4, because |alpha| is small, tan(r+c) and -cot(r+c) can be
  92 // computed very easily by 2 or 3 terms of the Taylor series
  93 // expansion as follows:
  94 //
  95 // Case 2:
  96 // -------
  97 //
  98 //      tan(r + c) = r + c + r^3/3          ...accurately
  99 //        -cot(r + c) = -1/(r+c) + r/3          ...accurately
 100 //
 101 // Case 4:
 102 // -------
 103 //
 104 //      tan(r + c) = r + c + r^3/3 + 2r^5/15     ...accurately
 105 //        -cot(r + c) = -1/(r+c) + r/3 + r^3/45     ...accurately
 106 //
 107 //
 108 // The only cases left are Cases 1 and 3 of the argument reduction
 109 // procedure. These two cases will be merged since after the
 110 // argument is reduced in either cases, we have the reduced argument
 111 // represented as r + c and that the magnitude |r + c| is not small
 112 // enough to allow the usage of a very short approximation.
 113 //
 114 // The greatest challenge of this task is that the second terms of
 115 // the Taylor series for tan(r) and -cot(r)
 116 //
 117 //      r + r^3/3 + 2 r^5/15 + ...
 118 //
 119 // and
 120 //
 121 //      -1/r + r/3 + r^3/45 + ...
 122 //
 123 // are not very small when |r| is close to pi/4 and the rounding
 124 // errors will be a concern if simple polynomial accumulation is
 125 // used. When |r| < 2^(-2), however, the second terms will be small
 126 // enough (5 bits or so of right shift) that a normal Horner
 127 // recurrence suffices. Hence there are two cases that we consider
 128 // in the accurate computation of tan(r) and cot(r), |r| <= pi/4.
 129 //
 130 // Case small_r: |r| < 2^(-2)
 131 // --------------------------
 132 //
 133 // Since Arg = N pi/4 + r + c accurately, we have
 134 //
 135 //      tan(Arg) =  tan(r+c)            for N even,
 136 //            = -cot(r+c)          otherwise.
 137 //
 138 // Here for this case, both tan(r) and -cot(r) can be approximated
 139 // by simple polynomials:
 140 //
 141 //      tan(r) =    r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 142 //        -cot(r) = -1/r + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 143 //
 144 // accurately. Since |r| is relatively small, tan(r+c) and
 145 // -cot(r+c) can be accurately approximated by replacing r with
 146 // r+c only in the first two terms of the corresponding polynomials.
 147 //
 148 // Note that P1_1 (and Q1_1 for that matter) approximates 1/3 to
 149 // almost 64 sig. bits, thus
 150 //
 151 //      P1_1 (r+c)^3 =  P1_1 r^3 + c * r^2     accurately.
 152 //
 153 // Hence,
 154 //
 155 //      tan(r+c) =    r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 156 //                     + c*(1 + r^2)
 157 //
 158 //        -cot(r+c) = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 159 //               + Q1_1*c
 160 //
 161 //
 162 // Case normal_r: 2^(-2) <= |r| <= pi/4
 163 // ------------------------------------
 164 //
 165 // This case is more likely than the previous one if one considers
 166 // r to be uniformly distributed in [-pi/4 pi/4].
 167 //
 168 // The required calculation is either
 169 //
 170 //      tan(r + c)  =  tan(r)  +  correction,  or
 171 //        -cot(r + c)  = -cot(r)  +  correction.
 172 //
 173 // Specifically,
 174 //
 175 //      tan(r + c) =  tan(r) + c tan'(r)  + O(c^2)
 176 //              =  tan(r) + c sec^2(r) + O(c^2)
 177 //              =  tan(r) + c SEC_sq     ...accurately
 178 //                as long as SEC_sq approximates sec^2(r)
 179 //                to, say, 5 bits or so.
 180 //
 181 // Similarly,
 182 //
 183 //        -cot(r + c) = -cot(r) - c cot'(r)  + O(c^2)
 184 //              = -cot(r) + c csc^2(r) + O(c^2)
 185 //              = -cot(r) + c CSC_sq     ...accurately
 186 //                as long as CSC_sq approximates csc^2(r)
 187 //                to, say, 5 bits or so.
 188 //
 189 // We therefore concentrate on accurately calculating tan(r) and
 190 // cot(r) for a working-precision number r, |r| <= pi/4 to within
 191 // 0.1% or so.
 192 //
 193 // We will employ a table-driven approach. Let
 194 //
 195 //      r = sgn_r * 2^k * 1.b_1 b_2 ... b_5 ... b_63
 196 //        = sgn_r * ( B + x )
 197 //
 198 // where
 199 //
 200 //      B = 2^k * 1.b_1 b_2 ... b_5 1
 201 //         x = |r| - B
 202 //
 203 // Now,
 204 //                   tan(B)  +   tan(x)
 205 //      tan( B + x ) =  ------------------------
 206 //                   1 -  tan(B)*tan(x)
 207 //
 208 //               /                         \
 209 //               |   tan(B)  +   tan(x)          |
 210
 211 //      = tan(B) +  | ------------------------ - tan(B) |
 212 //               |     1 -  tan(B)*tan(x)          |
 213 //               \                         /
 214 //
 215 //                 sec^2(B) * tan(x)
 216 //      = tan(B) + ------------------------
 217 //                 1 -  tan(B)*tan(x)
 218 //
 219 //                (1/[sin(B)*cos(B)]) * tan(x)
 220 //      = tan(B) + --------------------------------
 221 //                      cot(B)  -  tan(x)
 222 //
 223 //
 224 // Clearly, the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
 225 // calculated beforehand and stored in a table. Since
 226 //
 227 //      |x| <= 2^k * 2^(-6)  <= 2^(-7)  (because k = -1, -2)
 228 //
 229 // a very short polynomial will be sufficient to approximate tan(x)
 230 // accurately. The details involved in computing the last expression
 231 // will be given in the next section on algorithm description.
 232 //
 233 //
 234 // Now, we turn to the case where cot( B + x ) is needed.
 235 //
 236 //
 237 //                   1 - tan(B)*tan(x)
 238 //      cot( B + x ) =  ------------------------
 239 //                   tan(B)  +  tan(x)
 240 //
 241 //               /                           \
 242 //               |   1 - tan(B)*tan(x)              |
 243
 244 //      = cot(B) +  | ----------------------- - cot(B) |
 245 //               |     tan(B)  +  tan(x)            |
 246 //               \                           /
 247 //
 248 //               [tan(B) + cot(B)] * tan(x)
 249 //      = cot(B) - ----------------------------
 250 //                   tan(B)  +  tan(x)
 251 //
 252 //                (1/[sin(B)*cos(B)]) * tan(x)
 253 //      = cot(B) - --------------------------------
 254 //                      tan(B)  +  tan(x)
 255 //
 256 //
 257 // Note that the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) that
 258 // are needed are the same set of values needed in the previous
 259 // case.
 260 //
 261 // Finally, we can put all the ingredients together as follows:
 262 //
 263 //      Arg = N * pi/2 +  r + c          ...accurately
 264 //
 265 //      tan(Arg) =  tan(r) + correction    if N is even;
 266 //            = -cot(r) + correction    otherwise.
 267 //
 268 // For Cases 2 and 4,
 269 //
 270 //     Case 2:
 271 //     tan(Arg) =  tan(r + c) = r + c + r^3/3           N even
 272 //              = -cot(r + c) = -1/(r+c) + r/3           N odd
 273 //     Case 4:
 274 //     tan(Arg) =  tan(r + c) = r + c + r^3/3 + 2r^5/15  N even
 275 //              = -cot(r + c) = -1/(r+c) + r/3 + r^3/45  N odd
 276 //
 277 //
 278 // For Cases 1 and 3,
 279 //
 280 //     Case small_r: |r| < 2^(-2)
 281 //
 282 //      tan(Arg) =  r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 283 //                     + c*(1 + r^2)               N even
 284 //
 285 //                  = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 286 //               + Q1_1*c                    N odd
 287 //
 288 //     Case normal_r: 2^(-2) <= |r| <= pi/4
 289 //
 290 //      tan(Arg) =  tan(r) + c * sec^2(r)     N even
 291 //               = -cot(r) + c * csc^2(r)     otherwise
 292 //
 293 //     For N even,
 294 //
 295 //      tan(Arg) = tan(r) + c*sec^2(r)
 296 //               = tan( sgn_r * (B+x) ) + c * sec^2(|r|)
 297 //                  = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(|r|) )
 298 //                  = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(B) )
 299 //
 300 // since B approximates |r| to 2^(-6) in relative accuracy.
 301 //
 302 //                 /            (1/[sin(B)*cos(B)]) * tan(x)
 303 //    tan(Arg) = sgn_r * | tan(B) + --------------------------------
 304 //                 \                     cot(B)  -  tan(x)
 305 //                                        \
 306 //                       + CORR  |
 307
 308 //                                     /
 309 // where
 310 //
 311 //    CORR = sgn_r*c*tan(B)*SC_inv(B);  SC_inv(B) = 1/(sin(B)*cos(B)).
 312 //
 313 // For N odd,
 314 //
 315 //      tan(Arg) = -cot(r) + c*csc^2(r)
 316 //               = -cot( sgn_r * (B+x) ) + c * csc^2(|r|)
 317 //                  = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(|r|) )
 318 //                  = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(B) )
 319 //
 320 // since B approximates |r| to 2^(-6) in relative accuracy.
 321 //
 322 //                 /            (1/[sin(B)*cos(B)]) * tan(x)
 323 //    tan(Arg) = sgn_r * | -cot(B) + --------------------------------
 324 //                 \                     tan(B)  +  tan(x)
 325 //                                        \
 326 //                       + CORR  |
 327
 328 //                                     /
 329 // where
 330 //
 331 //    CORR = sgn_r*c*cot(B)*SC_inv(B);  SC_inv(B) = 1/(sin(B)*cos(B)).
 332 //
 333 //
 334 // The actual algorithm prescribes how all the mathematical formulas
 335 // are calculated.
 336 //
 337 //
 338 // 2. Algorithmic Description
 339 // ==========================
 340 //
 341 // 2.1 Computation for Cases 2 and 4.
 342 // ----------------------------------
 343 //
 344 // For Case 2, we use two-term polynomials.
 345 //
 346 //    For N even,
 347 //
 348 //    rsq := r * r
 349 //    Result := c + r * rsq * P1_1
 350 //    Result := r + Result          ...in user-defined rounding
 351 //
 352 //    For N odd,
 353 //    S_hi  := -frcpa(r)               ...8 bits
 354 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
 355 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
 356 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
 357 //    S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
 358 //    ...S_hi + S_lo is -1/(r+c) to extra precision
 359 //    S_lo  := S_lo + Q1_1*r
 360 //
 361 //    Result := S_hi + S_lo     ...in user-defined rounding
 362 //
 363 // For Case 4, we use three-term polynomials
 364 //
 365 //    For N even,
 366 //
 367 //    rsq := r * r
 368 //    Result := c + r * rsq * (P1_1 + rsq * P1_2)
 369 //    Result := r + Result          ...in user-defined rounding
 370 //
 371 //    For N odd,
 372 //    S_hi  := -frcpa(r)               ...8 bits
 373 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
 374 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
 375 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
 376 //    S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
 377 //    ...S_hi + S_lo is -1/(r+c) to extra precision
 378 //    rsq   := r * r
 379 //    P      := Q1_1 + rsq*Q1_2
 380 //    S_lo  := S_lo + r*P
 381 //
 382 //    Result := S_hi + S_lo     ...in user-defined rounding
 383 //
 384 //
 385 // Note that the coefficients P1_1, P1_2, Q1_1, and Q1_2 are
 386 // the same as those used in the small_r case of Cases 1 and 3
 387 // below.
 388 //
 389 //
 390 // 2.2 Computation for Cases 1 and 3.
 391 // ----------------------------------
 392 // This is further divided into the case of small_r,
 393 // where |r| < 2^(-2), and the case of normal_r, where |r| lies between
 394 // 2^(-2) and pi/4.
 395 //
 396 // Algorithm for the case of small_r
 397 // ---------------------------------
 398 //
 399 // For N even,
 400 //      rsq   := r * r
 401 //      Poly1 := rsq*(P1_1 + rsq*(P1_2 + rsq*P1_3))
 402 //      r_to_the_8    := rsq * rsq
 403 //      r_to_the_8    := r_to_the_8 * r_to_the_8
 404 //      Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... rsq*P1_9))
 405 //      CORR  := c * ( 1 + rsq )
 406 //      Poly  := Poly1 + r_to_the_8*Poly2
 407 //      Result := r*Poly + CORR
 408 //      Result := r + Result     ...in user-defined rounding
 409 //      ...note that Poly1 and r_to_the_8 can be computed in parallel
 410 //      ...with Poly2 (Poly1 is intentionally set to be much
 411 //      ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden)
 412 //
 413 // For N odd,
 414 //      S_hi  := -frcpa(r)               ...8 bits
 415 //      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
 416 //      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
 417 //      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
 418 //      S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
 419 //      ...S_hi + S_lo is -1/(r+c) to extra precision
 420 //      S_lo  := S_lo + Q1_1*c
 421 //
 422 //      ...S_hi and S_lo are computed in parallel with
 423 //      ...the following
 424 //      rsq := r*r
 425 //      P   := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7))
 426 //
 427 //      Result :=  r*P + S_lo
 428 //      Result :=  S_hi  +  Result      ...in user-defined rounding
 429 //
 430 //
 431 // Algorithm for the case of normal_r
 432 // ----------------------------------
 433 //
 434 // Here, we first consider the computation of tan( r + c ). As
 435 // presented in the previous section,
 436 //
 437 //      tan( r + c )  =  tan(r) + c * sec^2(r)
 438 //                 =  sgn_r * [ tan(B+x) + CORR ]
 439 //      CORR = sgn_r * c * tan(B) * 1/[sin(B)*cos(B)]
 440 //
 441 // because sec^2(r) = sec^(|r|), and B approximate |r| to 6.5 bits.
 442 //
 443 //      tan( r + c ) =
 444 //           /           (1/[sin(B)*cos(B)]) * tan(x)
 445 //      sgn_r * | tan(B) + --------------------------------  +
 446 //           \                     cot(B)  -  tan(x)
 447 //                                \
 448 //                          CORR  |
 449
 450 //                                /
 451 //
 452 // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
 453 // calculated beforehand and stored in a table. Specifically,
 454 // the table values are
 455 //
 456 //      tan(B)                as  T_hi  +  T_lo;
 457 //      cot(B)             as  C_hi  +  C_lo;
 458 //      1/[sin(B)*cos(B)]  as  SC_inv
 459 //
 460 // T_hi, C_hi are in  double-precision  memory format;
 461 // T_lo, C_lo are in  single-precision  memory format;
 462 // SC_inv     is  in extended-precision memory format.
 463 //
 464 // The value of tan(x) will be approximated by a short polynomial of
 465 // the form
 466 //
 467 //      tan(x)  as  x  +  x * P, where
 468 //           P  =   x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
 469 //
 470 // Because |x| <= 2^(-7), cot(B) - x approximates cot(B) - tan(x)
 471 // to a relative accuracy better than 2^(-20). Thus, a good
 472 // initial guess of 1/( cot(B) - tan(x) ) to initiate the iterative
 473 // division is:
 474 //
 475 //      1/(cot(B) - tan(x))      is approximately
 476 //      1/(cot(B) -   x)         is
 477 //      tan(B)/(1 - x*tan(B))    is approximately
 478 //      T_hi / ( 1 - T_hi * x )  is approximately
 479 //
 480 //      T_hi * [ 1 + (Thi * x) + (T_hi * x)^2 ]
 481 //
 482 // The calculation of tan(r+c) therefore proceed as follows:
 483 //
 484 //      Tx     := T_hi * x
 485 //      xsq     := x * x
 486 //
 487 //      V_hi     := T_hi*(1 + Tx*(1 + Tx))
 488 //      P     := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
 489 //      ...V_hi serves as an initial guess of 1/(cot(B) - tan(x))
 490 //         ...good to about 20 bits of accuracy
 491 //
 492 //      tanx     := x + x*P
 493 //      D     := C_hi - tanx
 494 //      ...D is a double precision denominator: cot(B) - tan(x)
 495 //
 496 //      V_hi     := V_hi + V_hi*(1 - V_hi*D)
 497 //      ....V_hi approximates 1/(cot(B)-tan(x)) to 40 bits
 498 //
 499 //      V_lo     := V_hi * ( [ (1 - V_hi*C_hi) + V_hi*tanx ]
 500 //                           - V_hi*C_lo )   ...observe all order
 501 //         ...V_hi + V_lo approximates 1/(cot(B) - tan(x))
 502 //      ...to extra accuracy
 503 //
 504 //      ...               SC_inv(B) * (x + x*P)
 505 //      ...   tan(B) +      ------------------------- + CORR
 506 //         ...                cot(B) - (x + x*P)
 507 //      ...
 508 //      ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 509 //      ...
 510 //
 511 //      Sx     := SC_inv * x
 512 //      CORR     := sgn_r * c * SC_inv * T_hi
 513 //
 514 //      ...put the ingredients together to compute
 515 //      ...               SC_inv(B) * (x + x*P)
 516 //      ...   tan(B) +      ------------------------- + CORR
 517 //         ...                cot(B) - (x + x*P)
 518 //      ...
 519 //      ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 520 //      ...
 521 //      ... = T_hi + T_lo + CORR +
 522 //      ...    Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
 523 //
 524 //      CORR := CORR + T_lo
 525 //      tail := V_lo + P*(V_hi + V_lo)
 526 //         tail := Sx * tail  +  CORR
 527 //      tail := Sx * V_hi  +  tail
 528 //         T_hi := sgn_r * T_hi
 529 //
 530 //         ...T_hi + sgn_r*tail  now approximate
 531 //      ...sgn_r*(tan(B+x) + CORR) accurately
 532 //
 533 //      Result :=  T_hi + sgn_r*tail  ...in user-defined
 534 //                           ...rounding control
 535 //      ...It is crucial that independent paths be fully
 536 //      ...exploited for performance's sake.
 537 //
 538 //
 539 // Next, we consider the computation of -cot( r + c ). As
 540 // presented in the previous section,
 541 //
 542 //        -cot( r + c )  =  -cot(r) + c * csc^2(r)
 543 //                 =  sgn_r * [ -cot(B+x) + CORR ]
 544 //      CORR = sgn_r * c * cot(B) * 1/[sin(B)*cos(B)]
 545 //
 546 // because csc^2(r) = csc^(|r|), and B approximate |r| to 6.5 bits.
 547 //
 548 //        -cot( r + c ) =
 549 //           /             (1/[sin(B)*cos(B)]) * tan(x)
 550 //      sgn_r * | -cot(B) + --------------------------------  +
 551 //           \                     tan(B)  +  tan(x)
 552 //                                \
 553 //                          CORR  |
 554
 555 //                                /
 556 //
 557 // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
 558 // calculated beforehand and stored in a table. Specifically,
 559 // the table values are
 560 //
 561 //      tan(B)                as  T_hi  +  T_lo;
 562 //      cot(B)             as  C_hi  +  C_lo;
 563 //      1/[sin(B)*cos(B)]  as  SC_inv
 564 //
 565 // T_hi, C_hi are in  double-precision  memory format;
 566 // T_lo, C_lo are in  single-precision  memory format;
 567 // SC_inv     is  in extended-precision memory format.
 568 //
 569 // The value of tan(x) will be approximated by a short polynomial of
 570 // the form
 571 //
 572 //      tan(x)  as  x  +  x * P, where
 573 //           P  =   x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
 574 //
 575 // Because |x| <= 2^(-7), tan(B) + x approximates tan(B) + tan(x)
 576 // to a relative accuracy better than 2^(-18). Thus, a good
 577 // initial guess of 1/( tan(B) + tan(x) ) to initiate the iterative
 578 // division is:
 579 //
 580 //      1/(tan(B) + tan(x))      is approximately
 581 //      1/(tan(B) +   x)         is
 582 //      cot(B)/(1 + x*cot(B))    is approximately
 583 //      C_hi / ( 1 + C_hi * x )  is approximately
 584 //
 585 //      C_hi * [ 1 - (C_hi * x) + (C_hi * x)^2 ]
 586 //
 587 // The calculation of -cot(r+c) therefore proceed as follows:
 588 //
 589 //      Cx     := C_hi * x
 590 //      xsq     := x * x
 591 //
 592 //      V_hi     := C_hi*(1 - Cx*(1 - Cx))
 593 //      P     := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
 594 //      ...V_hi serves as an initial guess of 1/(tan(B) + tan(x))
 595 //         ...good to about 18 bits of accuracy
 596 //
 597 //      tanx     := x + x*P
 598 //      D     := T_hi + tanx
 599 //      ...D is a double precision denominator: tan(B) + tan(x)
 600 //
 601 //      V_hi     := V_hi + V_hi*(1 - V_hi*D)
 602 //      ....V_hi approximates 1/(tan(B)+tan(x)) to 40 bits
 603 //
 604 //      V_lo     := V_hi * ( [ (1 - V_hi*T_hi) - V_hi*tanx ]
 605 //                           - V_hi*T_lo )   ...observe all order
 606 //         ...V_hi + V_lo approximates 1/(tan(B) + tan(x))
 607 //      ...to extra accuracy
 608 //
 609 //      ...               SC_inv(B) * (x + x*P)
 610 //      ...  -cot(B) +      ------------------------- + CORR
 611 //         ...                tan(B) + (x + x*P)
 612 //      ...
 613 //      ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 614 //      ...
 615 //
 616 //      Sx     := SC_inv * x
 617 //      CORR     := sgn_r * c * SC_inv * C_hi
 618 //
 619 //      ...put the ingredients together to compute
 620 //      ...               SC_inv(B) * (x + x*P)
 621 //      ...  -cot(B) +      ------------------------- + CORR
 622 //         ...                tan(B) + (x + x*P)
 623 //      ...
 624 //      ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 625 //      ...
 626 //      ... =-C_hi - C_lo + CORR +
 627 //      ...    Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
 628 //
 629 //      CORR := CORR - C_lo
 630 //      tail := V_lo + P*(V_hi + V_lo)
 631 //         tail := Sx * tail  +  CORR
 632 //      tail := Sx * V_hi  +  tail
 633 //         C_hi := -sgn_r * C_hi
 634 //
 635 //         ...C_hi + sgn_r*tail now approximates
 636 //      ...sgn_r*(-cot(B+x) + CORR) accurately
 637 //
 638 //      Result :=  C_hi + sgn_r*tail   in user-defined rounding control
 639 //      ...It is crucial that independent paths be fully
 640 //      ...exploited for performance's sake.
 641 //
 642 // 3. Implementation Notes
 643 // =======================
 644 //
 645 //   Table entries T_hi, T_lo; C_hi, C_lo; SC_inv
 646 //
 647 //   Recall that 2^(-2) <= |r| <= pi/4;
 648 //
 649 //      r = sgn_r * 2^k * 1.b_1 b_2 ... b_63
 650 //
 651 //   and
 652 //
 653 //        B = 2^k * 1.b_1 b_2 b_3 b_4 b_5 1
 654 //
 655 //   Thus, for k = -2, possible values of B are
 656 //
 657 //          B = 2^(-2) * ( 1 + index/32  +  1/64 ),
 658 //      index ranges from 0 to 31
 659 //
 660 //   For k = -1, however, since |r| <= pi/4 = 0.78...
 661 //   possible values of B are
 662 //
 663 //        B = 2^(-1) * ( 1 + index/32  +  1/64 )
 664 //      index ranges from 0 to 19.
 665 //
 666 //
 667
 668 #include "libm_support.h"
 669
 670 #ifdef _LIBC
 671 .rodata
 672 #else
 673 .data
 674 #endif
 675
 676 .align 128
 677
 678 TAN_BASE_CONSTANTS:
 679 ASM_TYPE_DIRECTIVE(TAN_BASE_CONSTANTS,@object)
 680 data4    0x4B800000, 0xCB800000, 0x38800000, 0xB8800000 // two**24, -two**24
 681                                                         // two**-14, -two**-14
 682 data4    0x4E44152A, 0xA2F9836E, 0x00003FFE, 0x00000000 // two_by_pi
 683 data4    0xCE81B9F1, 0xC84D32B0, 0x00004016, 0x00000000 // P_0
 684 data4    0x2168C235, 0xC90FDAA2, 0x00003FFF, 0x00000000 // P_1
 685 data4    0xFC8F8CBB, 0xECE675D1, 0x0000BFBD, 0x00000000 // P_2
 686 data4    0xACC19C60, 0xB7ED8FBB, 0x0000BF7C, 0x00000000 // P_3
 687 data4    0x5F000000, 0xDF000000, 0x00000000, 0x00000000 // two_to_63, -two_to_63
 688 data4    0x6EC6B45A, 0xA397E504, 0x00003FE7, 0x00000000 // Inv_P_0
 689 data4    0xDBD171A1, 0x8D848E89, 0x0000BFBF, 0x00000000 // d_1
 690 data4    0x18A66F8E, 0xD5394C36, 0x0000BF7C, 0x00000000 // d_2
 691 data4    0x2168C234, 0xC90FDAA2, 0x00003FFE, 0x00000000 // PI_BY_4
 692 data4    0x2168C234, 0xC90FDAA2, 0x0000BFFE, 0x00000000 // MPI_BY_4
 693 data4    0x3E800000, 0xBE800000, 0x00000000, 0x00000000 // two**-2, -two**-2
 694 data4    0x2F000000, 0xAF000000, 0x00000000, 0x00000000 // two**-33, -two**-33
 695 data4    0xAAAAAABD, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P1_1
 696 data4    0x88882E6A, 0x88888888, 0x00003FFC, 0x00000000 // P1_2
 697 data4    0x0F0177B6, 0xDD0DD0DD, 0x00003FFA, 0x00000000 // P1_3
 698 data4    0x646B8C6D, 0xB327A440, 0x00003FF9, 0x00000000 // P1_4
 699 data4    0x1D5F7D20, 0x91371B25, 0x00003FF8, 0x00000000 // P1_5
 700 data4    0x61C67914, 0xEB69A5F1, 0x00003FF6, 0x00000000 // P1_6
 701 data4    0x019318D2, 0xBEDD37BE, 0x00003FF5, 0x00000000 // P1_7
 702 data4    0x3C794015, 0x9979B146, 0x00003FF4, 0x00000000 // P1_8
 703 data4    0x8C6EB58A, 0x8EBD21A3, 0x00003FF3, 0x00000000 // P1_9
 704 data4    0xAAAAAAB4, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // Q1_1
 705 data4    0x0B5FC93E, 0xB60B60B6, 0x00003FF9, 0x00000000 // Q1_2
 706 data4    0x0C9BBFBF, 0x8AB355E0, 0x00003FF6, 0x00000000 // Q1_3
 707 data4    0xCBEE3D4C, 0xDDEBBC89, 0x00003FF2, 0x00000000 // Q1_4
 708 data4    0x5F80BBB6, 0xB3548A68, 0x00003FEF, 0x00000000 // Q1_5
 709 data4    0x4CED5BF1, 0x91362560, 0x00003FEC, 0x00000000 // Q1_6
 710 data4    0x8EE92A83, 0xF189D95A, 0x00003FE8, 0x00000000 // Q1_7
 711 data4    0xAAAB362F, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P2_1
 712 data4    0xE97A6097, 0x88888886, 0x00003FFC, 0x00000000 // P2_2
 713 data4    0x25E716A1, 0xDD108EE0, 0x00003FFA, 0x00000000 // P2_3
 714 //
 715 //  Entries T_hi   double-precision memory format
 716 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 717 //  Entries T_lo  single-precision memory format
 718 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 719 //
 720 data4    0x62400794, 0x3FD09BC3, 0x23A05C32, 0x00000000
 721 data4    0xDFFBC074, 0x3FD124A9, 0x240078B2, 0x00000000
 722 data4    0x5BD4920F, 0x3FD1AE23, 0x23826B8E, 0x00000000
 723 data4    0x15E2701D, 0x3FD23835, 0x22D31154, 0x00000000
 724 data4    0x63739C2D, 0x3FD2C2E4, 0x2265C9E2, 0x00000000
 725 data4    0xAFEEA48B, 0x3FD34E36, 0x245C05EB, 0x00000000
 726 data4    0x7DBB35D1, 0x3FD3DA31, 0x24749F2D, 0x00000000
 727 data4    0x67321619, 0x3FD466DA, 0x2462CECE, 0x00000000
 728 data4    0x1F94A4D5, 0x3FD4F437, 0x246D0DF1, 0x00000000
 729 data4    0x740C3E6D, 0x3FD5824D, 0x240A85B5, 0x00000000
 730 data4    0x4CB1E73D, 0x3FD61123, 0x23F96E33, 0x00000000
 731 data4    0xAD9EA64B, 0x3FD6A0BE, 0x247C5393, 0x00000000
 732 data4    0xB804FD01, 0x3FD73125, 0x241F3B29, 0x00000000
 733 data4    0xAB53EE83, 0x3FD7C25E, 0x2479989B, 0x00000000
 734 data4    0xE6640EED, 0x3FD8546F, 0x23B343BC, 0x00000000
 735 data4    0xE8AF1892, 0x3FD8E75F, 0x241454D1, 0x00000000
 736 data4    0x53928BDA, 0x3FD97B35, 0x238613D9, 0x00000000
 737 data4    0xEB9DE4DE, 0x3FDA0FF6, 0x22859FA7, 0x00000000
 738 data4    0x99ECF92D, 0x3FDAA5AB, 0x237A6D06, 0x00000000
 739 data4    0x6D8F1796, 0x3FDB3C5A, 0x23952F6C, 0x00000000
 740 data4    0x9CFB8BE4, 0x3FDBD40A, 0x2280FC95, 0x00000000
 741 data4    0x87943100, 0x3FDC6CC3, 0x245D2EC0, 0x00000000
 742 data4    0xB736C500, 0x3FDD068C, 0x23C4AD7D, 0x00000000
 743 data4    0xE1DDBC31, 0x3FDDA16D, 0x23D076E6, 0x00000000
 744 data4    0xEB515A93, 0x3FDE3D6E, 0x244809A6, 0x00000000
 745 data4    0xE6E9E5F1, 0x3FDEDA97, 0x220856C8, 0x00000000
 746 data4    0x1963CE69, 0x3FDF78F1, 0x244BE993, 0x00000000
 747 data4    0x7D635BCE, 0x3FE00C41, 0x23D21799, 0x00000000
 748 data4    0x1C302CD3, 0x3FE05CAB, 0x248A1B1D, 0x00000000
 749 data4    0xDB6A1FA0, 0x3FE0ADB9, 0x23D53E33, 0x00000000
 750 data4    0x4A20BA81, 0x3FE0FF72, 0x24DB9ED5, 0x00000000
 751 data4    0x153FA6F5, 0x3FE151D9, 0x24E9E451, 0x00000000
 752 //
 753 //  Entries T_hi   double-precision memory format
 754 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 755 //  Entries T_lo  single-precision memory format
 756 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 757 //
 758 data4    0xBA1BE39E, 0x3FE1CEC4, 0x24B60F9E, 0x00000000
 759 data4    0x5ABD9B2D, 0x3FE277E4, 0x248C2474, 0x00000000
 760 data4    0x0272B110, 0x3FE32418, 0x247B8311, 0x00000000
 761 data4    0x890E2DF0, 0x3FE3D38B, 0x24C55751, 0x00000000
 762 data4    0x46236871, 0x3FE4866D, 0x24E5BC34, 0x00000000
 763 data4    0x45E044B0, 0x3FE53CEE, 0x24001BA4, 0x00000000
 764 data4    0x82EC06E4, 0x3FE5F742, 0x24B973DC, 0x00000000
 765 data4    0x25DF43F9, 0x3FE6B5A1, 0x24895440, 0x00000000
 766 data4    0xCAFD348C, 0x3FE77844, 0x240021CA, 0x00000000
 767 data4    0xCEED6B92, 0x3FE83F6B, 0x24C45372, 0x00000000
 768 data4    0xA34F3665, 0x3FE90B58, 0x240DAD33, 0x00000000
 769 data4    0x2C1E56B4, 0x3FE9DC52, 0x24F846CE, 0x00000000
 770 data4    0x27041578, 0x3FEAB2A4, 0x2323FB6E, 0x00000000
 771 data4    0x9DD8C373, 0x3FEB8E9F, 0x24B3090B, 0x00000000
 772 data4    0x65C9AA7B, 0x3FEC709B, 0x2449F611, 0x00000000
 773 data4    0xACCF8435, 0x3FED58F4, 0x23616A7E, 0x00000000
 774 data4    0x97635082, 0x3FEE480F, 0x24C2FEAE, 0x00000000
 775 data4    0xF0ACC544, 0x3FEF3E57, 0x242CE964, 0x00000000
 776 data4    0xF7E06E4B, 0x3FF01E20, 0x2480D3EE, 0x00000000
 777 data4    0x8A798A69, 0x3FF0A125, 0x24DB8967, 0x00000000
 778 //
 779 //  Entries C_hi   double-precision memory format
 780 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 781 //  Entries C_lo  single-precision memory format
 782 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 783 //
 784 data4    0xE63EFBD0, 0x400ED3E2, 0x259D94D4, 0x00000000
 785 data4    0xC515DAB5, 0x400DDDB4, 0x245F0537, 0x00000000
 786 data4    0xBE19A79F, 0x400CF57A, 0x25D4EA9F, 0x00000000
 787 data4    0xD15298ED, 0x400C1A06, 0x24AE40A0, 0x00000000
 788 data4    0x164B2708, 0x400B4A4C, 0x25A5AAB6, 0x00000000
 789 data4    0x5285B068, 0x400A855A, 0x25524F18, 0x00000000
 790 data4    0x3FFA549F, 0x4009CA5A, 0x24C999C0, 0x00000000
 791 data4    0x646AF623, 0x4009188A, 0x254FD801, 0x00000000
 792 data4    0x6084D0E7, 0x40086F3C, 0x2560F5FD, 0x00000000
 793 data4    0xA29A76EE, 0x4007CDD2, 0x255B9D19, 0x00000000
 794 data4    0x6C8ECA95, 0x400733BE, 0x25CB021B, 0x00000000
 795 data4    0x1F8DDC52, 0x4006A07E, 0x24AB4722, 0x00000000
 796 data4    0xC298AD58, 0x4006139B, 0x252764E2, 0x00000000
 797 data4    0xBAD7164B, 0x40058CAB, 0x24DAF5DB, 0x00000000
 798 data4    0xAE31A5D3, 0x40050B4B, 0x25EA20F4, 0x00000000
 799 data4    0x89F85A8A, 0x40048F21, 0x2583A3E8, 0x00000000
 800 data4    0xA862380D, 0x400417DA, 0x25DCC4CC, 0x00000000
 801 data4    0x1088FCFE, 0x4003A52B, 0x2430A492, 0x00000000
 802 data4    0xCD3527D5, 0x400336CC, 0x255F77CF, 0x00000000
 803 data4    0x5760766D, 0x4002CC7F, 0x25DA0BDA, 0x00000000
 804 data4    0x11CE02E3, 0x40026607, 0x256FF4A2, 0x00000000
 805 data4    0xD37BBE04, 0x4002032C, 0x25208AED, 0x00000000
 806 data4    0x7F050775, 0x4001A3BD, 0x24B72DD6, 0x00000000
 807 data4    0xA554848A, 0x40014789, 0x24AB4DAA, 0x00000000
 808 data4    0x323E81B7, 0x4000EE65, 0x2584C440, 0x00000000
 809 data4    0x21CF1293, 0x40009827, 0x25C9428D, 0x00000000
 810 data4    0x3D415EEB, 0x400044A9, 0x25DC8482, 0x00000000
 811 data4    0xBD72C577, 0x3FFFE78F, 0x257F5070, 0x00000000
 812 data4    0x75EFD28E, 0x3FFF4AC3, 0x23EBBF7A, 0x00000000
 813 data4    0x60B52DDE, 0x3FFEB2AF, 0x22EECA07, 0x00000000
 814 data4    0x35204180, 0x3FFE1F19, 0x24191079, 0x00000000
 815 data4    0x54F7E60A, 0x3FFD8FCA, 0x248D3058, 0x00000000
 816 //
 817 //  Entries C_hi   double-precision memory format
 818 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 819 //  Entries C_lo  single-precision memory format
 820 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 821 //
 822 data4    0x79F6FADE, 0x3FFCC06A, 0x239C7886, 0x00000000
 823 data4    0x891662A6, 0x3FFBB91F, 0x250BD191, 0x00000000
 824 data4    0x529F155D, 0x3FFABFB6, 0x256CC3E6, 0x00000000
 825 data4    0x2E964AE9, 0x3FF9D300, 0x250843E3, 0x00000000
 826 data4    0x89DCB383, 0x3FF8F1EF, 0x2277C87E, 0x00000000
 827 data4    0x7C87DBD6, 0x3FF81B93, 0x256DA6CF, 0x00000000
 828 data4    0x1042EDE4, 0x3FF74F14, 0x2573D28A, 0x00000000
 829 data4    0x1784B360, 0x3FF68BAF, 0x242E489A, 0x00000000
 830 data4    0x7C923C4C, 0x3FF5D0B5, 0x2532D940, 0x00000000
 831 data4    0xF418EF20, 0x3FF51D88, 0x253C7DD6, 0x00000000
 832 data4    0x02F88DAE, 0x3FF4719A, 0x23DB59BF, 0x00000000
 833 data4    0x49DA0788, 0x3FF3CC66, 0x252B4756, 0x00000000
 834 data4    0x0B980DB8, 0x3FF32D77, 0x23FE585F, 0x00000000
 835 data4    0xE56C987A, 0x3FF2945F, 0x25378A63, 0x00000000
 836 data4    0xB16523F6, 0x3FF200BD, 0x247BB2E0, 0x00000000
 837 data4    0x8CE27778, 0x3FF17235, 0x24446538, 0x00000000
 838 data4    0xFDEFE692, 0x3FF0E873, 0x2514638F, 0x00000000
 839 data4    0x33154062, 0x3FF0632C, 0x24A7FC27, 0x00000000
 840 data4    0xB3EF115F, 0x3FEFC42E, 0x248FD0FE, 0x00000000
 841 data4    0x135D26F6, 0x3FEEC9E8, 0x2385C719, 0x00000000
 842 //
 843 //  Entries SC_inv in Swapped IEEE format (extended)
 844 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 845 //
 846 data4    0x1BF30C9E, 0x839D6D4A, 0x00004001, 0x00000000
 847 data4    0x554B0EB0, 0x80092804, 0x00004001, 0x00000000
 848 data4    0xA1CF0DE9, 0xF959F94C, 0x00004000, 0x00000000
 849 data4    0x77378677, 0xF3086BA0, 0x00004000, 0x00000000
 850 data4    0xCCD4723C, 0xED154515, 0x00004000, 0x00000000
 851 data4    0x1C27CF25, 0xE7790944, 0x00004000, 0x00000000
 852 data4    0x8DDACB88, 0xE22D037D, 0x00004000, 0x00000000
 853 data4    0x89C73522, 0xDD2B2D8A, 0x00004000, 0x00000000
 854 data4    0xBB2C1171, 0xD86E1A23, 0x00004000, 0x00000000
 855 data4    0xDFF5E0F9, 0xD3F0E288, 0x00004000, 0x00000000
 856 data4    0x283BEBD5, 0xCFAF16B1, 0x00004000, 0x00000000
 857 data4    0x0D88DD53, 0xCBA4AFAA, 0x00004000, 0x00000000
 858 data4    0xCA67C43D, 0xC7CE03CC, 0x00004000, 0x00000000
 859 data4    0x0CA0DDB0, 0xC427BC82, 0x00004000, 0x00000000
 860 data4    0xF13D8CAB, 0xC0AECD57, 0x00004000, 0x00000000
 861 data4    0x71ECE6B1, 0xBD606C38, 0x00004000, 0x00000000
 862 data4    0xA44C4929, 0xBA3A0A96, 0x00004000, 0x00000000
 863 data4    0xE5CCCEC1, 0xB7394F6F, 0x00004000, 0x00000000
 864 data4    0x9637D8BC, 0xB45C1203, 0x00004000, 0x00000000
 865 data4    0x92CB051B, 0xB1A05528, 0x00004000, 0x00000000
 866 data4    0x6BA2FFD0, 0xAF04432B, 0x00004000, 0x00000000
 867 data4    0x7221235F, 0xAC862A23, 0x00004000, 0x00000000
 868 data4    0x5F00A9D1, 0xAA2478AF, 0x00004000, 0x00000000
 869 data4    0x81E082BF, 0xA7DDBB0C, 0x00004000, 0x00000000
 870 data4    0x45684FEE, 0xA5B0987D, 0x00004000, 0x00000000
 871 data4    0x627A8F53, 0xA39BD0F5, 0x00004000, 0x00000000
 872 data4    0x6EC5C8B0, 0xA19E3B03, 0x00004000, 0x00000000
 873 data4    0x91CD7C66, 0x9FB6C1F0, 0x00004000, 0x00000000
 874 data4    0x1FA3DF8A, 0x9DE46410, 0x00004000, 0x00000000
 875 data4    0xA8F6B888, 0x9C263139, 0x00004000, 0x00000000
 876 data4    0xC27B0450, 0x9A7B4968, 0x00004000, 0x00000000
 877 data4    0x5EE614EE, 0x98E2DB7E, 0x00004000, 0x00000000
 878 //
 879 //  Entries SC_inv in Swapped IEEE format (extended)
 880 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 881 //
 882 data4    0x13B2B5BA, 0x969F335C, 0x00004000, 0x00000000
 883 data4    0xD4C0F548, 0x93D446D9, 0x00004000, 0x00000000
 884 data4    0x61B798AF, 0x9147094F, 0x00004000, 0x00000000
 885 data4    0x758787AC, 0x8EF317CC, 0x00004000, 0x00000000
 886 data4    0xB99EEFDB, 0x8CD498B3, 0x00004000, 0x00000000
 887 data4    0xDFF8BC37, 0x8AE82A7D, 0x00004000, 0x00000000
 888 data4    0xE3C55D42, 0x892AD546, 0x00004000, 0x00000000
 889 data4    0xD15573C1, 0x8799FEA9, 0x00004000, 0x00000000
 890 data4    0x435A4B4C, 0x86335F88, 0x00004000, 0x00000000
 891 data4    0x3E93A87B, 0x84F4FB6E, 0x00004000, 0x00000000
 892 data4    0x80A382FB, 0x83DD1952, 0x00004000, 0x00000000
 893 data4    0xA4CB8C9E, 0x82EA3D7F, 0x00004000, 0x00000000
 894 data4    0x6861D0A8, 0x821B247C, 0x00004000, 0x00000000
 895 data4    0x63E8D244, 0x816EBED1, 0x00004000, 0x00000000
 896 data4    0x27E4CFC6, 0x80E42D91, 0x00004000, 0x00000000
 897 data4    0x28E64AFD, 0x807ABF8D, 0x00004000, 0x00000000
 898 data4    0x863B4FD8, 0x8031EF26, 0x00004000, 0x00000000
 899 data4    0xAE8C11FD, 0x800960AD, 0x00004000, 0x00000000
 900 data4    0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000
 901 data4    0xA07791FA, 0x80186650, 0x00004000, 0x00000000
 902
 903 Arg                 = f8
 904 Result              = f8
 905 fp_tmp              = f9
 906 U_2                 = f10
 907 rsq                =  f11
 908 C_hi                = f12
 909 C_lo                = f13
 910 T_hi                = f14
 911 T_lo                = f15
 912
 913 N_0                 = f32
 914 d_1                 = f33
 915 MPI_BY_4            = f34
 916 tail                = f35
 917 tanx                = f36
 918 Cx                  = f37
 919 Sx                  = f38
 920 sgn_r               = f39
 921 CORR                = f40
 922 P                   = f41
 923 D                   = f42
 924 ArgPrime            = f43
 925 P_0                 = f44
 926
 927 P2_1                = f45
 928 P2_2                = f46
 929 P2_3                = f47
 930
 931 P1_1                = f45
 932 P1_2                = f46
 933 P1_3                = f47
 934
 935 P1_4                = f48
 936 P1_5                = f49
 937 P1_6                = f50
 938 P1_7                = f51
 939 P1_8                = f52
 940 P1_9                = f53
 941
 942 TWO_TO_63           = f54
 943 NEGTWO_TO_63        = f55
 944 x                   = f56
 945 xsq                 = f57
 946 Tx                  = f58
 947 Tx1                 = f59
 948 Set                 = f60
 949 poly1               = f61
 950 poly2               = f62
 951 Poly                = f63
 952 Poly1               = f64
 953 Poly2               = f65
 954 r_to_the_8          = f66
 955 B                   = f67
 956 SC_inv              = f68
 957 Pos_r               = f69
 958 N_0_fix             = f70
 959 PI_BY_4             = f71
 960 NEGTWO_TO_NEG2      = f72
 961 TWO_TO_24           = f73
 962 TWO_TO_NEG14        = f74
 963 TWO_TO_NEG33        = f75
 964 NEGTWO_TO_24        = f76
 965 NEGTWO_TO_NEG14     = f76
 966 NEGTWO_TO_NEG33     = f77
 967 two_by_PI           = f78
 968 N                   = f79
 969 N_fix               = f80
 970 P_1                 = f81
 971 P_2                 = f82
 972 P_3                 = f83
 973 s_val               = f84
 974 w                   = f85
 975 c                   = f86
 976 r                   = f87
 977 Z                   = f88
 978 A                   = f89
 979 a                   = f90
 980 t                   = f91
 981 U_1                 = f92
 982 d_2                 = f93
 983 TWO_TO_NEG2         = f94
 984 Q1_1                = f95
 985 Q1_2                = f96
 986 Q1_3                = f97
 987 Q1_4                = f98
 988 Q1_5                = f99
 989 Q1_6                = f100
 990 Q1_7                = f101
 991 Q1_8                = f102
 992 S_hi                = f103
 993 S_lo                = f104
 994 V_hi                = f105
 995 V_lo                = f106
 996 U_hi                = f107
 997 U_lo                = f108
 998 U_hiabs             = f109
 999 V_hiabs             = f110
1000 V                   = f111
1001 Inv_P_0             = f112
1002
1003 GR_SAVE_B0     = r33
1004 GR_SAVE_GP     = r34
1005 GR_SAVE_PFS    = r35
1006
1007 delta1         = r36
1008 table_ptr1     = r37
1009 table_ptr2     = r38
1010 i_0            = r39
1011 i_1            = r40
1012 N_fix_gr       = r41
1013 N_inc          = r42
1014 exp_Arg        = r43
1015 exp_r          = r44
1016 sig_r          = r45
1017 lookup         = r46
1018 table_offset   = r47
1019 Create_B       = r48
1020 gr_tmp         = r49
1021
1022 GR_Parameter_X = r49
1023 GR_Parameter_r = r50
1024
1025
1026
1027 .global __libm_tan
1028 .section .text
1029 .proc __libm_tan
1030
1031
1032 __libm_tan:
1033
1034 { .mfi
1035 alloc r32 = ar.pfs, 0,17,2,0
1036 (p0)   fclass.m.unc  p6,p0 = Arg, 0x1E7
1037       addl gr_tmp = -1,r0
1038 }
1039 ;;
1040
1041 { .mfi
1042        nop.m 999
1043 (p0)   fclass.nm.unc  p7,p0 = Arg, 0x1FF
1044        nop.i 999
1045 }
1046 ;;
1047
1048 { .mfi
1049 (p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
1050        nop.f 999
1051        nop.i 999
1052 }
1053 ;;
1054
1055 { .mmi
1056       ld8 table_ptr1 = [table_ptr1]
1057       setf.sig fp_tmp = gr_tmp   // Make a constant so fmpy produces inexact
1058       nop.i 999
1059 }
1060 ;;
1061
1062 //
1063 //     Check for NatVals, Infs , NaNs, and Zeros
1064 //     Check for everything - if false, then must be pseudo-zero
1065 //     or pseudo-nan.
1066 //     Local table pointer
1067 //
1068
1069 { .mbb
1070 (p0)   add table_ptr2 = 96, table_ptr1
1071 (p6)   br.cond.spnt __libm_TAN_SPECIAL
1072 (p7)   br.cond.spnt __libm_TAN_SPECIAL ;;
1073 }
1074 //
1075 //     Point to Inv_P_0
1076 //     Branch out to deal with unsupporteds and special values.
1077 //
1078
1079 { .mmf
1080 (p0)   ldfs TWO_TO_24 = [table_ptr1],4
1081 (p0)   ldfs TWO_TO_63 = [table_ptr2],4
1082 //
1083 //     Load -2**24, load -2**63.
1084 //
1085 (p0)   fcmp.eq.s0 p0, p6 = Arg, f1 ;;
1086 }
1087
1088 { .mfi
1089 (p0)   ldfs NEGTWO_TO_63 = [table_ptr2],12
1090 (p0)   fnorm.s1     Arg = Arg
1091         nop.i 999
1092 }
1093 //
1094 //     Load 2**24, Load 2**63.
1095 //
1096
1097 { .mmi
1098 (p0)   ldfs NEGTWO_TO_24 = [table_ptr1],12 ;;
1099 //
1100 //     Do fcmp to generate Denormal exception
1101 //     - can't do FNORM (will generate Underflow when U is unmasked!)
1102 //     Normalize input argument.
1103 //
1104 (p0)   ldfe two_by_PI = [table_ptr1],16
1105         nop.i 999
1106 }
1107
1108 { .mmi
1109 (p0)   ldfe Inv_P_0 = [table_ptr2],16 ;;
1110 (p0)   ldfe d_1 = [table_ptr2],16
1111         nop.i 999
1112 }
1113 //
1114 //     Decide about the paths to take:
1115 //     PR_1 and PR_3 set if -2**24 < Arg < 2**24 - CASE 1 OR 2
1116 //     OTHERWISE - CASE 3 OR 4
1117 //     Load inverse of P_0 .
1118 //     Set PR_6 if Arg <= -2**63
1119 //     Are there any Infs, NaNs, or zeros?
1120 //
1121
1122 { .mmi
1123 (p0)   ldfe P_0 = [table_ptr1],16 ;;
1124 (p0)   ldfe d_2 = [table_ptr2],16
1125         nop.i 999
1126 }
1127 //
1128 //     Set PR_8 if Arg <= -2**24
1129 //     Set PR_6 if Arg >=  2**63
1130 //
1131
1132 { .mmi
1133 (p0)   ldfe P_1 = [table_ptr1],16 ;;
1134 (p0)   ldfe PI_BY_4 = [table_ptr2],16
1135         nop.i 999
1136 }
1137 //
1138 //     Set PR_8 if Arg >= 2**24
1139 //
1140
1141 { .mmi
1142 (p0)   ldfe P_2 = [table_ptr1],16 ;;
1143 (p0)   ldfe   MPI_BY_4 = [table_ptr2],16
1144         nop.i 999
1145 }
1146 //
1147 //     Load  P_2 and PI_BY_4
1148 //
1149
1150 { .mfi
1151 (p0)   ldfe   P_3 = [table_ptr1],16
1152         nop.f 999
1153         nop.i 999 ;;
1154 }
1155
1156 { .mfi
1157         nop.m 999
1158 (p0)   fcmp.le.unc.s1 p6,p7 = Arg,NEGTWO_TO_63
1159         nop.i 999
1160 }
1161
1162 { .mfi
1163         nop.m 999
1164 (p0)   fcmp.le.unc.s1 p8,p9 = Arg,NEGTWO_TO_24
1165         nop.i 999 ;;
1166 }
1167
1168 { .mfi
1169         nop.m 999
1170 (p7)   fcmp.ge.s1 p6,p0 = Arg,TWO_TO_63
1171         nop.i 999
1172 }
1173
1174 { .mfi
1175         nop.m 999
1176 (p9)   fcmp.ge.s1 p8,p0 = Arg,TWO_TO_24
1177         nop.i 999 ;;
1178 }
1179
1180 { .mib
1181         nop.m 999
1182         nop.i 999
1183 //
1184 //     Load  P_3 and -PI_BY_4
1185 //
1186 (p6)   br.cond.spnt TAN_ARG_TOO_LARGE ;;
1187 }
1188
1189 { .mib
1190         nop.m 999
1191         nop.i 999
1192 //
1193 //     Load 2**(-2).
1194 //     Load -2**(-2).
1195 //     Branch out if we have a special argument.
1196 //     Branch out if the magnitude of the input argument is too large
1197 //     - do this branch before the next.
1198 //
1199 (p8)   br.cond.spnt TAN_LARGER_ARG ;;
1200 }
1201 //
1202 //     Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
1203 //
1204
1205 { .mfi
1206 (p0)   ldfs TWO_TO_NEG2 = [table_ptr2],4
1207 //     ARGUMENT REDUCTION CODE - CASE 1 and 2
1208 //     Load 2**(-2).
1209 //     Load -2**(-2).
1210 (p0)   fmpy.s1 N = Arg,two_by_PI
1211         nop.i 999 ;;
1212 }
1213
1214 { .mfi
1215 (p0)   ldfs NEGTWO_TO_NEG2 = [table_ptr2],12
1216 //
1217 //     N = Arg * 2/pi
1218 //
1219 (p0)   fcmp.lt.unc.s1 p8,p9= Arg,PI_BY_4
1220         nop.i 999 ;;
1221 }
1222
1223 { .mfi
1224         nop.m 999
1225 //
1226 //     if Arg < pi/4,  set PR_8.
1227 //
1228 (p8)   fcmp.gt.s1 p8,p9= Arg,MPI_BY_4
1229         nop.i 999 ;;
1230 }
1231 //
1232 //     Case 1: Is |r| < 2**(-2).
1233 //     Arg is the same as r in this case.
1234 //     r = Arg
1235 //     c = 0
1236 //
1237
1238 { .mfi
1239 (p8)   mov N_fix_gr = r0
1240 //
1241 //     if Arg > -pi/4, reset PR_8.
1242 //     Select the case when |Arg| < pi/4 - set PR[8] = true.
1243 //     Else Select the case when |Arg| >= pi/4 - set PR[9] = true.
1244 //
1245 (p0)   fcvt.fx.s1 N_fix = N
1246         nop.i 999 ;;
1247 }
1248
1249 { .mfi
1250         nop.m 999
1251 //
1252 //     Grab the integer part of N .
1253 //
1254 (p8)   mov r = Arg
1255         nop.i 999
1256 }
1257
1258 { .mfi
1259         nop.m 999
1260 (p8)   mov c = f0
1261         nop.i 999 ;;
1262 }
1263
1264 { .mfi
1265         nop.m 999
1266 (p8)   fcmp.lt.unc.s1 p10, p11 = Arg, TWO_TO_NEG2
1267         nop.i 999 ;;
1268 }
1269
1270 { .mfi
1271         nop.m 999
1272 (p10)  fcmp.gt.s1 p10,p0 = Arg, NEGTWO_TO_NEG2
1273         nop.i 999 ;;
1274 }
1275
1276 { .mfi
1277         nop.m 999
1278 //
1279 //     Case 2: Place integer part of N in GP register.
1280 //
1281 (p9)   fcvt.xf N = N_fix
1282         nop.i 999 ;;
1283 }
1284
1285 { .mib
1286 (p9)   getf.sig N_fix_gr = N_fix
1287         nop.i 999
1288 //
1289 //     Case 2: Convert integer N_fix back to normalized floating-point value.
1290 //
1291 (p10)  br.cond.spnt TAN_SMALL_R ;;
1292 }
1293
1294 { .mib
1295         nop.m 999
1296         nop.i 999
1297 (p8)   br.cond.sptk TAN_NORMAL_R ;;
1298 }
1299 //
1300 //     Case 1: PR_3 is only affected  when PR_1 is set.
1301 //
1302
1303 { .mmi
1304 (p9)   ldfs TWO_TO_NEG33 = [table_ptr2], 4 ;;
1305 //
1306 //     Case 2: Load 2**(-33).
1307 //
1308 (p9)   ldfs NEGTWO_TO_NEG33 = [table_ptr2], 4
1309         nop.i 999 ;;
1310 }
1311
1312 { .mfi
1313         nop.m 999
1314 //
1315 //     Case 2: Load -2**(-33).
1316 //
1317 (p9)   fnma.s1 s_val = N, P_1, Arg
1318         nop.i 999
1319 }
1320
1321 { .mfi
1322         nop.m 999
1323 (p9)   fmpy.s1 w = N, P_2
1324         nop.i 999 ;;
1325 }
1326
1327 { .mfi
1328         nop.m 999
1329 //
1330 //     Case 2: w = N * P_2
1331 //     Case 2: s_val = -N * P_1  + Arg
1332 //
1333 (p0)   fcmp.lt.unc.s1 p9,p8 = s_val, TWO_TO_NEG33
1334         nop.i 999 ;;
1335 }
1336
1337 { .mfi
1338         nop.m 999
1339 //
1340 //     Decide between case_1 and case_2 reduce:
1341 //
1342 (p9)   fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
1343         nop.i 999 ;;
1344 }
1345
1346 { .mfi
1347         nop.m 999
1348 //
1349 //     Case 1_reduce:  s <= -2**(-33) or s >= 2**(-33)
1350 //     Case 2_reduce: -2**(-33) < s < 2**(-33)
1351 //
1352 (p8)   fsub.s1 r = s_val, w
1353         nop.i 999
1354 }
1355
1356 { .mfi
1357         nop.m 999
1358 (p9)   fmpy.s1 w = N, P_3
1359         nop.i 999 ;;
1360 }
1361
1362 { .mfi
1363         nop.m 999
1364 (p9)   fma.s1  U_1 = N, P_2, w
1365         nop.i 999
1366 }
1367
1368 { .mfi
1369         nop.m 999
1370 //
1371 //     Case 1_reduce: Is |r| < 2**(-2), if so set PR_10
1372 //     else set PR_11.
1373 //
1374 (p8)   fsub.s1 c = s_val, r
1375         nop.i 999 ;;
1376 }
1377
1378 { .mfi
1379         nop.m 999
1380 //
1381 //     Case 1_reduce: r = s + w (change sign)
1382 //     Case 2_reduce: w = N * P_3 (change sign)
1383 //
1384 (p8)   fcmp.lt.unc.s1 p10, p11 = r, TWO_TO_NEG2
1385         nop.i 999 ;;
1386 }
1387
1388 { .mfi
1389         nop.m 999
1390 (p10)  fcmp.gt.s1 p10, p11 = r, NEGTWO_TO_NEG2
1391         nop.i 999 ;;
1392 }
1393
1394 { .mfi
1395         nop.m 999
1396 (p9)   fsub.s1 r = s_val, U_1
1397         nop.i 999
1398 }
1399
1400 { .mfi
1401         nop.m 999
1402 //
1403 //     Case 1_reduce: c is complete here.
1404 //     c = c + w (w has not been negated.)
1405 //     Case 2_reduce: r is complete here - continue to calculate c .
1406 //     r = s - U_1
1407 //
1408 (p9)   fms.s1 U_2 = N, P_2, U_1
1409         nop.i 999 ;;
1410 }
1411
1412 { .mfi
1413         nop.m 999
1414 //
1415 //     Case 1_reduce: c = s - r
1416 //     Case 2_reduce: U_1 = N * P_2 + w
1417 //
1418 (p8)   fsub.s1 c = c, w
1419         nop.i 999 ;;
1420 }
1421
1422 { .mfi
1423         nop.m 999
1424 (p9)   fsub.s1 s_val = s_val, r
1425         nop.i 999
1426 }
1427
1428 { .mfb
1429         nop.m 999
1430 //
1431 //     Case 2_reduce:
1432 //     U_2 = N * P_2 - U_1
1433 //     Not needed until later.
1434 //
1435 (p9)   fadd.s1 U_2 = U_2, w
1436 //
1437 //     Case 2_reduce:
1438 //     s = s - r
1439 //     U_2 = U_2 + w
1440 //
1441 (p10)  br.cond.spnt TAN_SMALL_R ;;
1442 }
1443
1444 { .mib
1445         nop.m 999
1446         nop.i 999
1447 (p11)  br.cond.sptk TAN_NORMAL_R ;;
1448 }
1449
1450 { .mii
1451         nop.m 999
1452 //
1453 //     Case 2_reduce:
1454 //     c = c - U_2
1455 //     c is complete here
1456 //     Argument reduction ends here.
1457 //
1458 (p9)   extr.u i_1 = N_fix_gr, 0, 1 ;;
1459 (p9)   cmp.eq.unc p11, p12 = 0x0000,i_1 ;;
1460 }
1461
1462 { .mfi
1463         nop.m 999
1464 //
1465 //     Is i_1  even or odd?
1466 //     if i_1 == 0, set p11, else set p12.
1467 //
1468 (p11)  fmpy.s1 rsq = r, Z
1469         nop.i 999 ;;
1470 }
1471
1472 { .mfi
1473         nop.m 999
1474 (p12)  frcpa.s1 S_hi,p0 = f1, r
1475         nop.i 999
1476 }
1477
1478 //
1479 //     Case 1: Branch to SMALL_R or NORMAL_R.
1480 //     Case 1 is done now.
1481 //
1482
1483 { .mfi
1484 (p9)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
1485 (p9)   fsub.s1 c = s_val, U_1
1486         nop.i 999 ;;
1487 }
1488 ;;
1489
1490 { .mmi
1491 (p9)  ld8 table_ptr1 = [table_ptr1]
1492       nop.m 999
1493       nop.i 999
1494 }
1495 ;;
1496
1497 { .mmi
1498 (p9)   add table_ptr1 = 224, table_ptr1 ;;
1499 (p9)   ldfe P1_1 = [table_ptr1],144
1500         nop.i 999 ;;
1501 }
1502 //
1503 //     Get [i_1] -  lsb of N_fix_gr .
1504 //     Load P1_1 and point to Q1_1 .
1505 //
1506
1507 { .mfi
1508 (p9)   ldfe Q1_1 = [table_ptr1] , 0
1509 //
1510 //     N even: rsq = r * Z
1511 //     N odd:  S_hi = frcpa(r)
1512 //
1513 (p12)  fmerge.ns S_hi = S_hi, S_hi
1514         nop.i 999
1515 }
1516
1517 { .mfi
1518         nop.m 999
1519 //
1520 //     Case 2_reduce:
1521 //     c = s - U_1
1522 //
1523 (p9)   fsub.s1 c = c, U_2
1524         nop.i 999 ;;
1525 }
1526
1527 { .mfi
1528         nop.m 999
1529 (p12)  fma.s1  poly1 = S_hi, r, f1
1530         nop.i 999 ;;
1531 }
1532
1533 { .mfi
1534         nop.m 999
1535 //
1536 //     N odd:  Change sign of S_hi
1537 //
1538 (p11)  fmpy.s1 rsq = rsq, P1_1
1539         nop.i 999 ;;
1540 }
1541
1542 { .mfi
1543         nop.m 999
1544 (p12)  fma.s1 S_hi = S_hi, poly1, S_hi
1545         nop.i 999 ;;
1546 }
1547
1548 { .mfi
1549         nop.m 999
1550 //
1551 //     N even: rsq = rsq * P1_1
1552 //     N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
1553 //
1554 (p11)  fma.s1 Result = r, rsq, c
1555         nop.i 999 ;;
1556 }
1557
1558 { .mfi
1559         nop.m 999
1560 //
1561 //     N even: Result = c  + r * rsq
1562 //     N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
1563 //
1564 (p12)  fma.s1 poly1 = S_hi, r, f1
1565         nop.i 999 ;;
1566 }
1567
1568 { .mfi
1569         nop.m 999
1570 //
1571 //     N even: Result = Result + r
1572 //     N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
1573 //
1574 (p11)  fadd.s0 Result = r, Result
1575         nop.i 999 ;;
1576 }
1577
1578 { .mfi
1579         nop.m 999
1580 (p12)  fma.s1  S_hi = S_hi, poly1, S_hi
1581         nop.i 999 ;;
1582 }
1583
1584 { .mfi
1585         nop.m 999
1586 //
1587 //     N even: Result1 = Result + r
1588 //     N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
1589 //
1590 (p12)  fma.s1 poly1 = S_hi, r, f1
1591         nop.i 999 ;;
1592 }
1593
1594 { .mfi
1595         nop.m 999
1596 //
1597 //     N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
1598 //
1599 (p12)  fma.s1 S_hi = S_hi, poly1, S_hi
1600         nop.i 999 ;;
1601 }
1602
1603 { .mfi
1604         nop.m 999
1605 //
1606 //     N odd:  poly1  =  S_hi * poly + 1.0    64 bits
1607 //
1608 (p12)  fma.s1 poly1 = S_hi, r, f1
1609         nop.i 999 ;;
1610 }
1611
1612 { .mfi
1613         nop.m 999
1614 //
1615 //     N odd:  poly1  =  S_hi * r + 1.0
1616 //
1617 (p12)  fma.s1 poly1 = S_hi, c, poly1
1618         nop.i 999 ;;
1619 }
1620
1621 { .mfi
1622         nop.m 999
1623 //
1624 //     N odd:  poly1  =  S_hi * c + poly1
1625 //
1626 (p12)  fmpy.s1 S_lo = S_hi, poly1
1627         nop.i 999 ;;
1628 }
1629
1630 { .mfi
1631         nop.m 999
1632 //
1633 //     N odd:  S_lo  =  S_hi *  poly1
1634 //
1635 (p12)  fma.s1 S_lo = Q1_1, r, S_lo
1636         nop.i 999
1637 }
1638
1639 { .mfi
1640         nop.m 999
1641 //
1642 //     N odd:  Result =  S_hi + S_lo
1643 //
1644 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
1645         nop.i 999 ;;
1646 }
1647
1648 { .mfb
1649         nop.m 999
1650 //
1651 //     N odd:  S_lo  =  S_lo + Q1_1 * r
1652 //
1653 (p12)  fadd.s0 Result = S_hi, S_lo
1654 (p0)   br.ret.sptk b0 ;;
1655 }
1656
1657
1658 TAN_LARGER_ARG:
1659
1660 { .mmf
1661 (p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
1662       nop.m 999
1663 (p0)  fmpy.s1 N_0 = Arg, Inv_P_0
1664 }
1665 ;;
1666
1667 //
1668 // ARGUMENT REDUCTION CODE - CASE 3 and 4
1669 //
1670 //
1671 //    Adjust table_ptr1 to beginning of table.
1672 //    N_0 = Arg * Inv_P_0
1673 //
1674
1675
1676 { .mmi
1677 (p0)  ld8 table_ptr1 = [table_ptr1]
1678       nop.m 999
1679       nop.i 999
1680 }
1681 ;;
1682
1683
1684 { .mmi
1685 (p0)  add table_ptr1 = 8, table_ptr1 ;;
1686 //
1687 //    Point to  2*-14
1688 //
1689 (p0)  ldfs TWO_TO_NEG14 = [table_ptr1], 4
1690         nop.i 999 ;;
1691 }
1692 //
1693 //    Load 2**(-14).
1694 //
1695
1696 { .mmi
1697 (p0)  ldfs NEGTWO_TO_NEG14 = [table_ptr1], 180 ;;
1698 //
1699 //    N_0_fix  = integer part of N_0 .
1700 //    Adjust table_ptr1 to beginning of table.
1701 //
1702 (p0)  ldfs TWO_TO_NEG2 = [table_ptr1], 4
1703         nop.i 999 ;;
1704 }
1705 //
1706 //    Make N_0 the integer part.
1707 //
1708
1709 { .mfi
1710 (p0)  ldfs NEGTWO_TO_NEG2 = [table_ptr1]
1711 //
1712 //    Load -2**(-14).
1713 //
1714 (p0)  fcvt.fx.s1 N_0_fix = N_0
1715         nop.i 999 ;;
1716 }
1717
1718 { .mfi
1719         nop.m 999
1720 (p0)  fcvt.xf N_0 = N_0_fix
1721         nop.i 999 ;;
1722 }
1723
1724 { .mfi
1725         nop.m 999
1726 (p0)  fnma.s1 ArgPrime = N_0, P_0, Arg
1727         nop.i 999
1728 }
1729
1730 { .mfi
1731         nop.m 999
1732 (p0)  fmpy.s1 w = N_0, d_1
1733         nop.i 999 ;;
1734 }
1735
1736 { .mfi
1737         nop.m 999
1738 //
1739 //    ArgPrime = -N_0 * P_0 + Arg
1740 //    w  = N_0 * d_1
1741 //
1742 (p0)  fmpy.s1 N = ArgPrime, two_by_PI
1743         nop.i 999 ;;
1744 }
1745
1746 { .mfi
1747         nop.m 999
1748 //
1749 //    N = ArgPrime * 2/pi
1750 //
1751 (p0)  fcvt.fx.s1 N_fix = N
1752         nop.i 999 ;;
1753 }
1754
1755 { .mfi
1756         nop.m 999
1757 //
1758 //    N_fix is the integer part.
1759 //
1760 (p0)  fcvt.xf N = N_fix
1761         nop.i 999 ;;
1762 }
1763
1764 { .mfi
1765 (p0)  getf.sig N_fix_gr = N_fix
1766         nop.f 999
1767         nop.i 999 ;;
1768 }
1769
1770 { .mfi
1771         nop.m 999
1772 //
1773 //    N is the integer part of the reduced-reduced argument.
1774 //    Put the integer in a GP register.
1775 //
1776 (p0)  fnma.s1 s_val = N, P_1, ArgPrime
1777         nop.i 999
1778 }
1779
1780 { .mfi
1781         nop.m 999
1782 (p0)  fnma.s1 w = N, P_2, w
1783         nop.i 999 ;;
1784 }
1785
1786 { .mfi
1787         nop.m 999
1788 //
1789 //    s_val = -N*P_1 + ArgPrime
1790 //    w = -N*P_2 + w
1791 //
1792 (p0)  fcmp.lt.unc.s1 p11, p10 = s_val, TWO_TO_NEG14
1793         nop.i 999 ;;
1794 }
1795
1796 { .mfi
1797         nop.m 999
1798 (p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
1799         nop.i 999 ;;
1800 }
1801
1802 { .mfi
1803         nop.m 999
1804 //
1805 //    Case 3: r = s_val + w (Z complete)
1806 //    Case 4: U_hi = N_0 * d_1
1807 //
1808 (p10) fmpy.s1 V_hi = N, P_2
1809         nop.i 999
1810 }
1811
1812 { .mfi
1813         nop.m 999
1814 (p11) fmpy.s1 U_hi = N_0, d_1
1815         nop.i 999 ;;
1816 }
1817
1818 { .mfi
1819         nop.m 999
1820 //
1821 //    Case 3: r = s_val + w (Z complete)
1822 //    Case 4: U_hi = N_0 * d_1
1823 //
1824 (p11) fmpy.s1 V_hi = N, P_2
1825         nop.i 999
1826 }
1827
1828 { .mfi
1829         nop.m 999
1830 (p11) fmpy.s1 U_hi = N_0, d_1
1831         nop.i 999 ;;
1832 }
1833
1834 { .mfi
1835         nop.m 999
1836 //
1837 //    Decide between case 3 and 4:
1838 //    Case 3:  s <= -2**(-14) or s >= 2**(-14)
1839 //    Case 4: -2**(-14) < s < 2**(-14)
1840 //
1841 (p10) fadd.s1 r = s_val, w
1842         nop.i 999
1843 }
1844
1845 { .mfi
1846         nop.m 999
1847 (p11) fmpy.s1 w = N, P_3
1848         nop.i 999 ;;
1849 }
1850
1851 { .mfi
1852         nop.m 999
1853 //
1854 //    Case 4: We need abs of both U_hi and V_hi - dont
1855 //    worry about switched sign of V_hi .
1856 //
1857 (p11) fsub.s1 A = U_hi, V_hi
1858         nop.i 999
1859 }
1860
1861 { .mfi
1862         nop.m 999
1863 //
1864 //    Case 4: A =  U_hi + V_hi
1865 //    Note: Worry about switched sign of V_hi, so subtract instead of add.
1866 //
1867 (p11) fnma.s1 V_lo = N, P_2, V_hi
1868         nop.i 999 ;;
1869 }
1870
1871 { .mfi
1872         nop.m 999
1873 (p11) fms.s1 U_lo = N_0, d_1, U_hi
1874         nop.i 999 ;;
1875 }
1876
1877 { .mfi
1878         nop.m 999
1879 (p11) fabs V_hiabs = V_hi
1880         nop.i 999
1881 }
1882
1883 { .mfi
1884         nop.m 999
1885 //
1886 //    Case 4: V_hi = N * P_2
1887 //            w = N * P_3
1888 //    Note the product does not include the (-) as in the writeup
1889 //    so (-) missing for V_hi and w .
1890 (p10) fadd.s1 r = s_val, w
1891         nop.i 999 ;;
1892 }
1893
1894 { .mfi
1895         nop.m 999
1896 //
1897 //    Case 3: c = s_val - r
1898 //    Case 4: U_lo = N_0 * d_1 - U_hi
1899 //
1900 (p11) fabs U_hiabs = U_hi
1901         nop.i 999
1902 }
1903
1904 { .mfi
1905         nop.m 999
1906 (p11) fmpy.s1 w = N, P_3
1907         nop.i 999 ;;
1908 }
1909
1910 { .mfi
1911         nop.m 999
1912 //
1913 //    Case 4: Set P_12 if U_hiabs >= V_hiabs
1914 //
1915 (p11) fadd.s1 C_hi = s_val, A
1916         nop.i 999 ;;
1917 }
1918
1919 { .mfi
1920         nop.m 999
1921 //
1922 //    Case 4: C_hi = s_val + A
1923 //
1924 (p11) fadd.s1 t = U_lo, V_lo
1925         nop.i 999 ;;
1926 }
1927
1928 { .mfi
1929         nop.m 999
1930 //
1931 //    Case 3: Is |r| < 2**(-2), if so set PR_7
1932 //    else set PR_8.
1933 //    Case 3: If PR_7 is set, prepare to branch to Small_R.
1934 //    Case 3: If PR_8 is set, prepare to branch to Normal_R.
1935 //
1936 (p10) fsub.s1 c = s_val, r
1937         nop.i 999 ;;
1938 }
1939
1940 { .mfi
1941         nop.m 999
1942 //
1943 //    Case 3: c = (s - r) + w (c complete)
1944 //
1945 (p11) fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs
1946         nop.i 999
1947 }
1948
1949 { .mfi
1950         nop.m 999
1951 (p11) fms.s1 w = N_0, d_2, w
1952         nop.i 999 ;;
1953 }
1954
1955 { .mfi
1956         nop.m 999
1957 //
1958 //    Case 4: V_hi = N * P_2
1959 //            w = N * P_3
1960 //    Note the product does not include the (-) as in the writeup
1961 //    so (-) missing for V_hi and w .
1962 //
1963 (p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2
1964         nop.i 999 ;;
1965 }
1966
1967 { .mfi
1968         nop.m 999
1969 (p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2
1970         nop.i 999 ;;
1971 }
1972
1973 { .mfb
1974         nop.m 999
1975 //
1976 //    Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
1977 //    Note: the (-) is still missing for V_hi .
1978 //    Case 4: w = w + N_0 * d_2
1979 //    Note: the (-) is now incorporated in w .
1980 //
1981 (p10) fadd.s1 c = c, w
1982 //
1983 //    Case 4: t = U_lo + V_lo
1984 //    Note: remember V_lo should be (-), subtract instead of add. NO
1985 //
1986 (p14) br.cond.spnt TAN_SMALL_R ;;
1987 }
1988
1989 { .mib
1990         nop.m 999
1991         nop.i 999
1992 (p15) br.cond.spnt TAN_NORMAL_R ;;
1993 }
1994
1995 { .mfi
1996         nop.m 999
1997 //
1998 //    Case 3: Vector off when |r| < 2**(-2).  Recall that PR_3 will be true.
1999 //    The remaining stuff is for Case 4.
2000 //
2001 (p12) fsub.s1 a = U_hi, A
2002 (p11) extr.u i_1 = N_fix_gr, 0, 1 ;;
2003 }
2004
2005 { .mfi
2006         nop.m 999
2007 //
2008 //    Case 4: C_lo = s_val - C_hi
2009 //
2010 (p11) fadd.s1 t = t, w
2011         nop.i 999
2012 }
2013
2014 { .mfi
2015         nop.m 999
2016 (p13) fadd.s1 a = V_hi, A
2017         nop.i 999 ;;
2018 }
2019
2020 //
2021 //    Case 4: a = U_hi - A
2022 //            a = V_hi - A (do an add to account for missing (-) on V_hi
2023 //
2024
2025 { .mfi
2026 (p11)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
2027 (p11) fsub.s1 C_lo = s_val, C_hi
2028         nop.i 999
2029 }
2030 ;;
2031
2032 { .mmi
2033 (p11) ld8 table_ptr1 = [table_ptr1]
2034       nop.m 999
2035       nop.i 999
2036 }
2037 ;;
2038
2039 //
2040 //    Case 4: a = (U_hi - A)  + V_hi
2041 //            a = (V_hi - A)  + U_hi
2042 //    In each case account for negative missing form V_hi .
2043 //
2044 //
2045 //    Case 4: C_lo = (s_val - C_hi) + A
2046 //
2047
2048 { .mmi
2049 (p11) add table_ptr1 = 224, table_ptr1 ;;
2050 (p11) ldfe P1_1 = [table_ptr1], 16
2051         nop.i 999 ;;
2052 }
2053
2054 { .mfi
2055 (p11) ldfe P1_2 = [table_ptr1], 128
2056 //
2057 //    Case 4: w = U_lo + V_lo  + w
2058 //
2059 (p12) fsub.s1 a = a, V_hi
2060         nop.i 999 ;;
2061 }
2062 //
2063 //    Case 4: r = C_hi + C_lo
2064 //
2065
2066 { .mfi
2067 (p11) ldfe Q1_1 = [table_ptr1], 16
2068 (p11) fadd.s1 C_lo = C_lo, A
2069         nop.i 999 ;;
2070 }
2071 //
2072 //    Case 4: c = C_hi - r
2073 //    Get [i_1] - lsb of N_fix_gr.
2074 //
2075
2076 { .mfi
2077 (p11) ldfe Q1_2 = [table_ptr1], 16
2078         nop.f 999
2079         nop.i 999 ;;
2080 }
2081
2082 { .mfi
2083         nop.m 999
2084 (p13) fsub.s1 a = U_hi, a
2085         nop.i 999 ;;
2086 }
2087
2088 { .mfi
2089         nop.m 999
2090 (p11) fadd.s1 t = t, a
2091         nop.i 999 ;;
2092 }
2093
2094 { .mfi
2095         nop.m 999
2096 //
2097 //    Case 4: t = t + a
2098 //
2099 (p11) fadd.s1 C_lo = C_lo, t
2100         nop.i 999 ;;
2101 }
2102
2103 { .mfi
2104         nop.m 999
2105 //
2106 //    Case 4: C_lo = C_lo + t
2107 //
2108 (p11) fadd.s1 r = C_hi, C_lo
2109         nop.i 999 ;;
2110 }
2111
2112 { .mfi
2113         nop.m 999
2114 (p11) fsub.s1 c = C_hi, r
2115         nop.i 999
2116 }
2117
2118 { .mfi
2119         nop.m 999
2120 //
2121 //    Case 4: c = c + C_lo  finished.
2122 //    Is i_1  even or odd?
2123 //    if i_1 == 0, set PR_4, else set PR_5.
2124 //
2125 // r and c have been computed.
2126 // We known whether this is the sine or cosine routine.
2127 // Make sure ftz mode is set - should be automatic when using wre
2128 (p0)  fmpy.s1 rsq = r, r
2129         nop.i 999 ;;
2130 }
2131
2132 { .mfi
2133         nop.m 999
2134 (p11) fadd.s1 c = c , C_lo
2135 (p11) cmp.eq.unc p11, p12 =  0x0000, i_1 ;;
2136 }
2137
2138 { .mfi
2139         nop.m 999
2140 (p12) frcpa.s1 S_hi, p0 = f1, r
2141         nop.i 999
2142 }
2143
2144 { .mfi
2145         nop.m 999
2146 //
2147 //    N odd: Change sign of S_hi
2148 //
2149 (p11) fma.s1 Result = rsq, P1_2, P1_1
2150         nop.i 999 ;;
2151 }
2152
2153 { .mfi
2154         nop.m 999
2155 (p12) fma.s1 P = rsq, Q1_2, Q1_1
2156         nop.i 999
2157 }
2158
2159 { .mfi
2160         nop.m 999
2161 //
2162 //    N odd:  Result  =  S_hi + S_lo      (User supplied rounding mode for C1)
2163 //
2164 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2165         nop.i 999 ;;
2166 }
2167
2168 { .mfi
2169         nop.m 999
2170 //
2171 //    N even: rsq = r * r
2172 //    N odd:  S_hi = frcpa(r)
2173 //
2174 (p12) fmerge.ns S_hi = S_hi, S_hi
2175         nop.i 999
2176 }
2177
2178 { .mfi
2179         nop.m 999
2180 //
2181 //    N even: rsq = rsq * P1_2 + P1_1
2182 //    N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
2183 //
2184 (p11) fmpy.s1 Result = rsq, Result
2185         nop.i 999 ;;
2186 }
2187
2188 { .mfi
2189         nop.m 999
2190 (p12) fma.s1 poly1 = S_hi, r,f1
2191         nop.i 999
2192 }
2193
2194 { .mfi
2195         nop.m 999
2196 //
2197 //    N even: Result =  Result * rsq
2198 //    N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
2199 //
2200 (p11) fma.s1 Result = r, Result, c
2201         nop.i 999 ;;
2202 }
2203
2204 { .mfi
2205         nop.m 999
2206 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2207         nop.i 999
2208 }
2209
2210 { .mfi
2211         nop.m 999
2212 //
2213 //    N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
2214 //
2215 (p11) fadd.s0 Result= r, Result
2216         nop.i 999 ;;
2217 }
2218
2219 { .mfi
2220         nop.m 999
2221 (p12) fma.s1 poly1 =  S_hi, r, f1
2222         nop.i 999 ;;
2223 }
2224
2225 { .mfi
2226         nop.m 999
2227 //
2228 //    N even: Result = Result * r + c
2229 //    N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
2230 //
2231 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2232         nop.i 999 ;;
2233 }
2234
2235 { .mfi
2236         nop.m 999
2237 (p12) fma.s1 poly1 = S_hi, r, f1
2238         nop.i 999 ;;
2239 }
2240
2241 { .mfi
2242         nop.m 999
2243 //
2244 //    N even: Result1 = Result + r  (Rounding mode S0)
2245 //    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
2246 //
2247 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2248         nop.i 999 ;;
2249 }
2250
2251 { .mfi
2252         nop.m 999
2253 //
2254 //    N odd:  poly1  =  S_hi * poly + S_hi    64 bits
2255 //
2256 (p12) fma.s1 poly1 = S_hi, r, f1
2257         nop.i 999 ;;
2258 }
2259
2260 { .mfi
2261         nop.m 999
2262 //
2263 //    N odd:  poly1  =  S_hi * r + 1.0
2264 //
2265 (p12) fma.s1 poly1 = S_hi, c, poly1
2266         nop.i 999 ;;
2267 }
2268
2269 { .mfi
2270         nop.m 999
2271 //
2272 //    N odd:  poly1  =  S_hi * c + poly1
2273 //
2274 (p12) fmpy.s1 S_lo = S_hi, poly1
2275         nop.i 999 ;;
2276 }
2277
2278 { .mfi
2279         nop.m 999
2280 //
2281 //    N odd:  S_lo  =  S_hi *  poly1
2282 //
2283 (p12) fma.s1 S_lo = P, r, S_lo
2284         nop.i 999 ;;
2285 }
2286
2287 { .mfb
2288         nop.m 999
2289 //
2290 //    N odd:  S_lo  =  S_lo + r * P
2291 //
2292 (p12) fadd.s0 Result = S_hi, S_lo
2293 (p0)   br.ret.sptk b0 ;;
2294 }
2295
2296
2297 TAN_SMALL_R:
2298
2299 { .mii
2300         nop.m 999
2301 (p0)  extr.u i_1 = N_fix_gr, 0, 1 ;;
2302 (p0)  cmp.eq.unc p11, p12 = 0x0000, i_1
2303 }
2304
2305 { .mfi
2306         nop.m 999
2307 (p0)  fmpy.s1 rsq = r, r
2308         nop.i 999 ;;
2309 }
2310
2311 { .mfi
2312         nop.m 999
2313 (p12) frcpa.s1 S_hi, p0 = f1, r
2314         nop.i 999
2315 }
2316
2317 { .mfi
2318 (p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
2319         nop.f 999
2320         nop.i 999
2321 }
2322 ;;
2323
2324 { .mmi
2325 (p0)  ld8 table_ptr1 = [table_ptr1]
2326       nop.m 999
2327       nop.i 999
2328 }
2329 ;;
2330
2331 // *****************************************************************
2332 // *****************************************************************
2333 // *****************************************************************
2334
2335 { .mmi
2336 (p0)  add table_ptr1 = 224, table_ptr1 ;;
2337 (p0)  ldfe P1_1 = [table_ptr1], 16
2338         nop.i 999 ;;
2339 }
2340 //    r and c have been computed.
2341 //    We known whether this is the sine or cosine routine.
2342 //    Make sure ftz mode is set - should be automatic when using wre
2343 //    |r| < 2**(-2)
2344
2345 { .mfi
2346 (p0)  ldfe P1_2 = [table_ptr1], 16
2347 (p11) fmpy.s1 r_to_the_8 = rsq, rsq
2348         nop.i 999 ;;
2349 }
2350 //
2351 //    Set table_ptr1 to beginning of constant table.
2352 //    Get [i_1] - lsb of N_fix_gr.
2353 //
2354
2355 { .mfi
2356 (p0)  ldfe P1_3 = [table_ptr1], 96
2357 //
2358 //    N even: rsq = r * r
2359 //    N odd:  S_hi = frcpa(r)
2360 //
2361 (p12) fmerge.ns S_hi = S_hi, S_hi
2362         nop.i 999 ;;
2363 }
2364 //
2365 //    Is i_1  even or odd?
2366 //    if i_1 == 0, set PR_11.
2367 //    if i_1 != 0, set PR_12.
2368 //
2369
2370 { .mfi
2371 (p11) ldfe P1_9 = [table_ptr1], -16
2372 //
2373 //    N even: Poly2 = P1_7 + Poly2 * rsq
2374 //    N odd:  poly2 = Q1_5 + poly2 * rsq
2375 //
2376 (p11) fadd.s1 CORR = rsq, f1
2377         nop.i 999 ;;
2378 }
2379
2380 { .mmi
2381 (p11) ldfe P1_8 = [table_ptr1], -16 ;;
2382 //
2383 //    N even: Poly1 = P1_2 + P1_3 * rsq
2384 //    N odd:  poly1 =  1.0 +  S_hi * r
2385 //    16 bits partial  account for necessary (-1)
2386 //
2387 (p11) ldfe P1_7 = [table_ptr1], -16
2388         nop.i 999 ;;
2389 }
2390 //
2391 //    N even: Poly1 = P1_1 + Poly1 * rsq
2392 //    N odd:  S_hi  =  S_hi + S_hi * poly1)     16 bits account for necessary
2393 //
2394
2395 { .mfi
2396 (p11) ldfe P1_6 = [table_ptr1], -16
2397 //
2398 //    N even: Poly2 = P1_5 + Poly2 * rsq
2399 //    N odd:  poly2 = Q1_3 + poly2 * rsq
2400 //
2401 (p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8
2402         nop.i 999 ;;
2403 }
2404 //
2405 //    N even: Poly1 =  Poly1 * rsq
2406 //    N odd:  poly1  = 1.0 + S_hi * r         32 bits partial
2407 //
2408
2409 { .mfi
2410 (p11) ldfe P1_5 = [table_ptr1], -16
2411 (p12) fma.s1 poly1 =  S_hi, r, f1
2412         nop.i 999 ;;
2413 }
2414 //
2415 //    N even: CORR =  CORR * c
2416 //    N odd:  S_hi  =  S_hi * poly1 + S_hi    32 bits
2417 //
2418
2419 //
2420 //    N even: Poly2 = P1_6 + Poly2 * rsq
2421 //    N odd:  poly2 = Q1_4 + poly2 * rsq
2422 //
2423 { .mmf
2424 (p0)  addl           table_ptr2   = @ltoff(TAN_BASE_CONSTANTS), gp
2425 (p11) ldfe P1_4 = [table_ptr1], -16
2426 (p11) fmpy.s1 CORR =  CORR, c
2427 }
2428 ;;
2429
2430
2431 { .mmi
2432 (p0)  ld8 table_ptr2 = [table_ptr2]
2433       nop.m 999
2434       nop.i 999
2435 }
2436 ;;
2437
2438
2439 { .mii
2440 (p0)  add table_ptr2 = 464, table_ptr2
2441         nop.i 999 ;;
2442         nop.i 999
2443 }
2444
2445 { .mfi
2446         nop.m 999
2447 (p11) fma.s1 Poly1 = P1_3, rsq, P1_2
2448         nop.i 999 ;;
2449 }
2450
2451 { .mfi
2452 (p0)  ldfe Q1_7 = [table_ptr2], -16
2453 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2454         nop.i 999 ;;
2455 }
2456
2457 { .mfi
2458 (p0)  ldfe Q1_6 = [table_ptr2], -16
2459 (p11) fma.s1 Poly2 = P1_9, rsq, P1_8
2460         nop.i 999 ;;
2461 }
2462
2463 { .mmi
2464 (p0)  ldfe Q1_5 = [table_ptr2], -16 ;;
2465 (p12) ldfe Q1_4 = [table_ptr2], -16
2466         nop.i 999 ;;
2467 }
2468
2469 { .mfi
2470 (p12) ldfe Q1_3 = [table_ptr2], -16
2471 //
2472 //    N even: Poly2 = P1_8 + P1_9 * rsq
2473 //    N odd:  poly2 = Q1_6 + Q1_7 * rsq
2474 //
2475 (p11) fma.s1 Poly1 = Poly1, rsq, P1_1
2476         nop.i 999 ;;
2477 }
2478
2479 { .mfi
2480 (p12) ldfe Q1_2 = [table_ptr2], -16
2481 (p12) fma.s1 poly1 = S_hi, r, f1
2482         nop.i 999 ;;
2483 }
2484
2485 { .mfi
2486 (p12) ldfe Q1_1 = [table_ptr2], -16
2487 (p11) fma.s1 Poly2 = Poly2, rsq, P1_7
2488         nop.i 999 ;;
2489 }
2490
2491 { .mfi
2492         nop.m 999
2493 //
2494 //    N even: CORR =  rsq + 1
2495 //    N even: r_to_the_8 =  rsq * rsq
2496 //
2497 (p11) fmpy.s1 Poly1 = Poly1, rsq
2498         nop.i 999 ;;
2499 }
2500
2501 { .mfi
2502         nop.m 999
2503 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2504         nop.i 999
2505 }
2506
2507 { .mfi
2508         nop.m 999
2509 (p12) fma.s1 poly2 = Q1_7, rsq, Q1_6
2510         nop.i 999 ;;
2511 }
2512
2513 { .mfi
2514         nop.m 999
2515 (p11) fma.s1 Poly2 = Poly2, rsq, P1_6
2516         nop.i 999 ;;
2517 }
2518
2519 { .mfi
2520         nop.m 999
2521 (p12) fma.s1 poly1 = S_hi, r, f1
2522         nop.i 999
2523 }
2524
2525 { .mfi
2526         nop.m 999
2527 (p12) fma.s1 poly2 = poly2, rsq, Q1_5
2528         nop.i 999 ;;
2529 }
2530
2531 { .mfi
2532         nop.m 999
2533 (p11) fma.s1 Poly2= Poly2, rsq, P1_5
2534         nop.i 999 ;;
2535 }
2536
2537 { .mfi
2538         nop.m 999
2539 (p12) fma.s1 S_hi =  S_hi, poly1, S_hi
2540         nop.i 999
2541 }
2542
2543 { .mfi
2544         nop.m 999
2545 (p12) fma.s1 poly2 = poly2, rsq, Q1_4
2546         nop.i 999 ;;
2547 }
2548
2549 { .mfi
2550         nop.m 999
2551 //
2552 //    N even: r_to_the_8 = r_to_the_8 * r_to_the_8
2553 //    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
2554 //
2555 (p11) fma.s1 Poly2 = Poly2, rsq, P1_4
2556         nop.i 999 ;;
2557 }
2558
2559 { .mfi
2560         nop.m 999
2561 //
2562 //    N even: Result = CORR + Poly * r
2563 //    N odd:  P = Q1_1 + poly2 * rsq
2564 //
2565 (p12) fma.s1 poly1 = S_hi, r, f1
2566         nop.i 999
2567 }
2568
2569 { .mfi
2570         nop.m 999
2571 (p12) fma.s1 poly2 = poly2, rsq, Q1_3
2572         nop.i 999 ;;
2573 }
2574
2575 { .mfi
2576         nop.m 999
2577 //
2578 //    N even: Poly2 = P1_4 + Poly2 * rsq
2579 //    N odd:  poly2 = Q1_2 + poly2 * rsq
2580 //
2581 (p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1
2582         nop.i 999 ;;
2583 }
2584
2585 { .mfi
2586         nop.m 999
2587 (p12) fma.s1 poly1 = S_hi, c, poly1
2588         nop.i 999
2589 }
2590
2591 { .mfi
2592         nop.m 999
2593 (p12) fma.s1 poly2 = poly2, rsq, Q1_2
2594         nop.i 999 ;;
2595 }
2596
2597 { .mfi
2598         nop.m 999
2599 //
2600 //    N even: Poly = Poly1 + Poly2 * r_to_the_8
2601 //    N odd:  S_hi =  S_hi * poly1 + S_hi    64 bits
2602 //
2603 (p11) fma.s1 Result = Poly, r, CORR
2604         nop.i 999 ;;
2605 }
2606
2607 { .mfi
2608         nop.m 999
2609 //
2610 //    N even: Result =  r + Result  (User supplied rounding mode)
2611 //    N odd:  poly1  =  S_hi * c + poly1
2612 //
2613 (p12) fmpy.s1 S_lo = S_hi, poly1
2614         nop.i 999
2615 }
2616
2617 { .mfi
2618         nop.m 999
2619 (p12) fma.s1 P = poly2, rsq, Q1_1
2620         nop.i 999 ;;
2621 }
2622
2623 { .mfi
2624         nop.m 999
2625 //
2626 //    N odd:  poly1  =  S_hi * r + 1.0
2627 //
2628 (p11) fadd.s0 Result = Result, r
2629         nop.i 999 ;;
2630 }
2631
2632 { .mfi
2633         nop.m 999
2634 //
2635 //    N odd:  S_lo  =  S_hi *  poly1
2636 //
2637 (p12) fma.s1 S_lo = Q1_1, c, S_lo
2638         nop.i 999
2639 }
2640
2641 { .mfi
2642         nop.m 999
2643 //
2644 //    N odd:  Result = Result + S_hi  (user supplied rounding mode)
2645 //
2646 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2647         nop.i 999 ;;
2648 }
2649
2650 { .mfi
2651         nop.m 999
2652 //
2653 //    N odd:  S_lo  =  Q1_1 * c + S_lo
2654 //
2655 (p12) fma.s1 Result = P, r, S_lo
2656         nop.i 999 ;;
2657 }
2658
2659 { .mfb
2660         nop.m 999
2661 //
2662 //    N odd:  Result =  S_lo + r * P
2663 //
2664 (p12) fadd.s0 Result = Result, S_hi
2665 (p0)   br.ret.sptk b0 ;;
2666 }
2667
2668
2669 TAN_NORMAL_R:
2670
2671 { .mfi
2672 (p0)  getf.sig sig_r = r
2673 // *******************************************************************
2674 // *******************************************************************
2675 // *******************************************************************
2676 //
2677 //    r and c have been computed.
2678 //    Make sure ftz mode is set - should be automatic when using wre
2679 //
2680 //
2681 //    Get [i_1] -  lsb of N_fix_gr alone.
2682 //
2683 (p0)  fmerge.s  Pos_r = f1, r
2684 (p0)  extr.u i_1 = N_fix_gr, 0, 1 ;;
2685 }
2686
2687 { .mfi
2688         nop.m 999
2689 (p0)  fmerge.s  sgn_r =  r, f1
2690 (p0)  cmp.eq.unc p11, p12 = 0x0000, i_1 ;;
2691 }
2692
2693 { .mfi
2694         nop.m 999
2695         nop.f 999
2696 (p0)  extr.u lookup = sig_r, 58, 5
2697 }
2698
2699 { .mlx
2700         nop.m 999
2701 (p0)  movl Create_B = 0x8200000000000000 ;;
2702 }
2703
2704 { .mfi
2705 (p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
2706         nop.f 999
2707 (p0)  dep Create_B = lookup, Create_B, 58, 5
2708 }
2709 ;;
2710
2711 //
2712 //    Get [i_1] -  lsb of N_fix_gr alone.
2713 //    Pos_r = abs (r)
2714 //
2715
2716
2717 { .mmi
2718       ld8 table_ptr1 = [table_ptr1]
2719       nop.m 999
2720       nop.i 999
2721 }
2722 ;;
2723
2724
2725 { .mmi
2726         nop.m 999
2727 (p0)  setf.sig B = Create_B
2728 //
2729 //    Set table_ptr1 and table_ptr2 to base address of
2730 //    constant table.
2731 //
2732 (p0)  add table_ptr1 = 480, table_ptr1 ;;
2733 }
2734
2735 { .mmb
2736         nop.m 999
2737 //
2738 //    Is i_1 or i_0  == 0 ?
2739 //    Create the constant  1 00000 1000000000000000000000...
2740 //
2741 (p0)  ldfe P2_1 = [table_ptr1], 16
2742         nop.b 999
2743 }
2744
2745 { .mmi
2746         nop.m 999 ;;
2747 (p0)  getf.exp exp_r = Pos_r
2748         nop.i 999
2749 }
2750 //
2751 //    Get r's exponent
2752 //    Get r's significand
2753 //
2754
2755 { .mmi
2756 (p0)  ldfe P2_2 = [table_ptr1], 16 ;;
2757 //
2758 //    Get the 5 bits or r for the lookup.   1.xxxxx ....
2759 //    from sig_r.
2760 //    Grab  lsb of exp of B
2761 //
2762 (p0)  ldfe P2_3 = [table_ptr1], 16
2763         nop.i 999 ;;
2764 }
2765
2766 { .mii
2767         nop.m 999
2768 (p0)  andcm table_offset = 0x0001, exp_r ;;
2769 (p0)  shl table_offset = table_offset, 9 ;;
2770 }
2771
2772 { .mii
2773         nop.m 999
2774 //
2775 //    Deposit   0 00000 1000000000000000000000... on
2776 //              1 xxxxx yyyyyyyyyyyyyyyyyyyyyy...,
2777 //    getting rid of the ys.
2778 //    Is  B = 2** -2 or  B= 2** -1? If 2**-1, then
2779 //    we want an offset of 512 for table addressing.
2780 //
2781 (p0)  shladd table_offset = lookup, 4, table_offset ;;
2782 //
2783 //    B =  ........ 1xxxxx 1000000000000000000...
2784 //
2785 (p0)  add table_ptr1 = table_ptr1, table_offset ;;
2786 }
2787
2788 { .mmb
2789         nop.m 999
2790 //
2791 //   B =  ........ 1xxxxx 1000000000000000000...
2792 //   Convert B so it has the same exponent as Pos_r
2793 //
2794 (p0)  ldfd T_hi = [table_ptr1], 8
2795         nop.b 999 ;;
2796 }
2797
2798 //
2799 //    x = |r| - B
2800 //    Load T_hi.
2801 //    Load C_hi.
2802 //
2803
2804 { .mmf
2805 (p0)  addl           table_ptr2   = @ltoff(TAN_BASE_CONSTANTS), gp
2806 (p0)  ldfs T_lo = [table_ptr1]
2807 (p0)  fmerge.se B = Pos_r, B
2808 }
2809 ;;
2810
2811 { .mmi
2812       ld8 table_ptr2 = [table_ptr2]
2813       nop.m 999
2814       nop.i 999
2815 }
2816 ;;
2817
2818 { .mii
2819 (p0)  add table_ptr2 = 1360, table_ptr2
2820         nop.i 999 ;;
2821 (p0)  add table_ptr2 = table_ptr2, table_offset ;;
2822 }
2823
2824 { .mfi
2825 (p0)  ldfd C_hi = [table_ptr2], 8
2826 (p0)  fsub.s1 x = Pos_r, B
2827         nop.i 999 ;;
2828 }
2829
2830 { .mii
2831 (p0)  ldfs C_lo = [table_ptr2],255
2832         nop.i 999 ;;
2833 //
2834 //    xsq = x * x
2835 //    N even: Tx = T_hi * x
2836 //    Load T_lo.
2837 //    Load C_lo - increment pointer to get SC_inv
2838 //    - cant get all the way, do an add later.
2839 //
2840 (p0)  add table_ptr2 = 569, table_ptr2 ;;
2841 }
2842 //
2843 //    N even: Tx1 = Tx + 1
2844 //    N odd:  Cx1 = 1 - Cx
2845 //
2846
2847 { .mfi
2848 (p0)  ldfe SC_inv = [table_ptr2], 0
2849         nop.f 999
2850         nop.i 999 ;;
2851 }
2852
2853 { .mfi
2854         nop.m 999
2855 (p0)  fmpy.s1 xsq = x, x
2856         nop.i 999
2857 }
2858
2859 { .mfi
2860         nop.m 999
2861 (p11) fmpy.s1 Tx = T_hi, x
2862         nop.i 999 ;;
2863 }
2864
2865 { .mfi
2866         nop.m 999
2867 (p12) fmpy.s1 Cx = C_hi, x
2868         nop.i 999 ;;
2869 }
2870
2871 { .mfi
2872         nop.m 999
2873 //
2874 //    N odd: Cx = C_hi * x
2875 //
2876 (p0)  fma.s1 P = P2_3, xsq, P2_2
2877         nop.i 999
2878 }
2879
2880 { .mfi
2881         nop.m 999
2882 //
2883 //    N even and odd: P = P2_3 + P2_2 * xsq
2884 //
2885 (p11) fadd.s1 Tx1 = Tx, f1
2886         nop.i 999 ;;
2887 }
2888
2889 { .mfi
2890         nop.m 999
2891 //
2892 //    N even: D = C_hi - tanx
2893 //    N odd: D = T_hi + tanx
2894 //
2895 (p11) fmpy.s1 CORR = SC_inv, T_hi
2896         nop.i 999
2897 }
2898
2899 { .mfi
2900         nop.m 999
2901 (p0)  fmpy.s1 Sx = SC_inv, x
2902         nop.i 999 ;;
2903 }
2904
2905 { .mfi
2906         nop.m 999
2907 (p12) fmpy.s1 CORR = SC_inv, C_hi
2908         nop.i 999 ;;
2909 }
2910
2911 { .mfi
2912         nop.m 999
2913 (p12) fsub.s1 V_hi = f1, Cx
2914         nop.i 999 ;;
2915 }
2916
2917 { .mfi
2918         nop.m 999
2919 (p0)  fma.s1 P = P, xsq, P2_1
2920         nop.i 999
2921 }
2922
2923 { .mfi
2924         nop.m 999
2925 //
2926 //    N even and odd: P = P2_1 + P * xsq
2927 //
2928 (p11) fma.s1 V_hi = Tx, Tx1, f1
2929         nop.i 999 ;;
2930 }
2931
2932 { .mfi
2933         nop.m 999
2934 //
2935 //    N even: Result  = sgn_r * tail + T_hi (user rounding mode for C1)
2936 //    N odd:  Result  = sgn_r * tail + C_hi (user rounding mode for C1)
2937 //
2938 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2939         nop.i 999 ;;
2940 }
2941
2942 { .mfi
2943         nop.m 999
2944 (p0)  fmpy.s1 CORR = CORR, c
2945         nop.i 999 ;;
2946 }
2947
2948 { .mfi
2949         nop.m 999
2950 (p12) fnma.s1 V_hi = Cx,V_hi,f1
2951         nop.i 999 ;;
2952 }
2953
2954 { .mfi
2955         nop.m 999
2956 //
2957 //    N even: V_hi = Tx * Tx1 + 1
2958 //    N odd: Cx1 = 1 - Cx * Cx1
2959 //
2960 (p0)  fmpy.s1 P = P, xsq
2961         nop.i 999
2962 }
2963
2964 { .mfi
2965         nop.m 999
2966 //
2967 //    N even and odd: P = P * xsq
2968 //
2969 (p11) fmpy.s1 V_hi = V_hi, T_hi
2970         nop.i 999 ;;
2971 }
2972
2973 { .mfi
2974         nop.m 999
2975 //
2976 //    N even and odd: tail = P * tail + V_lo
2977 //
2978 (p11) fmpy.s1 T_hi = sgn_r, T_hi
2979         nop.i 999 ;;
2980 }
2981
2982 { .mfi
2983         nop.m 999
2984 (p0)  fmpy.s1 CORR = CORR, sgn_r
2985         nop.i 999 ;;
2986 }
2987
2988 { .mfi
2989         nop.m 999
2990 (p12) fmpy.s1 V_hi = V_hi,C_hi
2991         nop.i 999 ;;
2992 }
2993
2994 { .mfi
2995         nop.m 999
2996 //
2997 //    N even: V_hi = T_hi * V_hi
2998 //    N odd: V_hi  = C_hi * V_hi
2999 //
3000 (p0)  fma.s1 tanx = P, x, x
3001         nop.i 999
3002 }
3003
3004 { .mfi
3005         nop.m 999
3006 (p12) fnmpy.s1 C_hi = sgn_r, C_hi
3007         nop.i 999 ;;
3008 }
3009
3010 { .mfi
3011         nop.m 999
3012 //
3013 //    N even: V_lo = 1 - V_hi + C_hi
3014 //    N odd: V_lo = 1 - V_hi + T_hi
3015 //
3016 (p11) fadd.s1 CORR = CORR, T_lo
3017         nop.i 999
3018 }
3019
3020 { .mfi
3021         nop.m 999
3022 (p12) fsub.s1 CORR = CORR, C_lo
3023         nop.i 999 ;;
3024 }
3025
3026 { .mfi
3027         nop.m 999
3028 //
3029 //    N even and odd: tanx = x + x * P
3030 //    N even and odd: Sx = SC_inv * x
3031 //
3032 (p11) fsub.s1 D = C_hi, tanx
3033         nop.i 999
3034 }
3035
3036 { .mfi
3037         nop.m 999
3038 (p12) fadd.s1 D = T_hi, tanx
3039         nop.i 999 ;;
3040 }
3041
3042 { .mfi
3043         nop.m 999
3044 //
3045 //    N odd: CORR = SC_inv * C_hi
3046 //    N even: CORR = SC_inv * T_hi
3047 //
3048 (p0)  fnma.s1 D = V_hi, D, f1
3049         nop.i 999 ;;
3050 }
3051
3052 { .mfi
3053         nop.m 999
3054 //
3055 //    N even and odd: D = 1 - V_hi * D
3056 //    N even and odd: CORR = CORR * c
3057 //
3058 (p0)  fma.s1 V_hi = V_hi, D, V_hi
3059         nop.i 999 ;;
3060 }
3061
3062 { .mfi
3063         nop.m 999
3064 //
3065 //    N even and odd: V_hi = V_hi + V_hi * D
3066 //    N even and odd: CORR = sgn_r * CORR
3067 //
3068 (p11) fnma.s1 V_lo = V_hi, C_hi, f1
3069         nop.i 999
3070 }
3071
3072 { .mfi
3073         nop.m 999
3074 (p12) fnma.s1 V_lo = V_hi, T_hi, f1
3075         nop.i 999 ;;
3076 }
3077
3078 { .mfi
3079         nop.m 999
3080 //
3081 //    N even: CORR = COOR + T_lo
3082 //    N odd: CORR = CORR - C_lo
3083 //
3084 (p11) fma.s1 V_lo = tanx, V_hi, V_lo
3085         nop.i 999
3086 }
3087
3088 { .mfi
3089         nop.m 999
3090 (p12) fnma.s1 V_lo = tanx, V_hi, V_lo
3091         nop.i 999 ;;
3092 }
3093
3094 { .mfi
3095         nop.m 999
3096 //
3097 //    N even: V_lo = V_lo + V_hi * tanx
3098 //    N odd: V_lo = V_lo - V_hi * tanx
3099 //
3100 (p11) fnma.s1 V_lo = C_lo, V_hi, V_lo
3101         nop.i 999
3102 }
3103
3104 { .mfi
3105         nop.m 999
3106 (p12) fnma.s1 V_lo = T_lo, V_hi, V_lo
3107         nop.i 999 ;;
3108 }
3109
3110 { .mfi
3111         nop.m 999
3112 //
3113 //    N  even: V_lo = V_lo - V_hi * C_lo
3114 //    N  odd: V_lo = V_lo - V_hi * T_lo
3115 //
3116 (p0)  fmpy.s1 V_lo = V_hi, V_lo
3117         nop.i 999 ;;
3118 }
3119
3120 { .mfi
3121         nop.m 999
3122 //
3123 //    N even and odd: V_lo = V_lo * V_hi
3124 //
3125 (p0)  fadd.s1 tail = V_hi, V_lo
3126         nop.i 999 ;;
3127 }
3128
3129 { .mfi
3130         nop.m 999
3131 //
3132 //    N even and odd: tail = V_hi + V_lo
3133 //
3134 (p0)  fma.s1 tail = tail, P, V_lo
3135         nop.i 999 ;;
3136 }
3137
3138 { .mfi
3139         nop.m 999
3140 //
3141 //    N even: T_hi = sgn_r * T_hi
3142 //    N odd : C_hi = -sgn_r * C_hi
3143 //
3144 (p0)  fma.s1 tail = tail, Sx, CORR
3145         nop.i 999 ;;
3146 }
3147
3148 { .mfi
3149         nop.m 999
3150 //
3151 //    N even and odd: tail = Sx * tail + CORR
3152 //
3153 (p0)  fma.s1 tail = V_hi, Sx, tail
3154         nop.i 999 ;;
3155 }
3156
3157 { .mfi
3158         nop.m 999
3159 //
3160 //    N even an odd: tail = Sx * V_hi + tail
3161 //
3162 (p11) fma.s0 Result = sgn_r, tail, T_hi
3163         nop.i 999
3164 }
3165
3166 { .mfb
3167         nop.m 999
3168 (p12) fma.s0 Result = sgn_r, tail, C_hi
3169 (p0)   br.ret.sptk b0 ;;
3170 }
3171
3172 .endp __libm_tan
3173 ASM_SIZE_DIRECTIVE(__libm_tan)
3174
3175
3176
3177 // *******************************************************************
3178 // *******************************************************************
3179 // *******************************************************************
3180 //
3181 //     Special Code to handle very large argument case.
3182 //     Call int pi_by_2_reduce(&x,&r)
3183 //     for |arguments| >= 2**63
3184 //     (Arg or x) is in f8
3185 //     Address to save r and c as double
3186
3187 //                 (1)                    (2)                 (3) (call)         (4)
3188 //            sp -> +               psp -> +            psp -> +           sp ->  +
3189 //                  |                      |                   |                  |
3190 //                  |                r50 ->| <- r50      f0  ->|           r50 -> | -> c
3191 //                  |                      |                   |                  |
3192 //         sp-32 -> | <- r50          f0 ->|             f0  ->| <- r50    r49 -> | -> r
3193 //                  |                      |                   |                  |
3194 //                  |               r49  ->| <- r49     Arg  ->| <- r49           | -> x
3195 //                  |                      |                   |                  |
3196 //         sp -64 ->|             sp -64 ->|          sp -64 ->|                  |
3197 //
3198 //            save pfs           save b0                                     restore gp
3199 //            save gp                                                        restore b0
3200 //                                                                           restore pfs
3201
3202
3203
3204 .proc __libm_callout
3205 __libm_callout:
3206 TAN_ARG_TOO_LARGE:
3207 .prologue
3208 // (1)
3209 { .mfi
3210         add   GR_Parameter_r =-32,sp                        // Parameter: r address
3211         nop.f 0
3212 .save   ar.pfs,GR_SAVE_PFS
3213         mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
3214 }
3215 { .mfi
3216 .fframe 64
3217         add sp=-64,sp                           // Create new stack
3218         nop.f 0
3219         mov GR_SAVE_GP=gp                       // Save gp
3220 };;
3221
3222 // (2)
3223 { .mmi
3224         stfe [GR_Parameter_r ] = f0,16                      // Clear Parameter r on stack
3225         add  GR_Parameter_X = 16,sp                        // Parameter x address
3226 .save   b0, GR_SAVE_B0
3227         mov GR_SAVE_B0=b0                       // Save b0
3228 };;
3229
3230 // (3)
3231 .body
3232 { .mib
3233         stfe [GR_Parameter_r ] = f0,-16                     // Clear Parameter c on stack
3234         nop.i 0
3235         nop.b 0
3236 }
3237 { .mib
3238         stfe [GR_Parameter_X] = Arg                        // Store Parameter x on stack
3239         nop.i 0
3240 (p0)    br.call.sptk b0=__libm_pi_by_2_reduce#
3241 }
3242 ;;
3243
3244
3245 // (4)
3246 { .mmi
3247         mov   gp = GR_SAVE_GP                  // Restore gp
3248 (p0)    mov   N_fix_gr = r8
3249         nop.i 999
3250 }
3251 ;;
3252
3253 { .mmi
3254 (p0)    ldfe  Arg        =[GR_Parameter_X],16
3255 (p0)    ldfs  TWO_TO_NEG2 = [table_ptr2],4
3256         nop.i 999
3257 }
3258 ;;
3259
3260
3261 { .mmb
3262 (p0)    ldfe  r =[GR_Parameter_r ],16
3263 (p0)    ldfs  NEGTWO_TO_NEG2 = [table_ptr2],4
3264         nop.b 999 ;;
3265 }
3266
3267 { .mfi
3268 (p0)    ldfe  c =[GR_Parameter_r ]
3269         nop.f 999
3270         nop.i 999 ;;
3271 }
3272
3273 { .mfi
3274         nop.m 999
3275 //
3276 //     Is |r| < 2**(-2)
3277 //
3278 (p0)   fcmp.lt.unc.s1  p6, p0 = r, TWO_TO_NEG2
3279         mov   b0 = GR_SAVE_B0                  // Restore return address
3280 }
3281 ;;
3282
3283 { .mfi
3284        nop.m 999
3285 (p6)   fcmp.gt.unc.s1  p6, p0 = r, NEGTWO_TO_NEG2
3286        mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
3287 }
3288 ;;
3289
3290 { .mbb
3291 .restore sp
3292         add   sp = 64,sp                       // Restore stack pointer
3293 (p6)   br.cond.spnt TAN_SMALL_R
3294 (p0)   br.cond.sptk TAN_NORMAL_R
3295 }
3296 ;;
3297 .endp __libm_callout
3298 ASM_SIZE_DIRECTIVE(__libm_callout)
3299
3300
3301 .proc __libm_TAN_SPECIAL
3302 __libm_TAN_SPECIAL:
3303
3304 //
3305 //     Code for NaNs, Unsupporteds, Infs, or +/- zero ?
3306 //     Invalid raised for Infs and SNaNs.
3307 //
3308
3309 { .mfb
3310         nop.m 999
3311 (p0)   fmpy.s0 Arg = Arg, f0
3312 (p0)   br.ret.sptk b0
3313 }
3314 .endp __libm_TAN_SPECIAL
3315 ASM_SIZE_DIRECTIVE(__libm_TAN_SPECIAL)
3316
3317
3318 .type __libm_pi_by_2_reduce#,@function
3319 .global __libm_pi_by_2_reduce#