sysdeps/ia64/fpu/libm_tan.S

   1 .file "libm_tan.s"
   2
   3 // Copyright (C) 2000, 2001, Intel Corporation
   4 // All rights reserved.
   5 //
   6 //
   7 // Redistribution and use in source and binary forms, with or without
   8 // modification, are permitted provided that the following conditions are
   9 // met:
  10 //
  11 // * Redistributions of source code must retain the above copyright
  12 // notice, this list of conditions and the following disclaimer.
  13 //
  14 // * Redistributions in binary form must reproduce the above copyright
  15 // notice, this list of conditions and the following disclaimer in the
  16 // documentation and/or other materials provided with the distribution.
  17 //
  18 // * The name of Intel Corporation may not be used to endorse or promote
  19 // products derived from this software without specific prior written
  20 // permission.
  21 //
  22 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  23 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  24 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  25 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  26 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  27 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  28 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  29 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  30 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  31 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  32 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  33 //
  34 // Intel Corporation is the author of this code, and requests that all
  35 // problem reports or change requests be submitted to it directly at
  36 // http://developer.intel.com/opensource.
  37 //
  38 // *********************************************************************
  39 //
  40 // History:
  41 // 02/02/00 Initial Version
  42 // 4/04/00  Unwind support added
  43 // 12/28/00 Fixed false invalid flags
  44 //
  45 // *********************************************************************
  46 //
  47 // Function:   tan(x) = tangent(x), for double precision x values
  48 //
  49 // *********************************************************************
  50 //
  51 // Accuracy:       Very accurate for double-precision values
  52 //
  53 // *********************************************************************
  54 //
  55 // Resources Used:
  56 //
  57 //    Floating-Point Registers: f8 (Input and Return Value)
  58 //                              f9-f15
  59 //                              f32-f112
  60 //
  61 //    General Purpose Registers:
  62 //      r32-r48
  63 //      r49-r50 (Used to pass arguments to pi_by_2 reduce routine)
  64 //
  65 //    Predicate Registers:      p6-p15
  66 //
  67 // *********************************************************************
  68 //
  69 // IEEE Special Conditions:
  70 //
  71 //    Denormal  fault raised on denormal inputs
  72 //    Overflow exceptions do not occur
  73 //    Underflow exceptions raised when appropriate for tan
  74 //    (No specialized error handling for this routine)
  75 //    Inexact raised when appropriate by algorithm
  76 //
  77 //    tan(SNaN) = QNaN
  78 //    tan(QNaN) = QNaN
  79 //    tan(inf) = QNaN
  80 //    tan(+/-0) = +/-0
  81 //
  82 // *********************************************************************
  83 //
  84 // Mathematical Description
  85 //
  86 // We consider the computation of FPTAN of Arg. Now, given
  87 //
  88 //      Arg = N pi/2  + alpha,          |alpha| <= pi/4,
  89 //
  90 // basic mathematical relationship shows that
  91 //
  92 //      tan( Arg ) =  tan( alpha )     if N is even;
  93 //                 = -cot( alpha )      otherwise.
  94 //
  95 // The value of alpha is obtained by argument reduction and
  96 // represented by two working precision numbers r and c where
  97 //
  98 //      alpha =  r  +  c     accurately.
  99 //
 100 // The reduction method is described in a previous write up.
 101 // The argument reduction scheme identifies 4 cases. For Cases 2
 102 // and 4, because |alpha| is small, tan(r+c) and -cot(r+c) can be
 103 // computed very easily by 2 or 3 terms of the Taylor series
 104 // expansion as follows:
 105 //
 106 // Case 2:
 107 // -------
 108 //
 109 //      tan(r + c) = r + c + r^3/3          ...accurately
 110 //        -cot(r + c) = -1/(r+c) + r/3          ...accurately
 111 //
 112 // Case 4:
 113 // -------
 114 //
 115 //      tan(r + c) = r + c + r^3/3 + 2r^5/15     ...accurately
 116 //        -cot(r + c) = -1/(r+c) + r/3 + r^3/45     ...accurately
 117 //
 118 //
 119 // The only cases left are Cases 1 and 3 of the argument reduction
 120 // procedure. These two cases will be merged since after the
 121 // argument is reduced in either cases, we have the reduced argument
 122 // represented as r + c and that the magnitude |r + c| is not small
 123 // enough to allow the usage of a very short approximation.
 124 //
 125 // The greatest challenge of this task is that the second terms of
 126 // the Taylor series for tan(r) and -cot(r)
 127 //
 128 //      r + r^3/3 + 2 r^5/15 + ...
 129 //
 130 // and
 131 //
 132 //      -1/r + r/3 + r^3/45 + ...
 133 //
 134 // are not very small when |r| is close to pi/4 and the rounding
 135 // errors will be a concern if simple polynomial accumulation is
 136 // used. When |r| < 2^(-2), however, the second terms will be small
 137 // enough (5 bits or so of right shift) that a normal Horner
 138 // recurrence suffices. Hence there are two cases that we consider
 139 // in the accurate computation of tan(r) and cot(r), |r| <= pi/4.
 140 //
 141 // Case small_r: |r| < 2^(-2)
 142 // --------------------------
 143 //
 144 // Since Arg = N pi/4 + r + c accurately, we have
 145 //
 146 //      tan(Arg) =  tan(r+c)            for N even,
 147 //            = -cot(r+c)          otherwise.
 148 //
 149 // Here for this case, both tan(r) and -cot(r) can be approximated
 150 // by simple polynomials:
 151 //
 152 //      tan(r) =    r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 153 //        -cot(r) = -1/r + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 154 //
 155 // accurately. Since |r| is relatively small, tan(r+c) and
 156 // -cot(r+c) can be accurately approximated by replacing r with
 157 // r+c only in the first two terms of the corresponding polynomials.
 158 //
 159 // Note that P1_1 (and Q1_1 for that matter) approximates 1/3 to
 160 // almost 64 sig. bits, thus
 161 //
 162 //      P1_1 (r+c)^3 =  P1_1 r^3 + c * r^2     accurately.
 163 //
 164 // Hence,
 165 //
 166 //      tan(r+c) =    r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 167 //                     + c*(1 + r^2)
 168 //
 169 //        -cot(r+c) = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 170 //               + Q1_1*c
 171 //
 172 //
 173 // Case normal_r: 2^(-2) <= |r| <= pi/4
 174 // ------------------------------------
 175 //
 176 // This case is more likely than the previous one if one considers
 177 // r to be uniformly distributed in [-pi/4 pi/4].
 178 //
 179 // The required calculation is either
 180 //
 181 //      tan(r + c)  =  tan(r)  +  correction,  or
 182 //        -cot(r + c)  = -cot(r)  +  correction.
 183 //
 184 // Specifically,
 185 //
 186 //      tan(r + c) =  tan(r) + c tan'(r)  + O(c^2)
 187 //              =  tan(r) + c sec^2(r) + O(c^2)
 188 //              =  tan(r) + c SEC_sq     ...accurately
 189 //                as long as SEC_sq approximates sec^2(r)
 190 //                to, say, 5 bits or so.
 191 //
 192 // Similarly,
 193 //
 194 //        -cot(r + c) = -cot(r) - c cot'(r)  + O(c^2)
 195 //              = -cot(r) + c csc^2(r) + O(c^2)
 196 //              = -cot(r) + c CSC_sq     ...accurately
 197 //                as long as CSC_sq approximates csc^2(r)
 198 //                to, say, 5 bits or so.
 199 //
 200 // We therefore concentrate on accurately calculating tan(r) and
 201 // cot(r) for a working-precision number r, |r| <= pi/4 to within
 202 // 0.1% or so.
 203 //
 204 // We will employ a table-driven approach. Let
 205 //
 206 //      r = sgn_r * 2^k * 1.b_1 b_2 ... b_5 ... b_63
 207 //        = sgn_r * ( B + x )
 208 //
 209 // where
 210 //
 211 //      B = 2^k * 1.b_1 b_2 ... b_5 1
 212 //         x = |r| - B
 213 //
 214 // Now,
 215 //                   tan(B)  +   tan(x)
 216 //      tan( B + x ) =  ------------------------
 217 //                   1 -  tan(B)*tan(x)
 218 //
 219 //               /                         \
 220 //               |   tan(B)  +   tan(x)          |
 221
 222 //      = tan(B) +  | ------------------------ - tan(B) |
 223 //               |     1 -  tan(B)*tan(x)          |
 224 //               \                         /
 225 //
 226 //                 sec^2(B) * tan(x)
 227 //      = tan(B) + ------------------------
 228 //                 1 -  tan(B)*tan(x)
 229 //
 230 //                (1/[sin(B)*cos(B)]) * tan(x)
 231 //      = tan(B) + --------------------------------
 232 //                      cot(B)  -  tan(x)
 233 //
 234 //
 235 // Clearly, the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
 236 // calculated beforehand and stored in a table. Since
 237 //
 238 //      |x| <= 2^k * 2^(-6)  <= 2^(-7)  (because k = -1, -2)
 239 //
 240 // a very short polynomial will be sufficient to approximate tan(x)
 241 // accurately. The details involved in computing the last expression
 242 // will be given in the next section on algorithm description.
 243 //
 244 //
 245 // Now, we turn to the case where cot( B + x ) is needed.
 246 //
 247 //
 248 //                   1 - tan(B)*tan(x)
 249 //      cot( B + x ) =  ------------------------
 250 //                   tan(B)  +  tan(x)
 251 //
 252 //               /                           \
 253 //               |   1 - tan(B)*tan(x)              |
 254
 255 //      = cot(B) +  | ----------------------- - cot(B) |
 256 //               |     tan(B)  +  tan(x)            |
 257 //               \                           /
 258 //
 259 //               [tan(B) + cot(B)] * tan(x)
 260 //      = cot(B) - ----------------------------
 261 //                   tan(B)  +  tan(x)
 262 //
 263 //                (1/[sin(B)*cos(B)]) * tan(x)
 264 //      = cot(B) - --------------------------------
 265 //                      tan(B)  +  tan(x)
 266 //
 267 //
 268 // Note that the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) that
 269 // are needed are the same set of values needed in the previous
 270 // case.
 271 //
 272 // Finally, we can put all the ingredients together as follows:
 273 //
 274 //      Arg = N * pi/2 +  r + c          ...accurately
 275 //
 276 //      tan(Arg) =  tan(r) + correction    if N is even;
 277 //            = -cot(r) + correction    otherwise.
 278 //
 279 // For Cases 2 and 4,
 280 //
 281 //     Case 2:
 282 //     tan(Arg) =  tan(r + c) = r + c + r^3/3           N even
 283 //              = -cot(r + c) = -1/(r+c) + r/3           N odd
 284 //     Case 4:
 285 //     tan(Arg) =  tan(r + c) = r + c + r^3/3 + 2r^5/15  N even
 286 //              = -cot(r + c) = -1/(r+c) + r/3 + r^3/45  N odd
 287 //
 288 //
 289 // For Cases 1 and 3,
 290 //
 291 //     Case small_r: |r| < 2^(-2)
 292 //
 293 //      tan(Arg) =  r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 294 //                     + c*(1 + r^2)               N even
 295 //
 296 //                  = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 297 //               + Q1_1*c                    N odd
 298 //
 299 //     Case normal_r: 2^(-2) <= |r| <= pi/4
 300 //
 301 //      tan(Arg) =  tan(r) + c * sec^2(r)     N even
 302 //               = -cot(r) + c * csc^2(r)     otherwise
 303 //
 304 //     For N even,
 305 //
 306 //      tan(Arg) = tan(r) + c*sec^2(r)
 307 //               = tan( sgn_r * (B+x) ) + c * sec^2(|r|)
 308 //                  = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(|r|) )
 309 //                  = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(B) )
 310 //
 311 // since B approximates |r| to 2^(-6) in relative accuracy.
 312 //
 313 //                 /            (1/[sin(B)*cos(B)]) * tan(x)
 314 //    tan(Arg) = sgn_r * | tan(B) + --------------------------------
 315 //                 \                     cot(B)  -  tan(x)
 316 //                                        \
 317 //                       + CORR  |
 318
 319 //                                     /
 320 // where
 321 //
 322 //    CORR = sgn_r*c*tan(B)*SC_inv(B);  SC_inv(B) = 1/(sin(B)*cos(B)).
 323 //
 324 // For N odd,
 325 //
 326 //      tan(Arg) = -cot(r) + c*csc^2(r)
 327 //               = -cot( sgn_r * (B+x) ) + c * csc^2(|r|)
 328 //                  = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(|r|) )
 329 //                  = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(B) )
 330 //
 331 // since B approximates |r| to 2^(-6) in relative accuracy.
 332 //
 333 //                 /            (1/[sin(B)*cos(B)]) * tan(x)
 334 //    tan(Arg) = sgn_r * | -cot(B) + --------------------------------
 335 //                 \                     tan(B)  +  tan(x)
 336 //                                        \
 337 //                       + CORR  |
 338
 339 //                                     /
 340 // where
 341 //
 342 //    CORR = sgn_r*c*cot(B)*SC_inv(B);  SC_inv(B) = 1/(sin(B)*cos(B)).
 343 //
 344 //
 345 // The actual algorithm prescribes how all the mathematical formulas
 346 // are calculated.
 347 //
 348 //
 349 // 2. Algorithmic Description
 350 // ==========================
 351 //
 352 // 2.1 Computation for Cases 2 and 4.
 353 // ----------------------------------
 354 //
 355 // For Case 2, we use two-term polynomials.
 356 //
 357 //    For N even,
 358 //
 359 //    rsq := r * r
 360 //    Result := c + r * rsq * P1_1
 361 //    Result := r + Result          ...in user-defined rounding
 362 //
 363 //    For N odd,
 364 //    S_hi  := -frcpa(r)               ...8 bits
 365 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
 366 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
 367 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
 368 //    S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
 369 //    ...S_hi + S_lo is -1/(r+c) to extra precision
 370 //    S_lo  := S_lo + Q1_1*r
 371 //
 372 //    Result := S_hi + S_lo     ...in user-defined rounding
 373 //
 374 // For Case 4, we use three-term polynomials
 375 //
 376 //    For N even,
 377 //
 378 //    rsq := r * r
 379 //    Result := c + r * rsq * (P1_1 + rsq * P1_2)
 380 //    Result := r + Result          ...in user-defined rounding
 381 //
 382 //    For N odd,
 383 //    S_hi  := -frcpa(r)               ...8 bits
 384 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
 385 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
 386 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
 387 //    S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
 388 //    ...S_hi + S_lo is -1/(r+c) to extra precision
 389 //    rsq   := r * r
 390 //    P      := Q1_1 + rsq*Q1_2
 391 //    S_lo  := S_lo + r*P
 392 //
 393 //    Result := S_hi + S_lo     ...in user-defined rounding
 394 //
 395 //
 396 // Note that the coefficients P1_1, P1_2, Q1_1, and Q1_2 are
 397 // the same as those used in the small_r case of Cases 1 and 3
 398 // below.
 399 //
 400 //
 401 // 2.2 Computation for Cases 1 and 3.
 402 // ----------------------------------
 403 // This is further divided into the case of small_r,
 404 // where |r| < 2^(-2), and the case of normal_r, where |r| lies between
 405 // 2^(-2) and pi/4.
 406 //
 407 // Algorithm for the case of small_r
 408 // ---------------------------------
 409 //
 410 // For N even,
 411 //      rsq   := r * r
 412 //      Poly1 := rsq*(P1_1 + rsq*(P1_2 + rsq*P1_3))
 413 //      r_to_the_8    := rsq * rsq
 414 //      r_to_the_8    := r_to_the_8 * r_to_the_8
 415 //      Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... rsq*P1_9))
 416 //      CORR  := c * ( 1 + rsq )
 417 //      Poly  := Poly1 + r_to_the_8*Poly2
 418 //      Result := r*Poly + CORR
 419 //      Result := r + Result     ...in user-defined rounding
 420 //      ...note that Poly1 and r_to_the_8 can be computed in parallel
 421 //      ...with Poly2 (Poly1 is intentionally set to be much
 422 //      ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden)
 423 //
 424 // For N odd,
 425 //      S_hi  := -frcpa(r)               ...8 bits
 426 //      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
 427 //      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
 428 //      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
 429 //      S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
 430 //      ...S_hi + S_lo is -1/(r+c) to extra precision
 431 //      S_lo  := S_lo + Q1_1*c
 432 //
 433 //      ...S_hi and S_lo are computed in parallel with
 434 //      ...the following
 435 //      rsq := r*r
 436 //      P   := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7))
 437 //
 438 //      Result :=  r*P + S_lo
 439 //      Result :=  S_hi  +  Result      ...in user-defined rounding
 440 //
 441 //
 442 // Algorithm for the case of normal_r
 443 // ----------------------------------
 444 //
 445 // Here, we first consider the computation of tan( r + c ). As
 446 // presented in the previous section,
 447 //
 448 //      tan( r + c )  =  tan(r) + c * sec^2(r)
 449 //                 =  sgn_r * [ tan(B+x) + CORR ]
 450 //      CORR = sgn_r * c * tan(B) * 1/[sin(B)*cos(B)]
 451 //
 452 // because sec^2(r) = sec^(|r|), and B approximate |r| to 6.5 bits.
 453 //
 454 //      tan( r + c ) =
 455 //           /           (1/[sin(B)*cos(B)]) * tan(x)
 456 //      sgn_r * | tan(B) + --------------------------------  +
 457 //           \                     cot(B)  -  tan(x)
 458 //                                \
 459 //                          CORR  |
 460
 461 //                                /
 462 //
 463 // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
 464 // calculated beforehand and stored in a table. Specifically,
 465 // the table values are
 466 //
 467 //      tan(B)                as  T_hi  +  T_lo;
 468 //      cot(B)             as  C_hi  +  C_lo;
 469 //      1/[sin(B)*cos(B)]  as  SC_inv
 470 //
 471 // T_hi, C_hi are in  double-precision  memory format;
 472 // T_lo, C_lo are in  single-precision  memory format;
 473 // SC_inv     is  in extended-precision memory format.
 474 //
 475 // The value of tan(x) will be approximated by a short polynomial of
 476 // the form
 477 //
 478 //      tan(x)  as  x  +  x * P, where
 479 //           P  =   x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
 480 //
 481 // Because |x| <= 2^(-7), cot(B) - x approximates cot(B) - tan(x)
 482 // to a relative accuracy better than 2^(-20). Thus, a good
 483 // initial guess of 1/( cot(B) - tan(x) ) to initiate the iterative
 484 // division is:
 485 //
 486 //      1/(cot(B) - tan(x))      is approximately
 487 //      1/(cot(B) -   x)         is
 488 //      tan(B)/(1 - x*tan(B))    is approximately
 489 //      T_hi / ( 1 - T_hi * x )  is approximately
 490 //
 491 //      T_hi * [ 1 + (Thi * x) + (T_hi * x)^2 ]
 492 //
 493 // The calculation of tan(r+c) therefore proceed as follows:
 494 //
 495 //      Tx     := T_hi * x
 496 //      xsq     := x * x
 497 //
 498 //      V_hi     := T_hi*(1 + Tx*(1 + Tx))
 499 //      P     := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
 500 //      ...V_hi serves as an initial guess of 1/(cot(B) - tan(x))
 501 //         ...good to about 20 bits of accuracy
 502 //
 503 //      tanx     := x + x*P
 504 //      D     := C_hi - tanx
 505 //      ...D is a double precision denominator: cot(B) - tan(x)
 506 //
 507 //      V_hi     := V_hi + V_hi*(1 - V_hi*D)
 508 //      ....V_hi approximates 1/(cot(B)-tan(x)) to 40 bits
 509 //
 510 //      V_lo     := V_hi * ( [ (1 - V_hi*C_hi) + V_hi*tanx ]
 511 //                           - V_hi*C_lo )   ...observe all order
 512 //         ...V_hi + V_lo approximates 1/(cot(B) - tan(x))
 513 //      ...to extra accuracy
 514 //
 515 //      ...               SC_inv(B) * (x + x*P)
 516 //      ...   tan(B) +      ------------------------- + CORR
 517 //         ...                cot(B) - (x + x*P)
 518 //      ...
 519 //      ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 520 //      ...
 521 //
 522 //      Sx     := SC_inv * x
 523 //      CORR     := sgn_r * c * SC_inv * T_hi
 524 //
 525 //      ...put the ingredients together to compute
 526 //      ...               SC_inv(B) * (x + x*P)
 527 //      ...   tan(B) +      ------------------------- + CORR
 528 //         ...                cot(B) - (x + x*P)
 529 //      ...
 530 //      ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 531 //      ...
 532 //      ... = T_hi + T_lo + CORR +
 533 //      ...    Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
 534 //
 535 //      CORR := CORR + T_lo
 536 //      tail := V_lo + P*(V_hi + V_lo)
 537 //         tail := Sx * tail  +  CORR
 538 //      tail := Sx * V_hi  +  tail
 539 //         T_hi := sgn_r * T_hi
 540 //
 541 //         ...T_hi + sgn_r*tail  now approximate
 542 //      ...sgn_r*(tan(B+x) + CORR) accurately
 543 //
 544 //      Result :=  T_hi + sgn_r*tail  ...in user-defined
 545 //                           ...rounding control
 546 //      ...It is crucial that independent paths be fully
 547 //      ...exploited for performance's sake.
 548 //
 549 //
 550 // Next, we consider the computation of -cot( r + c ). As
 551 // presented in the previous section,
 552 //
 553 //        -cot( r + c )  =  -cot(r) + c * csc^2(r)
 554 //                 =  sgn_r * [ -cot(B+x) + CORR ]
 555 //      CORR = sgn_r * c * cot(B) * 1/[sin(B)*cos(B)]
 556 //
 557 // because csc^2(r) = csc^(|r|), and B approximate |r| to 6.5 bits.
 558 //
 559 //        -cot( r + c ) =
 560 //           /             (1/[sin(B)*cos(B)]) * tan(x)
 561 //      sgn_r * | -cot(B) + --------------------------------  +
 562 //           \                     tan(B)  +  tan(x)
 563 //                                \
 564 //                          CORR  |
 565
 566 //                                /
 567 //
 568 // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
 569 // calculated beforehand and stored in a table. Specifically,
 570 // the table values are
 571 //
 572 //      tan(B)                as  T_hi  +  T_lo;
 573 //      cot(B)             as  C_hi  +  C_lo;
 574 //      1/[sin(B)*cos(B)]  as  SC_inv
 575 //
 576 // T_hi, C_hi are in  double-precision  memory format;
 577 // T_lo, C_lo are in  single-precision  memory format;
 578 // SC_inv     is  in extended-precision memory format.
 579 //
 580 // The value of tan(x) will be approximated by a short polynomial of
 581 // the form
 582 //
 583 //      tan(x)  as  x  +  x * P, where
 584 //           P  =   x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
 585 //
 586 // Because |x| <= 2^(-7), tan(B) + x approximates tan(B) + tan(x)
 587 // to a relative accuracy better than 2^(-18). Thus, a good
 588 // initial guess of 1/( tan(B) + tan(x) ) to initiate the iterative
 589 // division is:
 590 //
 591 //      1/(tan(B) + tan(x))      is approximately
 592 //      1/(tan(B) +   x)         is
 593 //      cot(B)/(1 + x*cot(B))    is approximately
 594 //      C_hi / ( 1 + C_hi * x )  is approximately
 595 //
 596 //      C_hi * [ 1 - (C_hi * x) + (C_hi * x)^2 ]
 597 //
 598 // The calculation of -cot(r+c) therefore proceed as follows:
 599 //
 600 //      Cx     := C_hi * x
 601 //      xsq     := x * x
 602 //
 603 //      V_hi     := C_hi*(1 - Cx*(1 - Cx))
 604 //      P     := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
 605 //      ...V_hi serves as an initial guess of 1/(tan(B) + tan(x))
 606 //         ...good to about 18 bits of accuracy
 607 //
 608 //      tanx     := x + x*P
 609 //      D     := T_hi + tanx
 610 //      ...D is a double precision denominator: tan(B) + tan(x)
 611 //
 612 //      V_hi     := V_hi + V_hi*(1 - V_hi*D)
 613 //      ....V_hi approximates 1/(tan(B)+tan(x)) to 40 bits
 614 //
 615 //      V_lo     := V_hi * ( [ (1 - V_hi*T_hi) - V_hi*tanx ]
 616 //                           - V_hi*T_lo )   ...observe all order
 617 //         ...V_hi + V_lo approximates 1/(tan(B) + tan(x))
 618 //      ...to extra accuracy
 619 //
 620 //      ...               SC_inv(B) * (x + x*P)
 621 //      ...  -cot(B) +      ------------------------- + CORR
 622 //         ...                tan(B) + (x + x*P)
 623 //      ...
 624 //      ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 625 //      ...
 626 //
 627 //      Sx     := SC_inv * x
 628 //      CORR     := sgn_r * c * SC_inv * C_hi
 629 //
 630 //      ...put the ingredients together to compute
 631 //      ...               SC_inv(B) * (x + x*P)
 632 //      ...  -cot(B) +      ------------------------- + CORR
 633 //         ...                tan(B) + (x + x*P)
 634 //      ...
 635 //      ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 636 //      ...
 637 //      ... =-C_hi - C_lo + CORR +
 638 //      ...    Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
 639 //
 640 //      CORR := CORR - C_lo
 641 //      tail := V_lo + P*(V_hi + V_lo)
 642 //         tail := Sx * tail  +  CORR
 643 //      tail := Sx * V_hi  +  tail
 644 //         C_hi := -sgn_r * C_hi
 645 //
 646 //         ...C_hi + sgn_r*tail now approximates
 647 //      ...sgn_r*(-cot(B+x) + CORR) accurately
 648 //
 649 //      Result :=  C_hi + sgn_r*tail   in user-defined rounding control
 650 //      ...It is crucial that independent paths be fully
 651 //      ...exploited for performance's sake.
 652 //
 653 // 3. Implementation Notes
 654 // =======================
 655 //
 656 //   Table entries T_hi, T_lo; C_hi, C_lo; SC_inv
 657 //
 658 //   Recall that 2^(-2) <= |r| <= pi/4;
 659 //
 660 //      r = sgn_r * 2^k * 1.b_1 b_2 ... b_63
 661 //
 662 //   and
 663 //
 664 //        B = 2^k * 1.b_1 b_2 b_3 b_4 b_5 1
 665 //
 666 //   Thus, for k = -2, possible values of B are
 667 //
 668 //          B = 2^(-2) * ( 1 + index/32  +  1/64 ),
 669 //      index ranges from 0 to 31
 670 //
 671 //   For k = -1, however, since |r| <= pi/4 = 0.78...
 672 //   possible values of B are
 673 //
 674 //        B = 2^(-1) * ( 1 + index/32  +  1/64 )
 675 //      index ranges from 0 to 19.
 676 //
 677 //
 678
 679 #include "libm_support.h"
 680
 681 #ifdef _LIBC
 682 .rodata
 683 #else
 684 .data
 685 #endif
 686
 687 .align 128
 688
 689 TAN_BASE_CONSTANTS:
 690 .type TAN_BASE_CONSTANTS, @object
 691 data4    0x4B800000, 0xCB800000, 0x38800000, 0xB8800000 // two**24, -two**24
 692                                                         // two**-14, -two**-14
 693 data4    0x4E44152A, 0xA2F9836E, 0x00003FFE, 0x00000000 // two_by_pi
 694 data4    0xCE81B9F1, 0xC84D32B0, 0x00004016, 0x00000000 // P_0
 695 data4    0x2168C235, 0xC90FDAA2, 0x00003FFF, 0x00000000 // P_1
 696 data4    0xFC8F8CBB, 0xECE675D1, 0x0000BFBD, 0x00000000 // P_2
 697 data4    0xACC19C60, 0xB7ED8FBB, 0x0000BF7C, 0x00000000 // P_3
 698 data4    0x5F000000, 0xDF000000, 0x00000000, 0x00000000 // two_to_63, -two_to_63
 699 data4    0x6EC6B45A, 0xA397E504, 0x00003FE7, 0x00000000 // Inv_P_0
 700 data4    0xDBD171A1, 0x8D848E89, 0x0000BFBF, 0x00000000 // d_1
 701 data4    0x18A66F8E, 0xD5394C36, 0x0000BF7C, 0x00000000 // d_2
 702 data4    0x2168C234, 0xC90FDAA2, 0x00003FFE, 0x00000000 // PI_BY_4
 703 data4    0x2168C234, 0xC90FDAA2, 0x0000BFFE, 0x00000000 // MPI_BY_4
 704 data4    0x3E800000, 0xBE800000, 0x00000000, 0x00000000 // two**-2, -two**-2
 705 data4    0x2F000000, 0xAF000000, 0x00000000, 0x00000000 // two**-33, -two**-33
 706 data4    0xAAAAAABD, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P1_1
 707 data4    0x88882E6A, 0x88888888, 0x00003FFC, 0x00000000 // P1_2
 708 data4    0x0F0177B6, 0xDD0DD0DD, 0x00003FFA, 0x00000000 // P1_3
 709 data4    0x646B8C6D, 0xB327A440, 0x00003FF9, 0x00000000 // P1_4
 710 data4    0x1D5F7D20, 0x91371B25, 0x00003FF8, 0x00000000 // P1_5
 711 data4    0x61C67914, 0xEB69A5F1, 0x00003FF6, 0x00000000 // P1_6
 712 data4    0x019318D2, 0xBEDD37BE, 0x00003FF5, 0x00000000 // P1_7
 713 data4    0x3C794015, 0x9979B146, 0x00003FF4, 0x00000000 // P1_8
 714 data4    0x8C6EB58A, 0x8EBD21A3, 0x00003FF3, 0x00000000 // P1_9
 715 data4    0xAAAAAAB4, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // Q1_1
 716 data4    0x0B5FC93E, 0xB60B60B6, 0x00003FF9, 0x00000000 // Q1_2
 717 data4    0x0C9BBFBF, 0x8AB355E0, 0x00003FF6, 0x00000000 // Q1_3
 718 data4    0xCBEE3D4C, 0xDDEBBC89, 0x00003FF2, 0x00000000 // Q1_4
 719 data4    0x5F80BBB6, 0xB3548A68, 0x00003FEF, 0x00000000 // Q1_5
 720 data4    0x4CED5BF1, 0x91362560, 0x00003FEC, 0x00000000 // Q1_6
 721 data4    0x8EE92A83, 0xF189D95A, 0x00003FE8, 0x00000000 // Q1_7
 722 data4    0xAAAB362F, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P2_1
 723 data4    0xE97A6097, 0x88888886, 0x00003FFC, 0x00000000 // P2_2
 724 data4    0x25E716A1, 0xDD108EE0, 0x00003FFA, 0x00000000 // P2_3
 725 //
 726 //  Entries T_hi   double-precision memory format
 727 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 728 //  Entries T_lo  single-precision memory format
 729 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 730 //
 731 data4    0x62400794, 0x3FD09BC3, 0x23A05C32, 0x00000000
 732 data4    0xDFFBC074, 0x3FD124A9, 0x240078B2, 0x00000000
 733 data4    0x5BD4920F, 0x3FD1AE23, 0x23826B8E, 0x00000000
 734 data4    0x15E2701D, 0x3FD23835, 0x22D31154, 0x00000000
 735 data4    0x63739C2D, 0x3FD2C2E4, 0x2265C9E2, 0x00000000
 736 data4    0xAFEEA48B, 0x3FD34E36, 0x245C05EB, 0x00000000
 737 data4    0x7DBB35D1, 0x3FD3DA31, 0x24749F2D, 0x00000000
 738 data4    0x67321619, 0x3FD466DA, 0x2462CECE, 0x00000000
 739 data4    0x1F94A4D5, 0x3FD4F437, 0x246D0DF1, 0x00000000
 740 data4    0x740C3E6D, 0x3FD5824D, 0x240A85B5, 0x00000000
 741 data4    0x4CB1E73D, 0x3FD61123, 0x23F96E33, 0x00000000
 742 data4    0xAD9EA64B, 0x3FD6A0BE, 0x247C5393, 0x00000000
 743 data4    0xB804FD01, 0x3FD73125, 0x241F3B29, 0x00000000
 744 data4    0xAB53EE83, 0x3FD7C25E, 0x2479989B, 0x00000000
 745 data4    0xE6640EED, 0x3FD8546F, 0x23B343BC, 0x00000000
 746 data4    0xE8AF1892, 0x3FD8E75F, 0x241454D1, 0x00000000
 747 data4    0x53928BDA, 0x3FD97B35, 0x238613D9, 0x00000000
 748 data4    0xEB9DE4DE, 0x3FDA0FF6, 0x22859FA7, 0x00000000
 749 data4    0x99ECF92D, 0x3FDAA5AB, 0x237A6D06, 0x00000000
 750 data4    0x6D8F1796, 0x3FDB3C5A, 0x23952F6C, 0x00000000
 751 data4    0x9CFB8BE4, 0x3FDBD40A, 0x2280FC95, 0x00000000
 752 data4    0x87943100, 0x3FDC6CC3, 0x245D2EC0, 0x00000000
 753 data4    0xB736C500, 0x3FDD068C, 0x23C4AD7D, 0x00000000
 754 data4    0xE1DDBC31, 0x3FDDA16D, 0x23D076E6, 0x00000000
 755 data4    0xEB515A93, 0x3FDE3D6E, 0x244809A6, 0x00000000
 756 data4    0xE6E9E5F1, 0x3FDEDA97, 0x220856C8, 0x00000000
 757 data4    0x1963CE69, 0x3FDF78F1, 0x244BE993, 0x00000000
 758 data4    0x7D635BCE, 0x3FE00C41, 0x23D21799, 0x00000000
 759 data4    0x1C302CD3, 0x3FE05CAB, 0x248A1B1D, 0x00000000
 760 data4    0xDB6A1FA0, 0x3FE0ADB9, 0x23D53E33, 0x00000000
 761 data4    0x4A20BA81, 0x3FE0FF72, 0x24DB9ED5, 0x00000000
 762 data4    0x153FA6F5, 0x3FE151D9, 0x24E9E451, 0x00000000
 763 //
 764 //  Entries T_hi   double-precision memory format
 765 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 766 //  Entries T_lo  single-precision memory format
 767 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 768 //
 769 data4    0xBA1BE39E, 0x3FE1CEC4, 0x24B60F9E, 0x00000000
 770 data4    0x5ABD9B2D, 0x3FE277E4, 0x248C2474, 0x00000000
 771 data4    0x0272B110, 0x3FE32418, 0x247B8311, 0x00000000
 772 data4    0x890E2DF0, 0x3FE3D38B, 0x24C55751, 0x00000000
 773 data4    0x46236871, 0x3FE4866D, 0x24E5BC34, 0x00000000
 774 data4    0x45E044B0, 0x3FE53CEE, 0x24001BA4, 0x00000000
 775 data4    0x82EC06E4, 0x3FE5F742, 0x24B973DC, 0x00000000
 776 data4    0x25DF43F9, 0x3FE6B5A1, 0x24895440, 0x00000000
 777 data4    0xCAFD348C, 0x3FE77844, 0x240021CA, 0x00000000
 778 data4    0xCEED6B92, 0x3FE83F6B, 0x24C45372, 0x00000000
 779 data4    0xA34F3665, 0x3FE90B58, 0x240DAD33, 0x00000000
 780 data4    0x2C1E56B4, 0x3FE9DC52, 0x24F846CE, 0x00000000
 781 data4    0x27041578, 0x3FEAB2A4, 0x2323FB6E, 0x00000000
 782 data4    0x9DD8C373, 0x3FEB8E9F, 0x24B3090B, 0x00000000
 783 data4    0x65C9AA7B, 0x3FEC709B, 0x2449F611, 0x00000000
 784 data4    0xACCF8435, 0x3FED58F4, 0x23616A7E, 0x00000000
 785 data4    0x97635082, 0x3FEE480F, 0x24C2FEAE, 0x00000000
 786 data4    0xF0ACC544, 0x3FEF3E57, 0x242CE964, 0x00000000
 787 data4    0xF7E06E4B, 0x3FF01E20, 0x2480D3EE, 0x00000000
 788 data4    0x8A798A69, 0x3FF0A125, 0x24DB8967, 0x00000000
 789 //
 790 //  Entries C_hi   double-precision memory format
 791 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 792 //  Entries C_lo  single-precision memory format
 793 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 794 //
 795 data4    0xE63EFBD0, 0x400ED3E2, 0x259D94D4, 0x00000000
 796 data4    0xC515DAB5, 0x400DDDB4, 0x245F0537, 0x00000000
 797 data4    0xBE19A79F, 0x400CF57A, 0x25D4EA9F, 0x00000000
 798 data4    0xD15298ED, 0x400C1A06, 0x24AE40A0, 0x00000000
 799 data4    0x164B2708, 0x400B4A4C, 0x25A5AAB6, 0x00000000
 800 data4    0x5285B068, 0x400A855A, 0x25524F18, 0x00000000
 801 data4    0x3FFA549F, 0x4009CA5A, 0x24C999C0, 0x00000000
 802 data4    0x646AF623, 0x4009188A, 0x254FD801, 0x00000000
 803 data4    0x6084D0E7, 0x40086F3C, 0x2560F5FD, 0x00000000
 804 data4    0xA29A76EE, 0x4007CDD2, 0x255B9D19, 0x00000000
 805 data4    0x6C8ECA95, 0x400733BE, 0x25CB021B, 0x00000000
 806 data4    0x1F8DDC52, 0x4006A07E, 0x24AB4722, 0x00000000
 807 data4    0xC298AD58, 0x4006139B, 0x252764E2, 0x00000000
 808 data4    0xBAD7164B, 0x40058CAB, 0x24DAF5DB, 0x00000000
 809 data4    0xAE31A5D3, 0x40050B4B, 0x25EA20F4, 0x00000000
 810 data4    0x89F85A8A, 0x40048F21, 0x2583A3E8, 0x00000000
 811 data4    0xA862380D, 0x400417DA, 0x25DCC4CC, 0x00000000
 812 data4    0x1088FCFE, 0x4003A52B, 0x2430A492, 0x00000000
 813 data4    0xCD3527D5, 0x400336CC, 0x255F77CF, 0x00000000
 814 data4    0x5760766D, 0x4002CC7F, 0x25DA0BDA, 0x00000000
 815 data4    0x11CE02E3, 0x40026607, 0x256FF4A2, 0x00000000
 816 data4    0xD37BBE04, 0x4002032C, 0x25208AED, 0x00000000
 817 data4    0x7F050775, 0x4001A3BD, 0x24B72DD6, 0x00000000
 818 data4    0xA554848A, 0x40014789, 0x24AB4DAA, 0x00000000
 819 data4    0x323E81B7, 0x4000EE65, 0x2584C440, 0x00000000
 820 data4    0x21CF1293, 0x40009827, 0x25C9428D, 0x00000000
 821 data4    0x3D415EEB, 0x400044A9, 0x25DC8482, 0x00000000
 822 data4    0xBD72C577, 0x3FFFE78F, 0x257F5070, 0x00000000
 823 data4    0x75EFD28E, 0x3FFF4AC3, 0x23EBBF7A, 0x00000000
 824 data4    0x60B52DDE, 0x3FFEB2AF, 0x22EECA07, 0x00000000
 825 data4    0x35204180, 0x3FFE1F19, 0x24191079, 0x00000000
 826 data4    0x54F7E60A, 0x3FFD8FCA, 0x248D3058, 0x00000000
 827 //
 828 //  Entries C_hi   double-precision memory format
 829 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 830 //  Entries C_lo  single-precision memory format
 831 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 832 //
 833 data4    0x79F6FADE, 0x3FFCC06A, 0x239C7886, 0x00000000
 834 data4    0x891662A6, 0x3FFBB91F, 0x250BD191, 0x00000000
 835 data4    0x529F155D, 0x3FFABFB6, 0x256CC3E6, 0x00000000
 836 data4    0x2E964AE9, 0x3FF9D300, 0x250843E3, 0x00000000
 837 data4    0x89DCB383, 0x3FF8F1EF, 0x2277C87E, 0x00000000
 838 data4    0x7C87DBD6, 0x3FF81B93, 0x256DA6CF, 0x00000000
 839 data4    0x1042EDE4, 0x3FF74F14, 0x2573D28A, 0x00000000
 840 data4    0x1784B360, 0x3FF68BAF, 0x242E489A, 0x00000000
 841 data4    0x7C923C4C, 0x3FF5D0B5, 0x2532D940, 0x00000000
 842 data4    0xF418EF20, 0x3FF51D88, 0x253C7DD6, 0x00000000
 843 data4    0x02F88DAE, 0x3FF4719A, 0x23DB59BF, 0x00000000
 844 data4    0x49DA0788, 0x3FF3CC66, 0x252B4756, 0x00000000
 845 data4    0x0B980DB8, 0x3FF32D77, 0x23FE585F, 0x00000000
 846 data4    0xE56C987A, 0x3FF2945F, 0x25378A63, 0x00000000
 847 data4    0xB16523F6, 0x3FF200BD, 0x247BB2E0, 0x00000000
 848 data4    0x8CE27778, 0x3FF17235, 0x24446538, 0x00000000
 849 data4    0xFDEFE692, 0x3FF0E873, 0x2514638F, 0x00000000
 850 data4    0x33154062, 0x3FF0632C, 0x24A7FC27, 0x00000000
 851 data4    0xB3EF115F, 0x3FEFC42E, 0x248FD0FE, 0x00000000
 852 data4    0x135D26F6, 0x3FEEC9E8, 0x2385C719, 0x00000000
 853 //
 854 //  Entries SC_inv in Swapped IEEE format (extended)
 855 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 856 //
 857 data4    0x1BF30C9E, 0x839D6D4A, 0x00004001, 0x00000000
 858 data4    0x554B0EB0, 0x80092804, 0x00004001, 0x00000000
 859 data4    0xA1CF0DE9, 0xF959F94C, 0x00004000, 0x00000000
 860 data4    0x77378677, 0xF3086BA0, 0x00004000, 0x00000000
 861 data4    0xCCD4723C, 0xED154515, 0x00004000, 0x00000000
 862 data4    0x1C27CF25, 0xE7790944, 0x00004000, 0x00000000
 863 data4    0x8DDACB88, 0xE22D037D, 0x00004000, 0x00000000
 864 data4    0x89C73522, 0xDD2B2D8A, 0x00004000, 0x00000000
 865 data4    0xBB2C1171, 0xD86E1A23, 0x00004000, 0x00000000
 866 data4    0xDFF5E0F9, 0xD3F0E288, 0x00004000, 0x00000000
 867 data4    0x283BEBD5, 0xCFAF16B1, 0x00004000, 0x00000000
 868 data4    0x0D88DD53, 0xCBA4AFAA, 0x00004000, 0x00000000
 869 data4    0xCA67C43D, 0xC7CE03CC, 0x00004000, 0x00000000
 870 data4    0x0CA0DDB0, 0xC427BC82, 0x00004000, 0x00000000
 871 data4    0xF13D8CAB, 0xC0AECD57, 0x00004000, 0x00000000
 872 data4    0x71ECE6B1, 0xBD606C38, 0x00004000, 0x00000000
 873 data4    0xA44C4929, 0xBA3A0A96, 0x00004000, 0x00000000
 874 data4    0xE5CCCEC1, 0xB7394F6F, 0x00004000, 0x00000000
 875 data4    0x9637D8BC, 0xB45C1203, 0x00004000, 0x00000000
 876 data4    0x92CB051B, 0xB1A05528, 0x00004000, 0x00000000
 877 data4    0x6BA2FFD0, 0xAF04432B, 0x00004000, 0x00000000
 878 data4    0x7221235F, 0xAC862A23, 0x00004000, 0x00000000
 879 data4    0x5F00A9D1, 0xAA2478AF, 0x00004000, 0x00000000
 880 data4    0x81E082BF, 0xA7DDBB0C, 0x00004000, 0x00000000
 881 data4    0x45684FEE, 0xA5B0987D, 0x00004000, 0x00000000
 882 data4    0x627A8F53, 0xA39BD0F5, 0x00004000, 0x00000000
 883 data4    0x6EC5C8B0, 0xA19E3B03, 0x00004000, 0x00000000
 884 data4    0x91CD7C66, 0x9FB6C1F0, 0x00004000, 0x00000000
 885 data4    0x1FA3DF8A, 0x9DE46410, 0x00004000, 0x00000000
 886 data4    0xA8F6B888, 0x9C263139, 0x00004000, 0x00000000
 887 data4    0xC27B0450, 0x9A7B4968, 0x00004000, 0x00000000
 888 data4    0x5EE614EE, 0x98E2DB7E, 0x00004000, 0x00000000
 889 //
 890 //  Entries SC_inv in Swapped IEEE format (extended)
 891 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 892 //
 893 data4    0x13B2B5BA, 0x969F335C, 0x00004000, 0x00000000
 894 data4    0xD4C0F548, 0x93D446D9, 0x00004000, 0x00000000
 895 data4    0x61B798AF, 0x9147094F, 0x00004000, 0x00000000
 896 data4    0x758787AC, 0x8EF317CC, 0x00004000, 0x00000000
 897 data4    0xB99EEFDB, 0x8CD498B3, 0x00004000, 0x00000000
 898 data4    0xDFF8BC37, 0x8AE82A7D, 0x00004000, 0x00000000
 899 data4    0xE3C55D42, 0x892AD546, 0x00004000, 0x00000000
 900 data4    0xD15573C1, 0x8799FEA9, 0x00004000, 0x00000000
 901 data4    0x435A4B4C, 0x86335F88, 0x00004000, 0x00000000
 902 data4    0x3E93A87B, 0x84F4FB6E, 0x00004000, 0x00000000
 903 data4    0x80A382FB, 0x83DD1952, 0x00004000, 0x00000000
 904 data4    0xA4CB8C9E, 0x82EA3D7F, 0x00004000, 0x00000000
 905 data4    0x6861D0A8, 0x821B247C, 0x00004000, 0x00000000
 906 data4    0x63E8D244, 0x816EBED1, 0x00004000, 0x00000000
 907 data4    0x27E4CFC6, 0x80E42D91, 0x00004000, 0x00000000
 908 data4    0x28E64AFD, 0x807ABF8D, 0x00004000, 0x00000000
 909 data4    0x863B4FD8, 0x8031EF26, 0x00004000, 0x00000000
 910 data4    0xAE8C11FD, 0x800960AD, 0x00004000, 0x00000000
 911 data4    0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000
 912 data4    0xA07791FA, 0x80186650, 0x00004000, 0x00000000
 913
 914 Arg                 = f8
 915 Result              = f8
 916 fp_tmp              = f9
 917 U_2                 = f10
 918 rsq                =  f11
 919 C_hi                = f12
 920 C_lo                = f13
 921 T_hi                = f14
 922 T_lo                = f15
 923
 924 N_0                 = f32
 925 d_1                 = f33
 926 MPI_BY_4            = f34
 927 tail                = f35
 928 tanx                = f36
 929 Cx                  = f37
 930 Sx                  = f38
 931 sgn_r               = f39
 932 CORR                = f40
 933 P                   = f41
 934 D                   = f42
 935 ArgPrime            = f43
 936 P_0                 = f44
 937
 938 P2_1                = f45
 939 P2_2                = f46
 940 P2_3                = f47
 941
 942 P1_1                = f45
 943 P1_2                = f46
 944 P1_3                = f47
 945
 946 P1_4                = f48
 947 P1_5                = f49
 948 P1_6                = f50
 949 P1_7                = f51
 950 P1_8                = f52
 951 P1_9                = f53
 952
 953 TWO_TO_63           = f54
 954 NEGTWO_TO_63        = f55
 955 x                   = f56
 956 xsq                 = f57
 957 Tx                  = f58
 958 Tx1                 = f59
 959 Set                 = f60
 960 poly1               = f61
 961 poly2               = f62
 962 Poly                = f63
 963 Poly1               = f64
 964 Poly2               = f65
 965 r_to_the_8          = f66
 966 B                   = f67
 967 SC_inv              = f68
 968 Pos_r               = f69
 969 N_0_fix             = f70
 970 PI_BY_4             = f71
 971 NEGTWO_TO_NEG2      = f72
 972 TWO_TO_24           = f73
 973 TWO_TO_NEG14        = f74
 974 TWO_TO_NEG33        = f75
 975 NEGTWO_TO_24        = f76
 976 NEGTWO_TO_NEG14     = f76
 977 NEGTWO_TO_NEG33     = f77
 978 two_by_PI           = f78
 979 N                   = f79
 980 N_fix               = f80
 981 P_1                 = f81
 982 P_2                 = f82
 983 P_3                 = f83
 984 s_val               = f84
 985 w                   = f85
 986 c                   = f86
 987 r                   = f87
 988 Z                   = f88
 989 A                   = f89
 990 a                   = f90
 991 t                   = f91
 992 U_1                 = f92
 993 d_2                 = f93
 994 TWO_TO_NEG2         = f94
 995 Q1_1                = f95
 996 Q1_2                = f96
 997 Q1_3                = f97
 998 Q1_4                = f98
 999 Q1_5                = f99
1000 Q1_6                = f100
1001 Q1_7                = f101
1002 Q1_8                = f102
1003 S_hi                = f103
1004 S_lo                = f104
1005 V_hi                = f105
1006 V_lo                = f106
1007 U_hi                = f107
1008 U_lo                = f108
1009 U_hiabs             = f109
1010 V_hiabs             = f110
1011 V                   = f111
1012 Inv_P_0             = f112
1013
1014 GR_SAVE_B0     = r33
1015 GR_SAVE_GP     = r34
1016 GR_SAVE_PFS    = r35
1017
1018 delta1         = r36
1019 table_ptr1     = r37
1020 table_ptr2     = r38
1021 i_0            = r39
1022 i_1            = r40
1023 N_fix_gr       = r41
1024 N_inc          = r42
1025 exp_Arg        = r43
1026 exp_r          = r44
1027 sig_r          = r45
1028 lookup         = r46
1029 table_offset   = r47
1030 Create_B       = r48
1031 gr_tmp         = r49
1032
1033 GR_Parameter_X = r49
1034 GR_Parameter_r = r50
1035
1036
1037
1038 .global __libm_tan
1039 .section .text
1040 .proc __libm_tan
1041
1042
1043 __libm_tan:
1044
1045 { .mfi
1046 alloc r32 = ar.pfs, 0,17,2,0
1047 (p0)   fclass.m.unc  p6,p0 = Arg, 0x1E7
1048       addl gr_tmp = -1,r0
1049 }
1050 ;;
1051
1052 { .mfi
1053        nop.m 999
1054 (p0)   fclass.nm.unc  p7,p0 = Arg, 0x1FF
1055        nop.i 999
1056 }
1057 ;;
1058
1059 { .mfi
1060 (p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
1061        nop.f 999
1062        nop.i 999
1063 }
1064 ;;
1065
1066 { .mmi
1067       ld8 table_ptr1 = [table_ptr1]
1068       setf.sig fp_tmp = gr_tmp   // Make a constant so fmpy produces inexact
1069       nop.i 999
1070 }
1071 ;;
1072
1073 //
1074 //     Check for NatVals, Infs , NaNs, and Zeros
1075 //     Check for everything - if false, then must be pseudo-zero
1076 //     or pseudo-nan.
1077 //     Local table pointer
1078 //
1079
1080 { .mbb
1081 (p0)   add table_ptr2 = 96, table_ptr1
1082 (p6)   br.cond.spnt __libm_TAN_SPECIAL
1083 (p7)   br.cond.spnt __libm_TAN_SPECIAL ;;
1084 }
1085 //
1086 //     Point to Inv_P_0
1087 //     Branch out to deal with unsupporteds and special values.
1088 //
1089
1090 { .mmf
1091 (p0)   ldfs TWO_TO_24 = [table_ptr1],4
1092 (p0)   ldfs TWO_TO_63 = [table_ptr2],4
1093 //
1094 //     Load -2**24, load -2**63.
1095 //
1096 (p0)   fcmp.eq.s0 p0, p6 = Arg, f1 ;;
1097 }
1098
1099 { .mfi
1100 (p0)   ldfs NEGTWO_TO_63 = [table_ptr2],12
1101 (p0)   fnorm.s1     Arg = Arg
1102         nop.i 999
1103 }
1104 //
1105 //     Load 2**24, Load 2**63.
1106 //
1107
1108 { .mmi
1109 (p0)   ldfs NEGTWO_TO_24 = [table_ptr1],12 ;;
1110 //
1111 //     Do fcmp to generate Denormal exception
1112 //     - can't do FNORM (will generate Underflow when U is unmasked!)
1113 //     Normalize input argument.
1114 //
1115 (p0)   ldfe two_by_PI = [table_ptr1],16
1116         nop.i 999
1117 }
1118
1119 { .mmi
1120 (p0)   ldfe Inv_P_0 = [table_ptr2],16 ;;
1121 (p0)   ldfe d_1 = [table_ptr2],16
1122         nop.i 999
1123 }
1124 //
1125 //     Decide about the paths to take:
1126 //     PR_1 and PR_3 set if -2**24 < Arg < 2**24 - CASE 1 OR 2
1127 //     OTHERWISE - CASE 3 OR 4
1128 //     Load inverse of P_0 .
1129 //     Set PR_6 if Arg <= -2**63
1130 //     Are there any Infs, NaNs, or zeros?
1131 //
1132
1133 { .mmi
1134 (p0)   ldfe P_0 = [table_ptr1],16 ;;
1135 (p0)   ldfe d_2 = [table_ptr2],16
1136         nop.i 999
1137 }
1138 //
1139 //     Set PR_8 if Arg <= -2**24
1140 //     Set PR_6 if Arg >=  2**63
1141 //
1142
1143 { .mmi
1144 (p0)   ldfe P_1 = [table_ptr1],16 ;;
1145 (p0)   ldfe PI_BY_4 = [table_ptr2],16
1146         nop.i 999
1147 }
1148 //
1149 //     Set PR_8 if Arg >= 2**24
1150 //
1151
1152 { .mmi
1153 (p0)   ldfe P_2 = [table_ptr1],16 ;;
1154 (p0)   ldfe   MPI_BY_4 = [table_ptr2],16
1155         nop.i 999
1156 }
1157 //
1158 //     Load  P_2 and PI_BY_4
1159 //
1160
1161 { .mfi
1162 (p0)   ldfe   P_3 = [table_ptr1],16
1163         nop.f 999
1164         nop.i 999 ;;
1165 }
1166
1167 { .mfi
1168         nop.m 999
1169 (p0)   fcmp.le.unc.s1 p6,p7 = Arg,NEGTWO_TO_63
1170         nop.i 999
1171 }
1172
1173 { .mfi
1174         nop.m 999
1175 (p0)   fcmp.le.unc.s1 p8,p9 = Arg,NEGTWO_TO_24
1176         nop.i 999 ;;
1177 }
1178
1179 { .mfi
1180         nop.m 999
1181 (p7)   fcmp.ge.s1 p6,p0 = Arg,TWO_TO_63
1182         nop.i 999
1183 }
1184
1185 { .mfi
1186         nop.m 999
1187 (p9)   fcmp.ge.s1 p8,p0 = Arg,TWO_TO_24
1188         nop.i 999 ;;
1189 }
1190
1191 { .mib
1192         nop.m 999
1193         nop.i 999
1194 //
1195 //     Load  P_3 and -PI_BY_4
1196 //
1197 (p6)   br.cond.spnt TAN_ARG_TOO_LARGE ;;
1198 }
1199
1200 { .mib
1201         nop.m 999
1202         nop.i 999
1203 //
1204 //     Load 2**(-2).
1205 //     Load -2**(-2).
1206 //     Branch out if we have a special argument.
1207 //     Branch out if the magnitude of the input argument is too large
1208 //     - do this branch before the next.
1209 //
1210 (p8)   br.cond.spnt TAN_LARGER_ARG ;;
1211 }
1212 //
1213 //     Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
1214 //
1215
1216 { .mfi
1217 (p0)   ldfs TWO_TO_NEG2 = [table_ptr2],4
1218 //     ARGUMENT REDUCTION CODE - CASE 1 and 2
1219 //     Load 2**(-2).
1220 //     Load -2**(-2).
1221 (p0)   fmpy.s1 N = Arg,two_by_PI
1222         nop.i 999 ;;
1223 }
1224
1225 { .mfi
1226 (p0)   ldfs NEGTWO_TO_NEG2 = [table_ptr2],12
1227 //
1228 //     N = Arg * 2/pi
1229 //
1230 (p0)   fcmp.lt.unc.s1 p8,p9= Arg,PI_BY_4
1231         nop.i 999 ;;
1232 }
1233
1234 { .mfi
1235         nop.m 999
1236 //
1237 //     if Arg < pi/4,  set PR_8.
1238 //
1239 (p8)   fcmp.gt.s1 p8,p9= Arg,MPI_BY_4
1240         nop.i 999 ;;
1241 }
1242 //
1243 //     Case 1: Is |r| < 2**(-2).
1244 //     Arg is the same as r in this case.
1245 //     r = Arg
1246 //     c = 0
1247 //
1248
1249 { .mfi
1250 (p8)   mov N_fix_gr = r0
1251 //
1252 //     if Arg > -pi/4, reset PR_8.
1253 //     Select the case when |Arg| < pi/4 - set PR[8] = true.
1254 //     Else Select the case when |Arg| >= pi/4 - set PR[9] = true.
1255 //
1256 (p0)   fcvt.fx.s1 N_fix = N
1257         nop.i 999 ;;
1258 }
1259
1260 { .mfi
1261         nop.m 999
1262 //
1263 //     Grab the integer part of N .
1264 //
1265 (p8)   mov r = Arg
1266         nop.i 999
1267 }
1268
1269 { .mfi
1270         nop.m 999
1271 (p8)   mov c = f0
1272         nop.i 999 ;;
1273 }
1274
1275 { .mfi
1276         nop.m 999
1277 (p8)   fcmp.lt.unc.s1 p10, p11 = Arg, TWO_TO_NEG2
1278         nop.i 999 ;;
1279 }
1280
1281 { .mfi
1282         nop.m 999
1283 (p10)  fcmp.gt.s1 p10,p0 = Arg, NEGTWO_TO_NEG2
1284         nop.i 999 ;;
1285 }
1286
1287 { .mfi
1288         nop.m 999
1289 //
1290 //     Case 2: Place integer part of N in GP register.
1291 //
1292 (p9)   fcvt.xf N = N_fix
1293         nop.i 999 ;;
1294 }
1295
1296 { .mib
1297 (p9)   getf.sig N_fix_gr = N_fix
1298         nop.i 999
1299 //
1300 //     Case 2: Convert integer N_fix back to normalized floating-point value.
1301 //
1302 (p10)  br.cond.spnt TAN_SMALL_R ;;
1303 }
1304
1305 { .mib
1306         nop.m 999
1307         nop.i 999
1308 (p8)   br.cond.sptk TAN_NORMAL_R ;;
1309 }
1310 //
1311 //     Case 1: PR_3 is only affected  when PR_1 is set.
1312 //
1313
1314 { .mmi
1315 (p9)   ldfs TWO_TO_NEG33 = [table_ptr2], 4 ;;
1316 //
1317 //     Case 2: Load 2**(-33).
1318 //
1319 (p9)   ldfs NEGTWO_TO_NEG33 = [table_ptr2], 4
1320         nop.i 999 ;;
1321 }
1322
1323 { .mfi
1324         nop.m 999
1325 //
1326 //     Case 2: Load -2**(-33).
1327 //
1328 (p9)   fnma.s1 s_val = N, P_1, Arg
1329         nop.i 999
1330 }
1331
1332 { .mfi
1333         nop.m 999
1334 (p9)   fmpy.s1 w = N, P_2
1335         nop.i 999 ;;
1336 }
1337
1338 { .mfi
1339         nop.m 999
1340 //
1341 //     Case 2: w = N * P_2
1342 //     Case 2: s_val = -N * P_1  + Arg
1343 //
1344 (p0)   fcmp.lt.unc.s1 p9,p8 = s_val, TWO_TO_NEG33
1345         nop.i 999 ;;
1346 }
1347
1348 { .mfi
1349         nop.m 999
1350 //
1351 //     Decide between case_1 and case_2 reduce:
1352 //
1353 (p9)   fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
1354         nop.i 999 ;;
1355 }
1356
1357 { .mfi
1358         nop.m 999
1359 //
1360 //     Case 1_reduce:  s <= -2**(-33) or s >= 2**(-33)
1361 //     Case 2_reduce: -2**(-33) < s < 2**(-33)
1362 //
1363 (p8)   fsub.s1 r = s_val, w
1364         nop.i 999
1365 }
1366
1367 { .mfi
1368         nop.m 999
1369 (p9)   fmpy.s1 w = N, P_3
1370         nop.i 999 ;;
1371 }
1372
1373 { .mfi
1374         nop.m 999
1375 (p9)   fma.s1  U_1 = N, P_2, w
1376         nop.i 999
1377 }
1378
1379 { .mfi
1380         nop.m 999
1381 //
1382 //     Case 1_reduce: Is |r| < 2**(-2), if so set PR_10
1383 //     else set PR_11.
1384 //
1385 (p8)   fsub.s1 c = s_val, r
1386         nop.i 999 ;;
1387 }
1388
1389 { .mfi
1390         nop.m 999
1391 //
1392 //     Case 1_reduce: r = s + w (change sign)
1393 //     Case 2_reduce: w = N * P_3 (change sign)
1394 //
1395 (p8)   fcmp.lt.unc.s1 p10, p11 = r, TWO_TO_NEG2
1396         nop.i 999 ;;
1397 }
1398
1399 { .mfi
1400         nop.m 999
1401 (p10)  fcmp.gt.s1 p10, p11 = r, NEGTWO_TO_NEG2
1402         nop.i 999 ;;
1403 }
1404
1405 { .mfi
1406         nop.m 999
1407 (p9)   fsub.s1 r = s_val, U_1
1408         nop.i 999
1409 }
1410
1411 { .mfi
1412         nop.m 999
1413 //
1414 //     Case 1_reduce: c is complete here.
1415 //     c = c + w (w has not been negated.)
1416 //     Case 2_reduce: r is complete here - continue to calculate c .
1417 //     r = s - U_1
1418 //
1419 (p9)   fms.s1 U_2 = N, P_2, U_1
1420         nop.i 999 ;;
1421 }
1422
1423 { .mfi
1424         nop.m 999
1425 //
1426 //     Case 1_reduce: c = s - r
1427 //     Case 2_reduce: U_1 = N * P_2 + w
1428 //
1429 (p8)   fsub.s1 c = c, w
1430         nop.i 999 ;;
1431 }
1432
1433 { .mfi
1434         nop.m 999
1435 (p9)   fsub.s1 s_val = s_val, r
1436         nop.i 999
1437 }
1438
1439 { .mfb
1440         nop.m 999
1441 //
1442 //     Case 2_reduce:
1443 //     U_2 = N * P_2 - U_1
1444 //     Not needed until later.
1445 //
1446 (p9)   fadd.s1 U_2 = U_2, w
1447 //
1448 //     Case 2_reduce:
1449 //     s = s - r
1450 //     U_2 = U_2 + w
1451 //
1452 (p10)  br.cond.spnt TAN_SMALL_R ;;
1453 }
1454
1455 { .mib
1456         nop.m 999
1457         nop.i 999
1458 (p11)  br.cond.sptk TAN_NORMAL_R ;;
1459 }
1460
1461 { .mii
1462         nop.m 999
1463 //
1464 //     Case 2_reduce:
1465 //     c = c - U_2
1466 //     c is complete here
1467 //     Argument reduction ends here.
1468 //
1469 (p9)   extr.u i_1 = N_fix_gr, 0, 1 ;;
1470 (p9)   cmp.eq.unc p11, p12 = 0x0000,i_1 ;;
1471 }
1472
1473 { .mfi
1474         nop.m 999
1475 //
1476 //     Is i_1  even or odd?
1477 //     if i_1 == 0, set p11, else set p12.
1478 //
1479 (p11)  fmpy.s1 rsq = r, Z
1480         nop.i 999 ;;
1481 }
1482
1483 { .mfi
1484         nop.m 999
1485 (p12)  frcpa.s1 S_hi,p0 = f1, r
1486         nop.i 999
1487 }
1488
1489 //
1490 //     Case 1: Branch to SMALL_R or NORMAL_R.
1491 //     Case 1 is done now.
1492 //
1493
1494 { .mfi
1495 (p9)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
1496 (p9)   fsub.s1 c = s_val, U_1
1497         nop.i 999 ;;
1498 }
1499 ;;
1500
1501 { .mmi
1502 (p9)  ld8 table_ptr1 = [table_ptr1]
1503       nop.m 999
1504       nop.i 999
1505 }
1506 ;;
1507
1508 { .mmi
1509 (p9)   add table_ptr1 = 224, table_ptr1 ;;
1510 (p9)   ldfe P1_1 = [table_ptr1],144
1511         nop.i 999 ;;
1512 }
1513 //
1514 //     Get [i_1] -  lsb of N_fix_gr .
1515 //     Load P1_1 and point to Q1_1 .
1516 //
1517
1518 { .mfi
1519 (p9)   ldfe Q1_1 = [table_ptr1] , 0
1520 //
1521 //     N even: rsq = r * Z
1522 //     N odd:  S_hi = frcpa(r)
1523 //
1524 (p12)  fmerge.ns S_hi = S_hi, S_hi
1525         nop.i 999
1526 }
1527
1528 { .mfi
1529         nop.m 999
1530 //
1531 //     Case 2_reduce:
1532 //     c = s - U_1
1533 //
1534 (p9)   fsub.s1 c = c, U_2
1535         nop.i 999 ;;
1536 }
1537
1538 { .mfi
1539         nop.m 999
1540 (p12)  fma.s1  poly1 = S_hi, r, f1
1541         nop.i 999 ;;
1542 }
1543
1544 { .mfi
1545         nop.m 999
1546 //
1547 //     N odd:  Change sign of S_hi
1548 //
1549 (p11)  fmpy.s1 rsq = rsq, P1_1
1550         nop.i 999 ;;
1551 }
1552
1553 { .mfi
1554         nop.m 999
1555 (p12)  fma.s1 S_hi = S_hi, poly1, S_hi
1556         nop.i 999 ;;
1557 }
1558
1559 { .mfi
1560         nop.m 999
1561 //
1562 //     N even: rsq = rsq * P1_1
1563 //     N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
1564 //
1565 (p11)  fma.s1 Result = r, rsq, c
1566         nop.i 999 ;;
1567 }
1568
1569 { .mfi
1570         nop.m 999
1571 //
1572 //     N even: Result = c  + r * rsq
1573 //     N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
1574 //
1575 (p12)  fma.s1 poly1 = S_hi, r, f1
1576         nop.i 999 ;;
1577 }
1578
1579 { .mfi
1580         nop.m 999
1581 //
1582 //     N even: Result = Result + r
1583 //     N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
1584 //
1585 (p11)  fadd.s0 Result = r, Result
1586         nop.i 999 ;;
1587 }
1588
1589 { .mfi
1590         nop.m 999
1591 (p12)  fma.s1  S_hi = S_hi, poly1, S_hi
1592         nop.i 999 ;;
1593 }
1594
1595 { .mfi
1596         nop.m 999
1597 //
1598 //     N even: Result1 = Result + r
1599 //     N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
1600 //
1601 (p12)  fma.s1 poly1 = S_hi, r, f1
1602         nop.i 999 ;;
1603 }
1604
1605 { .mfi
1606         nop.m 999
1607 //
1608 //     N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
1609 //
1610 (p12)  fma.s1 S_hi = S_hi, poly1, S_hi
1611         nop.i 999 ;;
1612 }
1613
1614 { .mfi
1615         nop.m 999
1616 //
1617 //     N odd:  poly1  =  S_hi * poly + 1.0    64 bits
1618 //
1619 (p12)  fma.s1 poly1 = S_hi, r, f1
1620         nop.i 999 ;;
1621 }
1622
1623 { .mfi
1624         nop.m 999
1625 //
1626 //     N odd:  poly1  =  S_hi * r + 1.0
1627 //
1628 (p12)  fma.s1 poly1 = S_hi, c, poly1
1629         nop.i 999 ;;
1630 }
1631
1632 { .mfi
1633         nop.m 999
1634 //
1635 //     N odd:  poly1  =  S_hi * c + poly1
1636 //
1637 (p12)  fmpy.s1 S_lo = S_hi, poly1
1638         nop.i 999 ;;
1639 }
1640
1641 { .mfi
1642         nop.m 999
1643 //
1644 //     N odd:  S_lo  =  S_hi *  poly1
1645 //
1646 (p12)  fma.s1 S_lo = Q1_1, r, S_lo
1647         nop.i 999
1648 }
1649
1650 { .mfi
1651         nop.m 999
1652 //
1653 //     N odd:  Result =  S_hi + S_lo
1654 //
1655 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
1656         nop.i 999 ;;
1657 }
1658
1659 { .mfb
1660         nop.m 999
1661 //
1662 //     N odd:  S_lo  =  S_lo + Q1_1 * r
1663 //
1664 (p12)  fadd.s0 Result = S_hi, S_lo
1665 (p0)   br.ret.sptk b0 ;;
1666 }
1667
1668
1669 TAN_LARGER_ARG:
1670
1671 { .mmf
1672 (p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
1673       nop.m 999
1674 (p0)  fmpy.s1 N_0 = Arg, Inv_P_0
1675 }
1676 ;;
1677
1678 //
1679 // ARGUMENT REDUCTION CODE - CASE 3 and 4
1680 //
1681 //
1682 //    Adjust table_ptr1 to beginning of table.
1683 //    N_0 = Arg * Inv_P_0
1684 //
1685
1686
1687 { .mmi
1688 (p0)  ld8 table_ptr1 = [table_ptr1]
1689       nop.m 999
1690       nop.i 999
1691 }
1692 ;;
1693
1694
1695 { .mmi
1696 (p0)  add table_ptr1 = 8, table_ptr1 ;;
1697 //
1698 //    Point to  2*-14
1699 //
1700 (p0)  ldfs TWO_TO_NEG14 = [table_ptr1], 4
1701         nop.i 999 ;;
1702 }
1703 //
1704 //    Load 2**(-14).
1705 //
1706
1707 { .mmi
1708 (p0)  ldfs NEGTWO_TO_NEG14 = [table_ptr1], 180 ;;
1709 //
1710 //    N_0_fix  = integer part of N_0 .
1711 //    Adjust table_ptr1 to beginning of table.
1712 //
1713 (p0)  ldfs TWO_TO_NEG2 = [table_ptr1], 4
1714         nop.i 999 ;;
1715 }
1716 //
1717 //    Make N_0 the integer part.
1718 //
1719
1720 { .mfi
1721 (p0)  ldfs NEGTWO_TO_NEG2 = [table_ptr1]
1722 //
1723 //    Load -2**(-14).
1724 //
1725 (p0)  fcvt.fx.s1 N_0_fix = N_0
1726         nop.i 999 ;;
1727 }
1728
1729 { .mfi
1730         nop.m 999
1731 (p0)  fcvt.xf N_0 = N_0_fix
1732         nop.i 999 ;;
1733 }
1734
1735 { .mfi
1736         nop.m 999
1737 (p0)  fnma.s1 ArgPrime = N_0, P_0, Arg
1738         nop.i 999
1739 }
1740
1741 { .mfi
1742         nop.m 999
1743 (p0)  fmpy.s1 w = N_0, d_1
1744         nop.i 999 ;;
1745 }
1746
1747 { .mfi
1748         nop.m 999
1749 //
1750 //    ArgPrime = -N_0 * P_0 + Arg
1751 //    w  = N_0 * d_1
1752 //
1753 (p0)  fmpy.s1 N = ArgPrime, two_by_PI
1754         nop.i 999 ;;
1755 }
1756
1757 { .mfi
1758         nop.m 999
1759 //
1760 //    N = ArgPrime * 2/pi
1761 //
1762 (p0)  fcvt.fx.s1 N_fix = N
1763         nop.i 999 ;;
1764 }
1765
1766 { .mfi
1767         nop.m 999
1768 //
1769 //    N_fix is the integer part.
1770 //
1771 (p0)  fcvt.xf N = N_fix
1772         nop.i 999 ;;
1773 }
1774
1775 { .mfi
1776 (p0)  getf.sig N_fix_gr = N_fix
1777         nop.f 999
1778         nop.i 999 ;;
1779 }
1780
1781 { .mfi
1782         nop.m 999
1783 //
1784 //    N is the integer part of the reduced-reduced argument.
1785 //    Put the integer in a GP register.
1786 //
1787 (p0)  fnma.s1 s_val = N, P_1, ArgPrime
1788         nop.i 999
1789 }
1790
1791 { .mfi
1792         nop.m 999
1793 (p0)  fnma.s1 w = N, P_2, w
1794         nop.i 999 ;;
1795 }
1796
1797 { .mfi
1798         nop.m 999
1799 //
1800 //    s_val = -N*P_1 + ArgPrime
1801 //    w = -N*P_2 + w
1802 //
1803 (p0)  fcmp.lt.unc.s1 p11, p10 = s_val, TWO_TO_NEG14
1804         nop.i 999 ;;
1805 }
1806
1807 { .mfi
1808         nop.m 999
1809 (p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
1810         nop.i 999 ;;
1811 }
1812
1813 { .mfi
1814         nop.m 999
1815 //
1816 //    Case 3: r = s_val + w (Z complete)
1817 //    Case 4: U_hi = N_0 * d_1
1818 //
1819 (p10) fmpy.s1 V_hi = N, P_2
1820         nop.i 999
1821 }
1822
1823 { .mfi
1824         nop.m 999
1825 (p11) fmpy.s1 U_hi = N_0, d_1
1826         nop.i 999 ;;
1827 }
1828
1829 { .mfi
1830         nop.m 999
1831 //
1832 //    Case 3: r = s_val + w (Z complete)
1833 //    Case 4: U_hi = N_0 * d_1
1834 //
1835 (p11) fmpy.s1 V_hi = N, P_2
1836         nop.i 999
1837 }
1838
1839 { .mfi
1840         nop.m 999
1841 (p11) fmpy.s1 U_hi = N_0, d_1
1842         nop.i 999 ;;
1843 }
1844
1845 { .mfi
1846         nop.m 999
1847 //
1848 //    Decide between case 3 and 4:
1849 //    Case 3:  s <= -2**(-14) or s >= 2**(-14)
1850 //    Case 4: -2**(-14) < s < 2**(-14)
1851 //
1852 (p10) fadd.s1 r = s_val, w
1853         nop.i 999
1854 }
1855
1856 { .mfi
1857         nop.m 999
1858 (p11) fmpy.s1 w = N, P_3
1859         nop.i 999 ;;
1860 }
1861
1862 { .mfi
1863         nop.m 999
1864 //
1865 //    Case 4: We need abs of both U_hi and V_hi - dont
1866 //    worry about switched sign of V_hi .
1867 //
1868 (p11) fsub.s1 A = U_hi, V_hi
1869         nop.i 999
1870 }
1871
1872 { .mfi
1873         nop.m 999
1874 //
1875 //    Case 4: A =  U_hi + V_hi
1876 //    Note: Worry about switched sign of V_hi, so subtract instead of add.
1877 //
1878 (p11) fnma.s1 V_lo = N, P_2, V_hi
1879         nop.i 999 ;;
1880 }
1881
1882 { .mfi
1883         nop.m 999
1884 (p11) fms.s1 U_lo = N_0, d_1, U_hi
1885         nop.i 999 ;;
1886 }
1887
1888 { .mfi
1889         nop.m 999
1890 (p11) fabs V_hiabs = V_hi
1891         nop.i 999
1892 }
1893
1894 { .mfi
1895         nop.m 999
1896 //
1897 //    Case 4: V_hi = N * P_2
1898 //            w = N * P_3
1899 //    Note the product does not include the (-) as in the writeup
1900 //    so (-) missing for V_hi and w .
1901 (p10) fadd.s1 r = s_val, w
1902         nop.i 999 ;;
1903 }
1904
1905 { .mfi
1906         nop.m 999
1907 //
1908 //    Case 3: c = s_val - r
1909 //    Case 4: U_lo = N_0 * d_1 - U_hi
1910 //
1911 (p11) fabs U_hiabs = U_hi
1912         nop.i 999
1913 }
1914
1915 { .mfi
1916         nop.m 999
1917 (p11) fmpy.s1 w = N, P_3
1918         nop.i 999 ;;
1919 }
1920
1921 { .mfi
1922         nop.m 999
1923 //
1924 //    Case 4: Set P_12 if U_hiabs >= V_hiabs
1925 //
1926 (p11) fadd.s1 C_hi = s_val, A
1927         nop.i 999 ;;
1928 }
1929
1930 { .mfi
1931         nop.m 999
1932 //
1933 //    Case 4: C_hi = s_val + A
1934 //
1935 (p11) fadd.s1 t = U_lo, V_lo
1936         nop.i 999 ;;
1937 }
1938
1939 { .mfi
1940         nop.m 999
1941 //
1942 //    Case 3: Is |r| < 2**(-2), if so set PR_7
1943 //    else set PR_8.
1944 //    Case 3: If PR_7 is set, prepare to branch to Small_R.
1945 //    Case 3: If PR_8 is set, prepare to branch to Normal_R.
1946 //
1947 (p10) fsub.s1 c = s_val, r
1948         nop.i 999 ;;
1949 }
1950
1951 { .mfi
1952         nop.m 999
1953 //
1954 //    Case 3: c = (s - r) + w (c complete)
1955 //
1956 (p11) fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs
1957         nop.i 999
1958 }
1959
1960 { .mfi
1961         nop.m 999
1962 (p11) fms.s1 w = N_0, d_2, w
1963         nop.i 999 ;;
1964 }
1965
1966 { .mfi
1967         nop.m 999
1968 //
1969 //    Case 4: V_hi = N * P_2
1970 //            w = N * P_3
1971 //    Note the product does not include the (-) as in the writeup
1972 //    so (-) missing for V_hi and w .
1973 //
1974 (p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2
1975         nop.i 999 ;;
1976 }
1977
1978 { .mfi
1979         nop.m 999
1980 (p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2
1981         nop.i 999 ;;
1982 }
1983
1984 { .mfb
1985         nop.m 999
1986 //
1987 //    Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
1988 //    Note: the (-) is still missing for V_hi .
1989 //    Case 4: w = w + N_0 * d_2
1990 //    Note: the (-) is now incorporated in w .
1991 //
1992 (p10) fadd.s1 c = c, w
1993 //
1994 //    Case 4: t = U_lo + V_lo
1995 //    Note: remember V_lo should be (-), subtract instead of add. NO
1996 //
1997 (p14) br.cond.spnt TAN_SMALL_R ;;
1998 }
1999
2000 { .mib
2001         nop.m 999
2002         nop.i 999
2003 (p15) br.cond.spnt TAN_NORMAL_R ;;
2004 }
2005
2006 { .mfi
2007         nop.m 999
2008 //
2009 //    Case 3: Vector off when |r| < 2**(-2).  Recall that PR_3 will be true.
2010 //    The remaining stuff is for Case 4.
2011 //
2012 (p12) fsub.s1 a = U_hi, A
2013 (p11) extr.u i_1 = N_fix_gr, 0, 1 ;;
2014 }
2015
2016 { .mfi
2017         nop.m 999
2018 //
2019 //    Case 4: C_lo = s_val - C_hi
2020 //
2021 (p11) fadd.s1 t = t, w
2022         nop.i 999
2023 }
2024
2025 { .mfi
2026         nop.m 999
2027 (p13) fadd.s1 a = V_hi, A
2028         nop.i 999 ;;
2029 }
2030
2031 //
2032 //    Case 4: a = U_hi - A
2033 //            a = V_hi - A (do an add to account for missing (-) on V_hi
2034 //
2035
2036 { .mfi
2037 (p11)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
2038 (p11) fsub.s1 C_lo = s_val, C_hi
2039         nop.i 999
2040 }
2041 ;;
2042
2043 { .mmi
2044 (p11) ld8 table_ptr1 = [table_ptr1]
2045       nop.m 999
2046       nop.i 999
2047 }
2048 ;;
2049
2050 //
2051 //    Case 4: a = (U_hi - A)  + V_hi
2052 //            a = (V_hi - A)  + U_hi
2053 //    In each case account for negative missing form V_hi .
2054 //
2055 //
2056 //    Case 4: C_lo = (s_val - C_hi) + A
2057 //
2058
2059 { .mmi
2060 (p11) add table_ptr1 = 224, table_ptr1 ;;
2061 (p11) ldfe P1_1 = [table_ptr1], 16
2062         nop.i 999 ;;
2063 }
2064
2065 { .mfi
2066 (p11) ldfe P1_2 = [table_ptr1], 128
2067 //
2068 //    Case 4: w = U_lo + V_lo  + w
2069 //
2070 (p12) fsub.s1 a = a, V_hi
2071         nop.i 999 ;;
2072 }
2073 //
2074 //    Case 4: r = C_hi + C_lo
2075 //
2076
2077 { .mfi
2078 (p11) ldfe Q1_1 = [table_ptr1], 16
2079 (p11) fadd.s1 C_lo = C_lo, A
2080         nop.i 999 ;;
2081 }
2082 //
2083 //    Case 4: c = C_hi - r
2084 //    Get [i_1] - lsb of N_fix_gr.
2085 //
2086
2087 { .mfi
2088 (p11) ldfe Q1_2 = [table_ptr1], 16
2089         nop.f 999
2090         nop.i 999 ;;
2091 }
2092
2093 { .mfi
2094         nop.m 999
2095 (p13) fsub.s1 a = U_hi, a
2096         nop.i 999 ;;
2097 }
2098
2099 { .mfi
2100         nop.m 999
2101 (p11) fadd.s1 t = t, a
2102         nop.i 999 ;;
2103 }
2104
2105 { .mfi
2106         nop.m 999
2107 //
2108 //    Case 4: t = t + a
2109 //
2110 (p11) fadd.s1 C_lo = C_lo, t
2111         nop.i 999 ;;
2112 }
2113
2114 { .mfi
2115         nop.m 999
2116 //
2117 //    Case 4: C_lo = C_lo + t
2118 //
2119 (p11) fadd.s1 r = C_hi, C_lo
2120         nop.i 999 ;;
2121 }
2122
2123 { .mfi
2124         nop.m 999
2125 (p11) fsub.s1 c = C_hi, r
2126         nop.i 999
2127 }
2128
2129 { .mfi
2130         nop.m 999
2131 //
2132 //    Case 4: c = c + C_lo  finished.
2133 //    Is i_1  even or odd?
2134 //    if i_1 == 0, set PR_4, else set PR_5.
2135 //
2136 // r and c have been computed.
2137 // We known whether this is the sine or cosine routine.
2138 // Make sure ftz mode is set - should be automatic when using wre
2139 (p0)  fmpy.s1 rsq = r, r
2140         nop.i 999 ;;
2141 }
2142
2143 { .mfi
2144         nop.m 999
2145 (p11) fadd.s1 c = c , C_lo
2146 (p11) cmp.eq.unc p11, p12 =  0x0000, i_1 ;;
2147 }
2148
2149 { .mfi
2150         nop.m 999
2151 (p12) frcpa.s1 S_hi, p0 = f1, r
2152         nop.i 999
2153 }
2154
2155 { .mfi
2156         nop.m 999
2157 //
2158 //    N odd: Change sign of S_hi
2159 //
2160 (p11) fma.s1 Result = rsq, P1_2, P1_1
2161         nop.i 999 ;;
2162 }
2163
2164 { .mfi
2165         nop.m 999
2166 (p12) fma.s1 P = rsq, Q1_2, Q1_1
2167         nop.i 999
2168 }
2169
2170 { .mfi
2171         nop.m 999
2172 //
2173 //    N odd:  Result  =  S_hi + S_lo      (User supplied rounding mode for C1)
2174 //
2175 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2176         nop.i 999 ;;
2177 }
2178
2179 { .mfi
2180         nop.m 999
2181 //
2182 //    N even: rsq = r * r
2183 //    N odd:  S_hi = frcpa(r)
2184 //
2185 (p12) fmerge.ns S_hi = S_hi, S_hi
2186         nop.i 999
2187 }
2188
2189 { .mfi
2190         nop.m 999
2191 //
2192 //    N even: rsq = rsq * P1_2 + P1_1
2193 //    N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
2194 //
2195 (p11) fmpy.s1 Result = rsq, Result
2196         nop.i 999 ;;
2197 }
2198
2199 { .mfi
2200         nop.m 999
2201 (p12) fma.s1 poly1 = S_hi, r,f1
2202         nop.i 999
2203 }
2204
2205 { .mfi
2206         nop.m 999
2207 //
2208 //    N even: Result =  Result * rsq
2209 //    N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
2210 //
2211 (p11) fma.s1 Result = r, Result, c
2212         nop.i 999 ;;
2213 }
2214
2215 { .mfi
2216         nop.m 999
2217 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2218         nop.i 999
2219 }
2220
2221 { .mfi
2222         nop.m 999
2223 //
2224 //    N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
2225 //
2226 (p11) fadd.s0 Result= r, Result
2227         nop.i 999 ;;
2228 }
2229
2230 { .mfi
2231         nop.m 999
2232 (p12) fma.s1 poly1 =  S_hi, r, f1
2233         nop.i 999 ;;
2234 }
2235
2236 { .mfi
2237         nop.m 999
2238 //
2239 //    N even: Result = Result * r + c
2240 //    N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
2241 //
2242 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2243         nop.i 999 ;;
2244 }
2245
2246 { .mfi
2247         nop.m 999
2248 (p12) fma.s1 poly1 = S_hi, r, f1
2249         nop.i 999 ;;
2250 }
2251
2252 { .mfi
2253         nop.m 999
2254 //
2255 //    N even: Result1 = Result + r  (Rounding mode S0)
2256 //    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
2257 //
2258 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2259         nop.i 999 ;;
2260 }
2261
2262 { .mfi
2263         nop.m 999
2264 //
2265 //    N odd:  poly1  =  S_hi * poly + S_hi    64 bits
2266 //
2267 (p12) fma.s1 poly1 = S_hi, r, f1
2268         nop.i 999 ;;
2269 }
2270
2271 { .mfi
2272         nop.m 999
2273 //
2274 //    N odd:  poly1  =  S_hi * r + 1.0
2275 //
2276 (p12) fma.s1 poly1 = S_hi, c, poly1
2277         nop.i 999 ;;
2278 }
2279
2280 { .mfi
2281         nop.m 999
2282 //
2283 //    N odd:  poly1  =  S_hi * c + poly1
2284 //
2285 (p12) fmpy.s1 S_lo = S_hi, poly1
2286         nop.i 999 ;;
2287 }
2288
2289 { .mfi
2290         nop.m 999
2291 //
2292 //    N odd:  S_lo  =  S_hi *  poly1
2293 //
2294 (p12) fma.s1 S_lo = P, r, S_lo
2295         nop.i 999 ;;
2296 }
2297
2298 { .mfb
2299         nop.m 999
2300 //
2301 //    N odd:  S_lo  =  S_lo + r * P
2302 //
2303 (p12) fadd.s0 Result = S_hi, S_lo
2304 (p0)   br.ret.sptk b0 ;;
2305 }
2306
2307
2308 TAN_SMALL_R:
2309
2310 { .mii
2311         nop.m 999
2312 (p0)  extr.u i_1 = N_fix_gr, 0, 1 ;;
2313 (p0)  cmp.eq.unc p11, p12 = 0x0000, i_1
2314 }
2315
2316 { .mfi
2317         nop.m 999
2318 (p0)  fmpy.s1 rsq = r, r
2319         nop.i 999 ;;
2320 }
2321
2322 { .mfi
2323         nop.m 999
2324 (p12) frcpa.s1 S_hi, p0 = f1, r
2325         nop.i 999
2326 }
2327
2328 { .mfi
2329 (p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
2330         nop.f 999
2331         nop.i 999
2332 }
2333 ;;
2334
2335 { .mmi
2336 (p0)  ld8 table_ptr1 = [table_ptr1]
2337       nop.m 999
2338       nop.i 999
2339 }
2340 ;;
2341
2342 // *****************************************************************
2343 // *****************************************************************
2344 // *****************************************************************
2345
2346 { .mmi
2347 (p0)  add table_ptr1 = 224, table_ptr1 ;;
2348 (p0)  ldfe P1_1 = [table_ptr1], 16
2349         nop.i 999 ;;
2350 }
2351 //    r and c have been computed.
2352 //    We known whether this is the sine or cosine routine.
2353 //    Make sure ftz mode is set - should be automatic when using wre
2354 //    |r| < 2**(-2)
2355
2356 { .mfi
2357 (p0)  ldfe P1_2 = [table_ptr1], 16
2358 (p11) fmpy.s1 r_to_the_8 = rsq, rsq
2359         nop.i 999 ;;
2360 }
2361 //
2362 //    Set table_ptr1 to beginning of constant table.
2363 //    Get [i_1] - lsb of N_fix_gr.
2364 //
2365
2366 { .mfi
2367 (p0)  ldfe P1_3 = [table_ptr1], 96
2368 //
2369 //    N even: rsq = r * r
2370 //    N odd:  S_hi = frcpa(r)
2371 //
2372 (p12) fmerge.ns S_hi = S_hi, S_hi
2373         nop.i 999 ;;
2374 }
2375 //
2376 //    Is i_1  even or odd?
2377 //    if i_1 == 0, set PR_11.
2378 //    if i_1 != 0, set PR_12.
2379 //
2380
2381 { .mfi
2382 (p11) ldfe P1_9 = [table_ptr1], -16
2383 //
2384 //    N even: Poly2 = P1_7 + Poly2 * rsq
2385 //    N odd:  poly2 = Q1_5 + poly2 * rsq
2386 //
2387 (p11) fadd.s1 CORR = rsq, f1
2388         nop.i 999 ;;
2389 }
2390
2391 { .mmi
2392 (p11) ldfe P1_8 = [table_ptr1], -16 ;;
2393 //
2394 //    N even: Poly1 = P1_2 + P1_3 * rsq
2395 //    N odd:  poly1 =  1.0 +  S_hi * r
2396 //    16 bits partial  account for necessary (-1)
2397 //
2398 (p11) ldfe P1_7 = [table_ptr1], -16
2399         nop.i 999 ;;
2400 }
2401 //
2402 //    N even: Poly1 = P1_1 + Poly1 * rsq
2403 //    N odd:  S_hi  =  S_hi + S_hi * poly1)     16 bits account for necessary
2404 //
2405
2406 { .mfi
2407 (p11) ldfe P1_6 = [table_ptr1], -16
2408 //
2409 //    N even: Poly2 = P1_5 + Poly2 * rsq
2410 //    N odd:  poly2 = Q1_3 + poly2 * rsq
2411 //
2412 (p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8
2413         nop.i 999 ;;
2414 }
2415 //
2416 //    N even: Poly1 =  Poly1 * rsq
2417 //    N odd:  poly1  = 1.0 + S_hi * r         32 bits partial
2418 //
2419
2420 { .mfi
2421 (p11) ldfe P1_5 = [table_ptr1], -16
2422 (p12) fma.s1 poly1 =  S_hi, r, f1
2423         nop.i 999 ;;
2424 }
2425 //
2426 //    N even: CORR =  CORR * c
2427 //    N odd:  S_hi  =  S_hi * poly1 + S_hi    32 bits
2428 //
2429
2430 //
2431 //    N even: Poly2 = P1_6 + Poly2 * rsq
2432 //    N odd:  poly2 = Q1_4 + poly2 * rsq
2433 //
2434 { .mmf
2435 (p0)  addl           table_ptr2   = @ltoff(TAN_BASE_CONSTANTS), gp
2436 (p11) ldfe P1_4 = [table_ptr1], -16
2437 (p11) fmpy.s1 CORR =  CORR, c
2438 }
2439 ;;
2440
2441
2442 { .mmi
2443 (p0)  ld8 table_ptr2 = [table_ptr2]
2444       nop.m 999
2445       nop.i 999
2446 }
2447 ;;
2448
2449
2450 { .mii
2451 (p0)  add table_ptr2 = 464, table_ptr2
2452         nop.i 999 ;;
2453         nop.i 999
2454 }
2455
2456 { .mfi
2457         nop.m 999
2458 (p11) fma.s1 Poly1 = P1_3, rsq, P1_2
2459         nop.i 999 ;;
2460 }
2461
2462 { .mfi
2463 (p0)  ldfe Q1_7 = [table_ptr2], -16
2464 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2465         nop.i 999 ;;
2466 }
2467
2468 { .mfi
2469 (p0)  ldfe Q1_6 = [table_ptr2], -16
2470 (p11) fma.s1 Poly2 = P1_9, rsq, P1_8
2471         nop.i 999 ;;
2472 }
2473
2474 { .mmi
2475 (p0)  ldfe Q1_5 = [table_ptr2], -16 ;;
2476 (p12) ldfe Q1_4 = [table_ptr2], -16
2477         nop.i 999 ;;
2478 }
2479
2480 { .mfi
2481 (p12) ldfe Q1_3 = [table_ptr2], -16
2482 //
2483 //    N even: Poly2 = P1_8 + P1_9 * rsq
2484 //    N odd:  poly2 = Q1_6 + Q1_7 * rsq
2485 //
2486 (p11) fma.s1 Poly1 = Poly1, rsq, P1_1
2487         nop.i 999 ;;
2488 }
2489
2490 { .mfi
2491 (p12) ldfe Q1_2 = [table_ptr2], -16
2492 (p12) fma.s1 poly1 = S_hi, r, f1
2493         nop.i 999 ;;
2494 }
2495
2496 { .mfi
2497 (p12) ldfe Q1_1 = [table_ptr2], -16
2498 (p11) fma.s1 Poly2 = Poly2, rsq, P1_7
2499         nop.i 999 ;;
2500 }
2501
2502 { .mfi
2503         nop.m 999
2504 //
2505 //    N even: CORR =  rsq + 1
2506 //    N even: r_to_the_8 =  rsq * rsq
2507 //
2508 (p11) fmpy.s1 Poly1 = Poly1, rsq
2509         nop.i 999 ;;
2510 }
2511
2512 { .mfi
2513         nop.m 999
2514 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2515         nop.i 999
2516 }
2517
2518 { .mfi
2519         nop.m 999
2520 (p12) fma.s1 poly2 = Q1_7, rsq, Q1_6
2521         nop.i 999 ;;
2522 }
2523
2524 { .mfi
2525         nop.m 999
2526 (p11) fma.s1 Poly2 = Poly2, rsq, P1_6
2527         nop.i 999 ;;
2528 }
2529
2530 { .mfi
2531         nop.m 999
2532 (p12) fma.s1 poly1 = S_hi, r, f1
2533         nop.i 999
2534 }
2535
2536 { .mfi
2537         nop.m 999
2538 (p12) fma.s1 poly2 = poly2, rsq, Q1_5
2539         nop.i 999 ;;
2540 }
2541
2542 { .mfi
2543         nop.m 999
2544 (p11) fma.s1 Poly2= Poly2, rsq, P1_5
2545         nop.i 999 ;;
2546 }
2547
2548 { .mfi
2549         nop.m 999
2550 (p12) fma.s1 S_hi =  S_hi, poly1, S_hi
2551         nop.i 999
2552 }
2553
2554 { .mfi
2555         nop.m 999
2556 (p12) fma.s1 poly2 = poly2, rsq, Q1_4
2557         nop.i 999 ;;
2558 }
2559
2560 { .mfi
2561         nop.m 999
2562 //
2563 //    N even: r_to_the_8 = r_to_the_8 * r_to_the_8
2564 //    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
2565 //
2566 (p11) fma.s1 Poly2 = Poly2, rsq, P1_4
2567         nop.i 999 ;;
2568 }
2569
2570 { .mfi
2571         nop.m 999
2572 //
2573 //    N even: Result = CORR + Poly * r
2574 //    N odd:  P = Q1_1 + poly2 * rsq
2575 //
2576 (p12) fma.s1 poly1 = S_hi, r, f1
2577         nop.i 999
2578 }
2579
2580 { .mfi
2581         nop.m 999
2582 (p12) fma.s1 poly2 = poly2, rsq, Q1_3
2583         nop.i 999 ;;
2584 }
2585
2586 { .mfi
2587         nop.m 999
2588 //
2589 //    N even: Poly2 = P1_4 + Poly2 * rsq
2590 //    N odd:  poly2 = Q1_2 + poly2 * rsq
2591 //
2592 (p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1
2593         nop.i 999 ;;
2594 }
2595
2596 { .mfi
2597         nop.m 999
2598 (p12) fma.s1 poly1 = S_hi, c, poly1
2599         nop.i 999
2600 }
2601
2602 { .mfi
2603         nop.m 999
2604 (p12) fma.s1 poly2 = poly2, rsq, Q1_2
2605         nop.i 999 ;;
2606 }
2607
2608 { .mfi
2609         nop.m 999
2610 //
2611 //    N even: Poly = Poly1 + Poly2 * r_to_the_8
2612 //    N odd:  S_hi =  S_hi * poly1 + S_hi    64 bits
2613 //
2614 (p11) fma.s1 Result = Poly, r, CORR
2615         nop.i 999 ;;
2616 }
2617
2618 { .mfi
2619         nop.m 999
2620 //
2621 //    N even: Result =  r + Result  (User supplied rounding mode)
2622 //    N odd:  poly1  =  S_hi * c + poly1
2623 //
2624 (p12) fmpy.s1 S_lo = S_hi, poly1
2625         nop.i 999
2626 }
2627
2628 { .mfi
2629         nop.m 999
2630 (p12) fma.s1 P = poly2, rsq, Q1_1
2631         nop.i 999 ;;
2632 }
2633
2634 { .mfi
2635         nop.m 999
2636 //
2637 //    N odd:  poly1  =  S_hi * r + 1.0
2638 //
2639 (p11) fadd.s0 Result = Result, r
2640         nop.i 999 ;;
2641 }
2642
2643 { .mfi
2644         nop.m 999
2645 //
2646 //    N odd:  S_lo  =  S_hi *  poly1
2647 //
2648 (p12) fma.s1 S_lo = Q1_1, c, S_lo
2649         nop.i 999
2650 }
2651
2652 { .mfi
2653         nop.m 999
2654 //
2655 //    N odd:  Result = Result + S_hi  (user supplied rounding mode)
2656 //
2657 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2658         nop.i 999 ;;
2659 }
2660
2661 { .mfi
2662         nop.m 999
2663 //
2664 //    N odd:  S_lo  =  Q1_1 * c + S_lo
2665 //
2666 (p12) fma.s1 Result = P, r, S_lo
2667         nop.i 999 ;;
2668 }
2669
2670 { .mfb
2671         nop.m 999
2672 //
2673 //    N odd:  Result =  S_lo + r * P
2674 //
2675 (p12) fadd.s0 Result = Result, S_hi
2676 (p0)   br.ret.sptk b0 ;;
2677 }
2678
2679
2680 TAN_NORMAL_R:
2681
2682 { .mfi
2683 (p0)  getf.sig sig_r = r
2684 // *******************************************************************
2685 // *******************************************************************
2686 // *******************************************************************
2687 //
2688 //    r and c have been computed.
2689 //    Make sure ftz mode is set - should be automatic when using wre
2690 //
2691 //
2692 //    Get [i_1] -  lsb of N_fix_gr alone.
2693 //
2694 (p0)  fmerge.s  Pos_r = f1, r
2695 (p0)  extr.u i_1 = N_fix_gr, 0, 1 ;;
2696 }
2697
2698 { .mfi
2699         nop.m 999
2700 (p0)  fmerge.s  sgn_r =  r, f1
2701 (p0)  cmp.eq.unc p11, p12 = 0x0000, i_1 ;;
2702 }
2703
2704 { .mfi
2705         nop.m 999
2706         nop.f 999
2707 (p0)  extr.u lookup = sig_r, 58, 5
2708 }
2709
2710 { .mlx
2711         nop.m 999
2712 (p0)  movl Create_B = 0x8200000000000000 ;;
2713 }
2714
2715 { .mfi
2716 (p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
2717         nop.f 999
2718 (p0)  dep Create_B = lookup, Create_B, 58, 5
2719 }
2720 ;;
2721
2722 //
2723 //    Get [i_1] -  lsb of N_fix_gr alone.
2724 //    Pos_r = abs (r)
2725 //
2726
2727
2728 { .mmi
2729       ld8 table_ptr1 = [table_ptr1]
2730       nop.m 999
2731       nop.i 999
2732 }
2733 ;;
2734
2735
2736 { .mmi
2737         nop.m 999
2738 (p0)  setf.sig B = Create_B
2739 //
2740 //    Set table_ptr1 and table_ptr2 to base address of
2741 //    constant table.
2742 //
2743 (p0)  add table_ptr1 = 480, table_ptr1 ;;
2744 }
2745
2746 { .mmb
2747         nop.m 999
2748 //
2749 //    Is i_1 or i_0  == 0 ?
2750 //    Create the constant  1 00000 1000000000000000000000...
2751 //
2752 (p0)  ldfe P2_1 = [table_ptr1], 16
2753         nop.b 999
2754 }
2755
2756 { .mmi
2757         nop.m 999 ;;
2758 (p0)  getf.exp exp_r = Pos_r
2759         nop.i 999
2760 }
2761 //
2762 //    Get r's exponent
2763 //    Get r's significand
2764 //
2765
2766 { .mmi
2767 (p0)  ldfe P2_2 = [table_ptr1], 16 ;;
2768 //
2769 //    Get the 5 bits or r for the lookup.   1.xxxxx ....
2770 //    from sig_r.
2771 //    Grab  lsb of exp of B
2772 //
2773 (p0)  ldfe P2_3 = [table_ptr1], 16
2774         nop.i 999 ;;
2775 }
2776
2777 { .mii
2778         nop.m 999
2779 (p0)  andcm table_offset = 0x0001, exp_r ;;
2780 (p0)  shl table_offset = table_offset, 9 ;;
2781 }
2782
2783 { .mii
2784         nop.m 999
2785 //
2786 //    Deposit   0 00000 1000000000000000000000... on
2787 //              1 xxxxx yyyyyyyyyyyyyyyyyyyyyy...,
2788 //    getting rid of the ys.
2789 //    Is  B = 2** -2 or  B= 2** -1? If 2**-1, then
2790 //    we want an offset of 512 for table addressing.
2791 //
2792 (p0)  shladd table_offset = lookup, 4, table_offset ;;
2793 //
2794 //    B =  ........ 1xxxxx 1000000000000000000...
2795 //
2796 (p0)  add table_ptr1 = table_ptr1, table_offset ;;
2797 }
2798
2799 { .mmb
2800         nop.m 999
2801 //
2802 //   B =  ........ 1xxxxx 1000000000000000000...
2803 //   Convert B so it has the same exponent as Pos_r
2804 //
2805 (p0)  ldfd T_hi = [table_ptr1], 8
2806         nop.b 999 ;;
2807 }
2808
2809 //
2810 //    x = |r| - B
2811 //    Load T_hi.
2812 //    Load C_hi.
2813 //
2814
2815 { .mmf
2816 (p0)  addl           table_ptr2   = @ltoff(TAN_BASE_CONSTANTS), gp
2817 (p0)  ldfs T_lo = [table_ptr1]
2818 (p0)  fmerge.se B = Pos_r, B
2819 }
2820 ;;
2821
2822 { .mmi
2823       ld8 table_ptr2 = [table_ptr2]
2824       nop.m 999
2825       nop.i 999
2826 }
2827 ;;
2828
2829 { .mii
2830 (p0)  add table_ptr2 = 1360, table_ptr2
2831         nop.i 999 ;;
2832 (p0)  add table_ptr2 = table_ptr2, table_offset ;;
2833 }
2834
2835 { .mfi
2836 (p0)  ldfd C_hi = [table_ptr2], 8
2837 (p0)  fsub.s1 x = Pos_r, B
2838         nop.i 999 ;;
2839 }
2840
2841 { .mii
2842 (p0)  ldfs C_lo = [table_ptr2],255
2843         nop.i 999 ;;
2844 //
2845 //    xsq = x * x
2846 //    N even: Tx = T_hi * x
2847 //    Load T_lo.
2848 //    Load C_lo - increment pointer to get SC_inv
2849 //    - cant get all the way, do an add later.
2850 //
2851 (p0)  add table_ptr2 = 569, table_ptr2 ;;
2852 }
2853 //
2854 //    N even: Tx1 = Tx + 1
2855 //    N odd:  Cx1 = 1 - Cx
2856 //
2857
2858 { .mfi
2859 (p0)  ldfe SC_inv = [table_ptr2], 0
2860         nop.f 999
2861         nop.i 999 ;;
2862 }
2863
2864 { .mfi
2865         nop.m 999
2866 (p0)  fmpy.s1 xsq = x, x
2867         nop.i 999
2868 }
2869
2870 { .mfi
2871         nop.m 999
2872 (p11) fmpy.s1 Tx = T_hi, x
2873         nop.i 999 ;;
2874 }
2875
2876 { .mfi
2877         nop.m 999
2878 (p12) fmpy.s1 Cx = C_hi, x
2879         nop.i 999 ;;
2880 }
2881
2882 { .mfi
2883         nop.m 999
2884 //
2885 //    N odd: Cx = C_hi * x
2886 //
2887 (p0)  fma.s1 P = P2_3, xsq, P2_2
2888         nop.i 999
2889 }
2890
2891 { .mfi
2892         nop.m 999
2893 //
2894 //    N even and odd: P = P2_3 + P2_2 * xsq
2895 //
2896 (p11) fadd.s1 Tx1 = Tx, f1
2897         nop.i 999 ;;
2898 }
2899
2900 { .mfi
2901         nop.m 999
2902 //
2903 //    N even: D = C_hi - tanx
2904 //    N odd: D = T_hi + tanx
2905 //
2906 (p11) fmpy.s1 CORR = SC_inv, T_hi
2907         nop.i 999
2908 }
2909
2910 { .mfi
2911         nop.m 999
2912 (p0)  fmpy.s1 Sx = SC_inv, x
2913         nop.i 999 ;;
2914 }
2915
2916 { .mfi
2917         nop.m 999
2918 (p12) fmpy.s1 CORR = SC_inv, C_hi
2919         nop.i 999 ;;
2920 }
2921
2922 { .mfi
2923         nop.m 999
2924 (p12) fsub.s1 V_hi = f1, Cx
2925         nop.i 999 ;;
2926 }
2927
2928 { .mfi
2929         nop.m 999
2930 (p0)  fma.s1 P = P, xsq, P2_1
2931         nop.i 999
2932 }
2933
2934 { .mfi
2935         nop.m 999
2936 //
2937 //    N even and odd: P = P2_1 + P * xsq
2938 //
2939 (p11) fma.s1 V_hi = Tx, Tx1, f1
2940         nop.i 999 ;;
2941 }
2942
2943 { .mfi
2944         nop.m 999
2945 //
2946 //    N even: Result  = sgn_r * tail + T_hi (user rounding mode for C1)
2947 //    N odd:  Result  = sgn_r * tail + C_hi (user rounding mode for C1)
2948 //
2949 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2950         nop.i 999 ;;
2951 }
2952
2953 { .mfi
2954         nop.m 999
2955 (p0)  fmpy.s1 CORR = CORR, c
2956         nop.i 999 ;;
2957 }
2958
2959 { .mfi
2960         nop.m 999
2961 (p12) fnma.s1 V_hi = Cx,V_hi,f1
2962         nop.i 999 ;;
2963 }
2964
2965 { .mfi
2966         nop.m 999
2967 //
2968 //    N even: V_hi = Tx * Tx1 + 1
2969 //    N odd: Cx1 = 1 - Cx * Cx1
2970 //
2971 (p0)  fmpy.s1 P = P, xsq
2972         nop.i 999
2973 }
2974
2975 { .mfi
2976         nop.m 999
2977 //
2978 //    N even and odd: P = P * xsq
2979 //
2980 (p11) fmpy.s1 V_hi = V_hi, T_hi
2981         nop.i 999 ;;
2982 }
2983
2984 { .mfi
2985         nop.m 999
2986 //
2987 //    N even and odd: tail = P * tail + V_lo
2988 //
2989 (p11) fmpy.s1 T_hi = sgn_r, T_hi
2990         nop.i 999 ;;
2991 }
2992
2993 { .mfi
2994         nop.m 999
2995 (p0)  fmpy.s1 CORR = CORR, sgn_r
2996         nop.i 999 ;;
2997 }
2998
2999 { .mfi
3000         nop.m 999
3001 (p12) fmpy.s1 V_hi = V_hi,C_hi
3002         nop.i 999 ;;
3003 }
3004
3005 { .mfi
3006         nop.m 999
3007 //
3008 //    N even: V_hi = T_hi * V_hi
3009 //    N odd: V_hi  = C_hi * V_hi
3010 //
3011 (p0)  fma.s1 tanx = P, x, x
3012         nop.i 999
3013 }
3014
3015 { .mfi
3016         nop.m 999
3017 (p12) fnmpy.s1 C_hi = sgn_r, C_hi
3018         nop.i 999 ;;
3019 }
3020
3021 { .mfi
3022         nop.m 999
3023 //
3024 //    N even: V_lo = 1 - V_hi + C_hi
3025 //    N odd: V_lo = 1 - V_hi + T_hi
3026 //
3027 (p11) fadd.s1 CORR = CORR, T_lo
3028         nop.i 999
3029 }
3030
3031 { .mfi
3032         nop.m 999
3033 (p12) fsub.s1 CORR = CORR, C_lo
3034         nop.i 999 ;;
3035 }
3036
3037 { .mfi
3038         nop.m 999
3039 //
3040 //    N even and odd: tanx = x + x * P
3041 //    N even and odd: Sx = SC_inv * x
3042 //
3043 (p11) fsub.s1 D = C_hi, tanx
3044         nop.i 999
3045 }
3046
3047 { .mfi
3048         nop.m 999
3049 (p12) fadd.s1 D = T_hi, tanx
3050         nop.i 999 ;;
3051 }
3052
3053 { .mfi
3054         nop.m 999
3055 //
3056 //    N odd: CORR = SC_inv * C_hi
3057 //    N even: CORR = SC_inv * T_hi
3058 //
3059 (p0)  fnma.s1 D = V_hi, D, f1
3060         nop.i 999 ;;
3061 }
3062
3063 { .mfi
3064         nop.m 999
3065 //
3066 //    N even and odd: D = 1 - V_hi * D
3067 //    N even and odd: CORR = CORR * c
3068 //
3069 (p0)  fma.s1 V_hi = V_hi, D, V_hi
3070         nop.i 999 ;;
3071 }
3072
3073 { .mfi
3074         nop.m 999
3075 //
3076 //    N even and odd: V_hi = V_hi + V_hi * D
3077 //    N even and odd: CORR = sgn_r * CORR
3078 //
3079 (p11) fnma.s1 V_lo = V_hi, C_hi, f1
3080         nop.i 999
3081 }
3082
3083 { .mfi
3084         nop.m 999
3085 (p12) fnma.s1 V_lo = V_hi, T_hi, f1
3086         nop.i 999 ;;
3087 }
3088
3089 { .mfi
3090         nop.m 999
3091 //
3092 //    N even: CORR = COOR + T_lo
3093 //    N odd: CORR = CORR - C_lo
3094 //
3095 (p11) fma.s1 V_lo = tanx, V_hi, V_lo
3096         nop.i 999
3097 }
3098
3099 { .mfi
3100         nop.m 999
3101 (p12) fnma.s1 V_lo = tanx, V_hi, V_lo
3102         nop.i 999 ;;
3103 }
3104
3105 { .mfi
3106         nop.m 999
3107 //
3108 //    N even: V_lo = V_lo + V_hi * tanx
3109 //    N odd: V_lo = V_lo - V_hi * tanx
3110 //
3111 (p11) fnma.s1 V_lo = C_lo, V_hi, V_lo
3112         nop.i 999
3113 }
3114
3115 { .mfi
3116         nop.m 999
3117 (p12) fnma.s1 V_lo = T_lo, V_hi, V_lo
3118         nop.i 999 ;;
3119 }
3120
3121 { .mfi
3122         nop.m 999
3123 //
3124 //    N  even: V_lo = V_lo - V_hi * C_lo
3125 //    N  odd: V_lo = V_lo - V_hi * T_lo
3126 //
3127 (p0)  fmpy.s1 V_lo = V_hi, V_lo
3128         nop.i 999 ;;
3129 }
3130
3131 { .mfi
3132         nop.m 999
3133 //
3134 //    N even and odd: V_lo = V_lo * V_hi
3135 //
3136 (p0)  fadd.s1 tail = V_hi, V_lo
3137         nop.i 999 ;;
3138 }
3139
3140 { .mfi
3141         nop.m 999
3142 //
3143 //    N even and odd: tail = V_hi + V_lo
3144 //
3145 (p0)  fma.s1 tail = tail, P, V_lo
3146         nop.i 999 ;;
3147 }
3148
3149 { .mfi
3150         nop.m 999
3151 //
3152 //    N even: T_hi = sgn_r * T_hi
3153 //    N odd : C_hi = -sgn_r * C_hi
3154 //
3155 (p0)  fma.s1 tail = tail, Sx, CORR
3156         nop.i 999 ;;
3157 }
3158
3159 { .mfi
3160         nop.m 999
3161 //
3162 //    N even and odd: tail = Sx * tail + CORR
3163 //
3164 (p0)  fma.s1 tail = V_hi, Sx, tail
3165         nop.i 999 ;;
3166 }
3167
3168 { .mfi
3169         nop.m 999
3170 //
3171 //    N even an odd: tail = Sx * V_hi + tail
3172 //
3173 (p11) fma.s0 Result = sgn_r, tail, T_hi
3174         nop.i 999
3175 }
3176
3177 { .mfb
3178         nop.m 999
3179 (p12) fma.s0 Result = sgn_r, tail, C_hi
3180 (p0)   br.ret.sptk b0 ;;
3181 }
3182
3183 .endp __libm_tan
3184 ASM_SIZE_DIRECTIVE(__libm_tan)
3185
3186
3187
3188 // *******************************************************************
3189 // *******************************************************************
3190 // *******************************************************************
3191 //
3192 //     Special Code to handle very large argument case.
3193 //     Call int pi_by_2_reduce(&x,&r)
3194 //     for |arguments| >= 2**63
3195 //     (Arg or x) is in f8
3196 //     Address to save r and c as double
3197
3198 //                 (1)                    (2)                 (3) (call)         (4)
3199 //            sp -> +               psp -> +            psp -> +           sp ->  +
3200 //                  |                      |                   |                  |
3201 //                  |                r50 ->| <- r50      f0  ->|           r50 -> | -> c
3202 //                  |                      |                   |                  |
3203 //         sp-32 -> | <- r50          f0 ->|             f0  ->| <- r50    r49 -> | -> r
3204 //                  |                      |                   |                  |
3205 //                  |               r49  ->| <- r49     Arg  ->| <- r49           | -> x
3206 //                  |                      |                   |                  |
3207 //         sp -64 ->|             sp -64 ->|          sp -64 ->|                  |
3208 //
3209 //            save pfs           save b0                                     restore gp
3210 //            save gp                                                        restore b0
3211 //                                                                           restore pfs
3212
3213
3214
3215 .proc __libm_callout
3216 __libm_callout:
3217 TAN_ARG_TOO_LARGE:
3218 .prologue
3219 // (1)
3220 { .mfi
3221         add   GR_Parameter_r =-32,sp                        // Parameter: r address
3222         nop.f 0
3223 .save   ar.pfs,GR_SAVE_PFS
3224         mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
3225 }
3226 { .mfi
3227 .fframe 64
3228         add sp=-64,sp                           // Create new stack
3229         nop.f 0
3230         mov GR_SAVE_GP=gp                       // Save gp
3231 };;
3232
3233 // (2)
3234 { .mmi
3235         stfe [GR_Parameter_r ] = f0,16                      // Clear Parameter r on stack
3236         add  GR_Parameter_X = 16,sp                        // Parameter x address
3237 .save   b0, GR_SAVE_B0
3238         mov GR_SAVE_B0=b0                       // Save b0
3239 };;
3240
3241 // (3)
3242 .body
3243 { .mib
3244         stfe [GR_Parameter_r ] = f0,-16                     // Clear Parameter c on stack
3245         nop.i 0
3246         nop.b 0
3247 }
3248 { .mib
3249         stfe [GR_Parameter_X] = Arg                        // Store Parameter x on stack
3250         nop.i 0
3251 (p0)    br.call.sptk b0=__libm_pi_by_2_reduce#
3252 }
3253 ;;
3254
3255
3256 // (4)
3257 { .mmi
3258         mov   gp = GR_SAVE_GP                  // Restore gp
3259 (p0)    mov   N_fix_gr = r8
3260         nop.i 999
3261 }
3262 ;;
3263
3264 { .mmi
3265 (p0)    ldfe  Arg        =[GR_Parameter_X],16
3266 (p0)    ldfs  TWO_TO_NEG2 = [table_ptr2],4
3267         nop.i 999
3268 }
3269 ;;
3270
3271
3272 { .mmb
3273 (p0)    ldfe  r =[GR_Parameter_r ],16
3274 (p0)    ldfs  NEGTWO_TO_NEG2 = [table_ptr2],4
3275         nop.b 999 ;;
3276 }
3277
3278 { .mfi
3279 (p0)    ldfe  c =[GR_Parameter_r ]
3280         nop.f 999
3281         nop.i 999 ;;
3282 }
3283
3284 { .mfi
3285         nop.m 999
3286 //
3287 //     Is |r| < 2**(-2)
3288 //
3289 (p0)   fcmp.lt.unc.s1  p6, p0 = r, TWO_TO_NEG2
3290         mov   b0 = GR_SAVE_B0                  // Restore return address
3291 }
3292 ;;
3293
3294 { .mfi
3295        nop.m 999
3296 (p6)   fcmp.gt.unc.s1  p6, p0 = r, NEGTWO_TO_NEG2
3297        mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
3298 }
3299 ;;
3300
3301 { .mbb
3302 .restore sp
3303         add   sp = 64,sp                       // Restore stack pointer
3304 (p6)   br.cond.spnt TAN_SMALL_R
3305 (p0)   br.cond.sptk TAN_NORMAL_R
3306 }
3307 ;;
3308 .endp __libm_callout
3309 ASM_SIZE_DIRECTIVE(__libm_callout)
3310
3311
3312 .proc __libm_TAN_SPECIAL
3313 __libm_TAN_SPECIAL:
3314
3315 //
3316 //     Code for NaNs, Unsupporteds, Infs, or +/- zero ?
3317 //     Invalid raised for Infs and SNaNs.
3318 //
3319
3320 { .mfb
3321         nop.m 999
3322 (p0)   fmpy.s0 Arg = Arg, f0
3323 (p0)   br.ret.sptk b0
3324 }
3325 .endp __libm_TAN_SPECIAL
3326 ASM_SIZE_DIRECTIVE(__libm_TAN_SPECIAL)
3327
3328
3329 .type __libm_pi_by_2_reduce#,@function
3330 .global __libm_pi_by_2_reduce#