sysdeps/ia64/fpu/s_tanl.S

   1 .file "tancotl.s"
   2
   3
   4 // Copyright (c) 2000 - 2003, Intel Corporation
   5 // All rights reserved.
   6 //
   7 // Contributed 2000 by the Intel Numerics Group, Intel Corporation
   8 //
   9 // Redistribution and use in source and binary forms, with or without
  10 // modification, are permitted provided that the following conditions are
  11 // met:
  12 //
  13 // * Redistributions of source code must retain the above copyright
  14 // notice, this list of conditions and the following disclaimer.
  15 //
  16 // * Redistributions in binary form must reproduce the above copyright
  17 // notice, this list of conditions and the following disclaimer in the
  18 // documentation and/or other materials provided with the distribution.
  19 //
  20 // * The name of Intel Corporation may not be used to endorse or promote
  21 // products derived from this software without specific prior written
  22 // permission.
  23
  24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35 //
  36 // Intel Corporation is the author of this code, and requests that all
  37 // problem reports or change requests be submitted to it directly at
  38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
  39 //
  40 //*********************************************************************
  41 //
  42 // History:
  43 //
  44 // 02/02/00 (hand-optimized)
  45 // 04/04/00 Unwind support added
  46 // 12/28/00 Fixed false invalid flags
  47 // 02/06/02 Improved speed
  48 // 05/07/02 Changed interface to __libm_pi_by_2_reduce
  49 // 05/30/02 Added cotl
  50 // 02/10/03 Reordered header: .section, .global, .proc, .align;
  51 //          used data8 for long double table values
  52 // 05/15/03 Reformatted data tables
  53 //
  54 //*********************************************************************
  55 //
  56 // Functions:   tanl(x) = tangent(x), for double-extended precision x values
  57 //              cotl(x) = cotangent(x), for double-extended precision x values
  58 //
  59 //*********************************************************************
  60 //
  61 // Resources Used:
  62 //
  63 //    Floating-Point Registers: f8 (Input and Return Value)
  64 //                              f9-f15
  65 //                              f32-f121
  66 //
  67 //    General Purpose Registers:
  68 //      r14-r26,r32-r57
  69 //
  70 //    Predicate Registers:      p6-p15
  71 //
  72 //*********************************************************************
  73 //
  74 // IEEE Special Conditions for tanl:
  75 //
  76 //    Denormal  fault raised on denormal inputs
  77 //    Overflow exceptions do not occur
  78 //    Underflow exceptions raised when appropriate for tan
  79 //    (No specialized error handling for this routine)
  80 //    Inexact raised when appropriate by algorithm
  81 //
  82 //    tanl(SNaN) = QNaN
  83 //    tanl(QNaN) = QNaN
  84 //    tanl(inf) = QNaN
  85 //    tanl(+/-0) = +/-0
  86 //
  87 //*********************************************************************
  88 //
  89 // IEEE Special Conditions for cotl:
  90 //
  91 //    Denormal  fault raised on denormal inputs
  92 //    Overflow exceptions occur at zero and near zero
  93 //    Underflow exceptions do not occur
  94 //    Inexact raised when appropriate by algorithm
  95 //
  96 //    cotl(SNaN) = QNaN
  97 //    cotl(QNaN) = QNaN
  98 //    cotl(inf) = QNaN
  99 //    cotl(+/-0) = +/-Inf and error handling is called
 100 //
 101 //*********************************************************************
 102 //
 103 //    Below are mathematical and algorithmic descriptions for tanl.
  104 //    For cotl we use the identity cot(x) = -tan(x + Pi/2).
 105 //    So, to compute cot(x) we just need to increment N (N = N + 1)
 106 //    and invert sign of the computed result.
 107 //
 108 //*********************************************************************
 109 //
 110 // Mathematical Description
 111 //
 112 // We consider the computation of FPTANL of Arg. Now, given
 113 //
 114 //      Arg = N pi/2  + alpha,          |alpha| <= pi/4,
 115 //
 116 // basic mathematical relationship shows that
 117 //
 118 //      tan( Arg ) =  tan( alpha )     if N is even;
 119 //                 = -cot( alpha )      otherwise.
 120 //
 121 // The value of alpha is obtained by argument reduction and
 122 // represented by two working precision numbers r and c where
 123 //
 124 //      alpha =  r  +  c     accurately.
 125 //
 126 // The reduction method is described in a previous write up.
 127 // The argument reduction scheme identifies 4 cases. For Cases 2
 128 // and 4, because |alpha| is small, tan(r+c) and -cot(r+c) can be
 129 // computed very easily by 2 or 3 terms of the Taylor series
 130 // expansion as follows:
 131 //
 132 // Case 2:
 133 // -------
 134 //
 135 //      tan(r + c) = r + c + r^3/3          ...accurately
 136 //     -cot(r + c) = -1/(r+c) + r/3          ...accurately
 137 //
 138 // Case 4:
 139 // -------
 140 //
 141 //      tan(r + c) = r + c + r^3/3 + 2r^5/15     ...accurately
 142 //     -cot(r + c) = -1/(r+c) + r/3 + r^3/45     ...accurately
 143 //
 144 //
 145 // The only cases left are Cases 1 and 3 of the argument reduction
 146 // procedure. These two cases will be merged since after the
  147 // argument is reduced in either case, we have the reduced argument
 148 // represented as r + c and that the magnitude |r + c| is not small
 149 // enough to allow the usage of a very short approximation.
 150 //
 151 // The greatest challenge of this task is that the second terms of
 152 // the Taylor series for tan(r) and -cot(r)
 153 //
 154 //      r + r^3/3 + 2 r^5/15 + ...
 155 //
 156 // and
 157 //
 158 //      -1/r + r/3 + r^3/45 + ...
 159 //
 160 // are not very small when |r| is close to pi/4 and the rounding
 161 // errors will be a concern if simple polynomial accumulation is
 162 // used. When |r| < 2^(-2), however, the second terms will be small
 163 // enough (5 bits or so of right shift) that a normal Horner
 164 // recurrence suffices. Hence there are two cases that we consider
 165 // in the accurate computation of tan(r) and cot(r), |r| <= pi/4.
 166 //
 167 // Case small_r: |r| < 2^(-2)
 168 // --------------------------
 169 //
  170 // Since Arg = N pi/2 + r + c accurately, we have
 171 //
 172 //      tan(Arg) =  tan(r+c)            for N even,
 173 //               = -cot(r+c)            otherwise.
 174 //
 175 // Here for this case, both tan(r) and -cot(r) can be approximated
 176 // by simple polynomials:
 177 //
 178 //      tan(r) =    r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 179 //     -cot(r) = -1/r + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 180 //
 181 // accurately. Since |r| is relatively small, tan(r+c) and
 182 // -cot(r+c) can be accurately approximated by replacing r with
 183 // r+c only in the first two terms of the corresponding polynomials.
 184 //
 185 // Note that P1_1 (and Q1_1 for that matter) approximates 1/3 to
 186 // almost 64 sig. bits, thus
 187 //
 188 //      P1_1 (r+c)^3 =  P1_1 r^3 + c * r^2     accurately.
 189 //
 190 // Hence,
 191 //
 192 //      tan(r+c) =    r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 193 //                     + c*(1 + r^2)
 194 //
 195 //        -cot(r+c) = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 196 //               + Q1_1*c
 197 //
 198 //
 199 // Case normal_r: 2^(-2) <= |r| <= pi/4
 200 // ------------------------------------
 201 //
 202 // This case is more likely than the previous one if one considers
 203 // r to be uniformly distributed in [-pi/4 pi/4].
 204 //
 205 // The required calculation is either
 206 //
 207 //      tan(r + c)  =  tan(r)  +  correction,  or
 208 //     -cot(r + c)  = -cot(r)  +  correction.
 209 //
 210 // Specifically,
 211 //
 212 //      tan(r + c) =  tan(r) + c tan'(r)  + O(c^2)
 213 //                 =  tan(r) + c sec^2(r) + O(c^2)
 214 //                 =  tan(r) + c SEC_sq     ...accurately
 215 //                as long as SEC_sq approximates sec^2(r)
 216 //                to, say, 5 bits or so.
 217 //
 218 // Similarly,
 219 //
 220 //     -cot(r + c) = -cot(r) - c cot'(r)  + O(c^2)
 221 //                 = -cot(r) + c csc^2(r) + O(c^2)
 222 //                 = -cot(r) + c CSC_sq     ...accurately
 223 //                as long as CSC_sq approximates csc^2(r)
 224 //                to, say, 5 bits or so.
 225 //
 226 // We therefore concentrate on accurately calculating tan(r) and
 227 // cot(r) for a working-precision number r, |r| <= pi/4 to within
 228 // 0.1% or so.
 229 //
 230 // We will employ a table-driven approach. Let
 231 //
 232 //      r = sgn_r * 2^k * 1.b_1 b_2 ... b_5 ... b_63
 233 //        = sgn_r * ( B + x )
 234 //
 235 // where
 236 //
 237 //      B = 2^k * 1.b_1 b_2 ... b_5 1
 238 //      x = |r| - B
 239 //
 240 // Now,
 241 //                   tan(B)  +   tan(x)
 242 //      tan( B + x ) =  ------------------------
 243 //                   1 -  tan(B)*tan(x)
 244 //
 245 //               /                         \
 246 //               |   tan(B)  +   tan(x)          |
 247
 248 //      = tan(B) +  | ------------------------ - tan(B) |
 249 //               |     1 -  tan(B)*tan(x)          |
 250 //               \                         /
 251 //
 252 //                 sec^2(B) * tan(x)
 253 //      = tan(B) + ------------------------
 254 //                 1 -  tan(B)*tan(x)
 255 //
 256 //                (1/[sin(B)*cos(B)]) * tan(x)
 257 //      = tan(B) + --------------------------------
 258 //                      cot(B)  -  tan(x)
 259 //
 260 //
 261 // Clearly, the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
 262 // calculated beforehand and stored in a table. Since
 263 //
 264 //      |x| <= 2^k * 2^(-6)  <= 2^(-7)  (because k = -1, -2)
 265 //
 266 // a very short polynomial will be sufficient to approximate tan(x)
 267 // accurately. The details involved in computing the last expression
 268 // will be given in the next section on algorithm description.
 269 //
 270 //
 271 // Now, we turn to the case where cot( B + x ) is needed.
 272 //
 273 //
 274 //                   1 - tan(B)*tan(x)
 275 //      cot( B + x ) =  ------------------------
 276 //                   tan(B)  +  tan(x)
 277 //
 278 //               /                           \
 279 //               |   1 - tan(B)*tan(x)              |
 280
 281 //      = cot(B) +  | ----------------------- - cot(B) |
 282 //               |     tan(B)  +  tan(x)            |
 283 //               \                           /
 284 //
 285 //               [tan(B) + cot(B)] * tan(x)
 286 //      = cot(B) - ----------------------------
 287 //                   tan(B)  +  tan(x)
 288 //
 289 //                (1/[sin(B)*cos(B)]) * tan(x)
 290 //      = cot(B) - --------------------------------
 291 //                      tan(B)  +  tan(x)
 292 //
 293 //
 294 // Note that the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) that
 295 // are needed are the same set of values needed in the previous
 296 // case.
 297 //
 298 // Finally, we can put all the ingredients together as follows:
 299 //
 300 //      Arg = N * pi/2 +  r + c          ...accurately
 301 //
 302 //      tan(Arg) =  tan(r) + correction    if N is even;
 303 //               = -cot(r) + correction    otherwise.
 304 //
 305 // For Cases 2 and 4,
 306 //
 307 //     Case 2:
 308 //     tan(Arg) =  tan(r + c) = r + c + r^3/3           N even
 309 //              = -cot(r + c) = -1/(r+c) + r/3           N odd
 310 //     Case 4:
 311 //     tan(Arg) =  tan(r + c) = r + c + r^3/3 + 2r^5/15  N even
 312 //              = -cot(r + c) = -1/(r+c) + r/3 + r^3/45  N odd
 313 //
 314 //
 315 // For Cases 1 and 3,
 316 //
 317 //     Case small_r: |r| < 2^(-2)
 318 //
 319 //      tan(Arg) =  r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 320 //                     + c*(1 + r^2)               N even
 321 //
 322 //               = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 323 //                     + Q1_1*c                    N odd
 324 //
 325 //     Case normal_r: 2^(-2) <= |r| <= pi/4
 326 //
 327 //      tan(Arg) =  tan(r) + c * sec^2(r)     N even
 328 //               = -cot(r) + c * csc^2(r)     otherwise
 329 //
 330 //     For N even,
 331 //
 332 //      tan(Arg) = tan(r) + c*sec^2(r)
 333 //               = tan( sgn_r * (B+x) ) + c * sec^2(|r|)
 334 //               = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(|r|) )
 335 //               = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(B) )
 336 //
 337 // since B approximates |r| to 2^(-6) in relative accuracy.
 338 //
 339 //                 /            (1/[sin(B)*cos(B)]) * tan(x)
 340 //    tan(Arg) = sgn_r * | tan(B) + --------------------------------
 341 //                 \                     cot(B)  -  tan(x)
 342 //                                        \
 343 //                       + CORR  |
 344
 345 //                                     /
 346 // where
 347 //
 348 //    CORR = sgn_r*c*tan(B)*SC_inv(B);  SC_inv(B) = 1/(sin(B)*cos(B)).
 349 //
 350 // For N odd,
 351 //
 352 //      tan(Arg) = -cot(r) + c*csc^2(r)
 353 //               = -cot( sgn_r * (B+x) ) + c * csc^2(|r|)
 354 //               = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(|r|) )
 355 //               = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(B) )
 356 //
 357 // since B approximates |r| to 2^(-6) in relative accuracy.
 358 //
 359 //                 /            (1/[sin(B)*cos(B)]) * tan(x)
 360 //    tan(Arg) = sgn_r * | -cot(B) + --------------------------------
 361 //                 \                     tan(B)  +  tan(x)
 362 //                                        \
 363 //                       + CORR  |
 364
 365 //                                     /
 366 // where
 367 //
 368 //    CORR = sgn_r*c*cot(B)*SC_inv(B);  SC_inv(B) = 1/(sin(B)*cos(B)).
 369 //
 370 //
 371 // The actual algorithm prescribes how all the mathematical formulas
 372 // are calculated.
 373 //
 374 //
 375 // 2. Algorithmic Description
 376 // ==========================
 377 //
 378 // 2.1 Computation for Cases 2 and 4.
 379 // ----------------------------------
 380 //
 381 // For Case 2, we use two-term polynomials.
 382 //
 383 //    For N even,
 384 //
 385 //    rsq := r * r
 386 //    Poly := c + r * rsq * P1_1
 387 //    Result := r + Poly          ...in user-defined rounding
 388 //
 389 //    For N odd,
 390 //    S_hi  := -frcpa(r)               ...8 bits
 391 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
 392 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
 393 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
 394 //    S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
 395 //    ...S_hi + S_lo is -1/(r+c) to extra precision
 396 //    S_lo  := S_lo + Q1_1*r
 397 //
 398 //    Result := S_hi + S_lo     ...in user-defined rounding
 399 //
 400 // For Case 4, we use three-term polynomials
 401 //
 402 //    For N even,
 403 //
 404 //    rsq := r * r
 405 //    Poly := c + r * rsq * (P1_1 + rsq * P1_2)
 406 //    Result := r + Poly          ...in user-defined rounding
 407 //
 408 //    For N odd,
 409 //    S_hi  := -frcpa(r)               ...8 bits
 410 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
 411 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
 412 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
 413 //    S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
 414 //    ...S_hi + S_lo is -1/(r+c) to extra precision
 415 //    rsq   := r * r
 416 //    P      := Q1_1 + rsq*Q1_2
 417 //    S_lo  := S_lo + r*P
 418 //
 419 //    Result := S_hi + S_lo     ...in user-defined rounding
 420 //
 421 //
 422 // Note that the coefficients P1_1, P1_2, Q1_1, and Q1_2 are
 423 // the same as those used in the small_r case of Cases 1 and 3
 424 // below.
 425 //
 426 //
 427 // 2.2 Computation for Cases 1 and 3.
 428 // ----------------------------------
 429 // This is further divided into the case of small_r,
 430 // where |r| < 2^(-2), and the case of normal_r, where |r| lies between
 431 // 2^(-2) and pi/4.
 432 //
 433 // Algorithm for the case of small_r
 434 // ---------------------------------
 435 //
 436 // For N even,
 437 //      rsq   := r * r
 438 //      Poly1 := rsq*(P1_1 + rsq*(P1_2 + rsq*P1_3))
 439 //      r_to_the_8    := rsq * rsq
 440 //      r_to_the_8    := r_to_the_8 * r_to_the_8
 441 //      Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... rsq*P1_9))
 442 //      CORR  := c * ( 1 + rsq )
 443 //      Poly  := Poly1 + r_to_the_8*Poly2
 444 //      Poly := r*Poly + CORR
 445 //      Result := r + Poly     ...in user-defined rounding
 446 //      ...note that Poly1 and r_to_the_8 can be computed in parallel
 447 //      ...with Poly2 (Poly1 is intentionally set to be much
 448 //      ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden)
 449 //
 450 // For N odd,
 451 //      S_hi  := -frcpa(r)               ...8 bits
 452 //      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
 453 //      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
 454 //      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
 455 //      S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
 456 //      ...S_hi + S_lo is -1/(r+c) to extra precision
 457 //      S_lo  := S_lo + Q1_1*c
 458 //
 459 //      ...S_hi and S_lo are computed in parallel with
 460 //      ...the following
 461 //      rsq := r*r
 462 //      P   := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7))
 463 //
 464 //      Poly :=  r*P + S_lo
 465 //      Result :=  S_hi  +  Poly      ...in user-defined rounding
 466 //
 467 //
 468 // Algorithm for the case of normal_r
 469 // ----------------------------------
 470 //
 471 // Here, we first consider the computation of tan( r + c ). As
 472 // presented in the previous section,
 473 //
 474 //      tan( r + c )  =  tan(r) + c * sec^2(r)
 475 //                 =  sgn_r * [ tan(B+x) + CORR ]
 476 //      CORR = sgn_r * c * tan(B) * 1/[sin(B)*cos(B)]
 477 //
  478 // because sec^2(r) = sec^2(|r|), and B approximates |r| to 6.5 bits.
 479 //
 480 //      tan( r + c ) =
 481 //           /           (1/[sin(B)*cos(B)]) * tan(x)
 482 //      sgn_r * | tan(B) + --------------------------------  +
 483 //           \                     cot(B)  -  tan(x)
 484 //                                \
 485 //                          CORR  |
 486
 487 //                                /
 488 //
 489 // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
 490 // calculated beforehand and stored in a table. Specifically,
 491 // the table values are
 492 //
 493 //      tan(B)             as  T_hi  +  T_lo;
 494 //      cot(B)             as  C_hi  +  C_lo;
 495 //      1/[sin(B)*cos(B)]  as  SC_inv
 496 //
 497 // T_hi, C_hi are in  double-precision  memory format;
 498 // T_lo, C_lo are in  single-precision  memory format;
 499 // SC_inv     is  in extended-precision memory format.
 500 //
 501 // The value of tan(x) will be approximated by a short polynomial of
 502 // the form
 503 //
 504 //      tan(x)  as  x  +  x * P, where
 505 //           P  =   x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
 506 //
 507 // Because |x| <= 2^(-7), cot(B) - x approximates cot(B) - tan(x)
 508 // to a relative accuracy better than 2^(-20). Thus, a good
 509 // initial guess of 1/( cot(B) - tan(x) ) to initiate the iterative
 510 // division is:
 511 //
 512 //      1/(cot(B) - tan(x))      is approximately
 513 //      1/(cot(B) -   x)         is
 514 //      tan(B)/(1 - x*tan(B))    is approximately
 515 //      T_hi / ( 1 - T_hi * x )  is approximately
 516 //
  517 //      T_hi * [ 1 + (T_hi * x) + (T_hi * x)^2 ]
 518 //
  519 // The calculation of tan(r+c) therefore proceeds as follows:
 520 //
 521 //      Tx     := T_hi * x
 522 //      xsq     := x * x
 523 //
 524 //      V_hi     := T_hi*(1 + Tx*(1 + Tx))
  525 //      P     := xsq * (P2_1 + xsq*(P2_2 + xsq*P2_3))
 526 //      ...V_hi serves as an initial guess of 1/(cot(B) - tan(x))
 527 //         ...good to about 20 bits of accuracy
 528 //
 529 //      tanx     := x + x*P
 530 //      D     := C_hi - tanx
 531 //      ...D is a double precision denominator: cot(B) - tan(x)
 532 //
 533 //      V_hi     := V_hi + V_hi*(1 - V_hi*D)
 534 //      ....V_hi approximates 1/(cot(B)-tan(x)) to 40 bits
 535 //
 536 //      V_lo     := V_hi * ( [ (1 - V_hi*C_hi) + V_hi*tanx ]
 537 //                           - V_hi*C_lo )   ...observe all order
 538 //         ...V_hi + V_lo approximates 1/(cot(B) - tan(x))
 539 //      ...to extra accuracy
 540 //
 541 //      ...               SC_inv(B) * (x + x*P)
 542 //      ...   tan(B) +      ------------------------- + CORR
 543 //         ...                cot(B) - (x + x*P)
 544 //      ...
 545 //      ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 546 //      ...
 547 //
 548 //      Sx     := SC_inv * x
 549 //      CORR     := sgn_r * c * SC_inv * T_hi
 550 //
 551 //      ...put the ingredients together to compute
 552 //      ...               SC_inv(B) * (x + x*P)
 553 //      ...   tan(B) +      ------------------------- + CORR
 554 //         ...                cot(B) - (x + x*P)
 555 //      ...
 556 //      ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 557 //      ...
 558 //      ... = T_hi + T_lo + CORR +
 559 //      ...    Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
 560 //
 561 //      CORR := CORR + T_lo
 562 //      tail := V_lo + P*(V_hi + V_lo)
 563 //         tail := Sx * tail  +  CORR
 564 //      tail := Sx * V_hi  +  tail
 565 //         T_hi := sgn_r * T_hi
 566 //
  567 //         ...T_hi + sgn_r*tail  now approximates
 568 //      ...sgn_r*(tan(B+x) + CORR) accurately
 569 //
 570 //      Result :=  T_hi + sgn_r*tail  ...in user-defined
 571 //                           ...rounding control
 572 //      ...It is crucial that independent paths be fully
 573 //      ...exploited for performance's sake.
 574 //
 575 //
 576 // Next, we consider the computation of -cot( r + c ). As
 577 // presented in the previous section,
 578 //
 579 //        -cot( r + c )  =  -cot(r) + c * csc^2(r)
 580 //                 =  sgn_r * [ -cot(B+x) + CORR ]
 581 //      CORR = sgn_r * c * cot(B) * 1/[sin(B)*cos(B)]
 582 //
  583 // because csc^2(r) = csc^2(|r|), and B approximates |r| to 6.5 bits.
 584 //
 585 //        -cot( r + c ) =
 586 //           /             (1/[sin(B)*cos(B)]) * tan(x)
 587 //      sgn_r * | -cot(B) + --------------------------------  +
 588 //           \                     tan(B)  +  tan(x)
 589 //                                \
 590 //                          CORR  |
 591
 592 //                                /
 593 //
 594 // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
 595 // calculated beforehand and stored in a table. Specifically,
 596 // the table values are
 597 //
 598 //      tan(B)             as  T_hi  +  T_lo;
 599 //      cot(B)             as  C_hi  +  C_lo;
 600 //      1/[sin(B)*cos(B)]  as  SC_inv
 601 //
 602 // T_hi, C_hi are in  double-precision  memory format;
 603 // T_lo, C_lo are in  single-precision  memory format;
 604 // SC_inv     is  in extended-precision memory format.
 605 //
 606 // The value of tan(x) will be approximated by a short polynomial of
 607 // the form
 608 //
 609 //      tan(x)  as  x  +  x * P, where
 610 //           P  =   x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
 611 //
 612 // Because |x| <= 2^(-7), tan(B) + x approximates tan(B) + tan(x)
 613 // to a relative accuracy better than 2^(-18). Thus, a good
 614 // initial guess of 1/( tan(B) + tan(x) ) to initiate the iterative
 615 // division is:
 616 //
 617 //      1/(tan(B) + tan(x))      is approximately
 618 //      1/(tan(B) +   x)         is
 619 //      cot(B)/(1 + x*cot(B))    is approximately
 620 //      C_hi / ( 1 + C_hi * x )  is approximately
 621 //
 622 //      C_hi * [ 1 - (C_hi * x) + (C_hi * x)^2 ]
 623 //
  624 // The calculation of -cot(r+c) therefore proceeds as follows:
 625 //
 626 //      Cx     := C_hi * x
 627 //      xsq     := x * x
 628 //
 629 //      V_hi     := C_hi*(1 - Cx*(1 - Cx))
  630 //      P     := xsq * (P2_1 + xsq*(P2_2 + xsq*P2_3))
 631 //      ...V_hi serves as an initial guess of 1/(tan(B) + tan(x))
 632 //         ...good to about 18 bits of accuracy
 633 //
 634 //      tanx     := x + x*P
 635 //      D     := T_hi + tanx
 636 //      ...D is a double precision denominator: tan(B) + tan(x)
 637 //
 638 //      V_hi     := V_hi + V_hi*(1 - V_hi*D)
 639 //      ....V_hi approximates 1/(tan(B)+tan(x)) to 40 bits
 640 //
 641 //      V_lo     := V_hi * ( [ (1 - V_hi*T_hi) - V_hi*tanx ]
 642 //                           - V_hi*T_lo )   ...observe all order
 643 //         ...V_hi + V_lo approximates 1/(tan(B) + tan(x))
 644 //      ...to extra accuracy
 645 //
 646 //      ...               SC_inv(B) * (x + x*P)
 647 //      ...  -cot(B) +      ------------------------- + CORR
 648 //         ...                tan(B) + (x + x*P)
 649 //      ...
 650 //      ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 651 //      ...
 652 //
 653 //      Sx     := SC_inv * x
 654 //      CORR     := sgn_r * c * SC_inv * C_hi
 655 //
 656 //      ...put the ingredients together to compute
 657 //      ...               SC_inv(B) * (x + x*P)
 658 //      ...  -cot(B) +      ------------------------- + CORR
 659 //         ...                tan(B) + (x + x*P)
 660 //      ...
 661 //      ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 662 //      ...
 663 //      ... =-C_hi - C_lo + CORR +
 664 //      ...    Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
 665 //
 666 //      CORR := CORR - C_lo
 667 //      tail := V_lo + P*(V_hi + V_lo)
 668 //         tail := Sx * tail  +  CORR
 669 //      tail := Sx * V_hi  +  tail
 670 //         C_hi := -sgn_r * C_hi
 671 //
 672 //         ...C_hi + sgn_r*tail now approximates
 673 //      ...sgn_r*(-cot(B+x) + CORR) accurately
 674 //
 675 //      Result :=  C_hi + sgn_r*tail   in user-defined rounding control
 676 //      ...It is crucial that independent paths be fully
 677 //      ...exploited for performance's sake.
 678 //
 679 // 3. Implementation Notes
 680 // =======================
 681 //
 682 //   Table entries T_hi, T_lo; C_hi, C_lo; SC_inv
 683 //
 684 //   Recall that 2^(-2) <= |r| <= pi/4;
 685 //
 686 //      r = sgn_r * 2^k * 1.b_1 b_2 ... b_63
 687 //
 688 //   and
 689 //
 690 //        B = 2^k * 1.b_1 b_2 b_3 b_4 b_5 1
 691 //
 692 //   Thus, for k = -2, possible values of B are
 693 //
 694 //          B = 2^(-2) * ( 1 + index/32  +  1/64 ),
 695 //      index ranges from 0 to 31
 696 //
 697 //   For k = -1, however, since |r| <= pi/4 = 0.78...
 698 //   possible values of B are
 699 //
 700 //        B = 2^(-1) * ( 1 + index/32  +  1/64 )
 701 //      index ranges from 0 to 19.
 702 //
 703 //
 704
  705 RODATA // NOTE(review): macro defined outside this chunk; presumably selects the read-only data section
  706 .align 16 // alignment request for the constant tables that follow
 707
  708 LOCAL_OBJECT_START(TANL_BASE_CONSTANTS)
  709 // Long double constants: each data8 pair is (64-bit significand, sign/exponent word).
  710 tanl_table_1:
  711 data8    0xA2F9836E4E44152A, 0x00003FFE // two_by_pi: 2/pi
  712 data8    0xC84D32B0CE81B9F1, 0x00004016 // P_0 (magnitude about 1.56 * 2^23; Inv_P_0 in tanl_table_2 is its reciprocal)
  713 data8    0xC90FDAA22168C235, 0x00003FFF // P_1 (value is approximately pi/2)
  714 data8    0xECE675D1FC8F8CBB, 0x0000BFBD // P_2 (negative, unbiased exponent -66)
  715 data8    0xB7ED8FBBACC19C60, 0x0000BF7C // P_3 (negative, unbiased exponent -131)
  716 LOCAL_OBJECT_END(TANL_BASE_CONSTANTS)
 717
  718 LOCAL_OBJECT_START(tanl_table_2)
  719 data8    0xC90FDAA22168C234, 0x00003FFE // PI_BY_4: pi/4 as (significand, sign/exponent) long double pair
  720 data8    0xA397E5046EC6B45A, 0x00003FE7 // Inv_P_0: approximately 1/P_0 (P_0 is in tanl_table_1)
  721 data8    0x8D848E89DBD171A1, 0x0000BFBF // d_1 (negative, unbiased exponent -64); NOTE(review): role not described in this chunk
  722 data8    0xD5394C3618A66F8E, 0x0000BF7C // d_2 (negative, unbiased exponent -131); NOTE(review): role not described in this chunk
  723 data4    0x3E800000 // two**-2 (0.25 in single-precision format)
  724 data4    0xBE800000 // -two**-2 (-0.25 in single-precision format)
  725 data4    0x00000000 // pad
  726 data4    0x00000000 // pad (keeps the object size a multiple of 16 bytes)
  727 LOCAL_OBJECT_END(tanl_table_2)
 728
  729 LOCAL_OBJECT_START(tanl_table_p1)
  730 data8    0xAAAAAAAAAAAAAABD, 0x00003FFD // P1_1: r^3 coefficient of tan(r) = r + P1_1 r^3 + ... + P1_9 r^19 (close to 1/3)
  731 data8    0x8888888888882E6A, 0x00003FFC // P1_2: r^5 coefficient (close to 2/15)
  732 data8    0xDD0DD0DD0F0177B6, 0x00003FFA // P1_3: r^7 coefficient
  733 data8    0xB327A440646B8C6D, 0x00003FF9 // P1_4: r^9 coefficient
  734 data8    0x91371B251D5F7D20, 0x00003FF8 // P1_5: r^11 coefficient
  735 data8    0xEB69A5F161C67914, 0x00003FF6 // P1_6: r^13 coefficient
  736 data8    0xBEDD37BE019318D2, 0x00003FF5 // P1_7: r^15 coefficient
  737 data8    0x9979B1463C794015, 0x00003FF4 // P1_8: r^17 coefficient
  738 data8    0x8EBD21A38C6EB58A, 0x00003FF3 // P1_9: r^19 coefficient
  739 LOCAL_OBJECT_END(tanl_table_p1)
 740
  741 LOCAL_OBJECT_START(tanl_table_q1)
  742 data8    0xAAAAAAAAAAAAAAB4, 0x00003FFD // Q1_1: r coefficient of -cot(r) = -1/r + Q1_1 r + ... + Q1_7 r^13 (close to 1/3)
  743 data8    0xB60B60B60B5FC93E, 0x00003FF9 // Q1_2: r^3 coefficient (close to 1/45)
  744 data8    0x8AB355E00C9BBFBF, 0x00003FF6 // Q1_3: r^5 coefficient
  745 data8    0xDDEBBC89CBEE3D4C, 0x00003FF2 // Q1_4: r^7 coefficient
  746 data8    0xB3548A685F80BBB6, 0x00003FEF // Q1_5: r^9 coefficient
  747 data8    0x913625604CED5BF1, 0x00003FEC // Q1_6: r^11 coefficient
  748 data8    0xF189D95A8EE92A83, 0x00003FE8 // Q1_7: r^13 coefficient
  749 LOCAL_OBJECT_END(tanl_table_q1)
 750
  751 LOCAL_OBJECT_START(tanl_table_p2)
  752 data8    0xAAAAAAAAAAAB362F, 0x00003FFD // P2_1: x^2 coefficient of P, where tan(x) = x + x*P (normal_r case; close to 1/3)
  753 data8    0x88888886E97A6097, 0x00003FFC // P2_2: x^4 coefficient of P
  754 data8    0xDD108EE025E716A1, 0x00003FFA // P2_3: x^6 coefficient of P
  755 LOCAL_OBJECT_END(tanl_table_p2)
 756
  757 LOCAL_OBJECT_START(tanl_table_tm2)
  758 //  tan(B) for k = -2, stored as T_hi + T_lo; 32 entries of 16 bytes each
  759 //  Entries T_hi   double-precision memory format (the data8 of each entry)
  760 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
  761 //  Entries T_lo  single-precision memory format (first data4; second data4 is zero padding)
  762 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
  763 //
  764 data8 0x3FD09BC362400794 // Index 0: T_hi
  765 data4 0x23A05C32, 0x00000000 // Index 0: T_lo, pad
  766 data8 0x3FD124A9DFFBC074
  767 data4 0x240078B2, 0x00000000
  768 data8 0x3FD1AE235BD4920F
  769 data4 0x23826B8E, 0x00000000
  770 data8 0x3FD2383515E2701D
  771 data4 0x22D31154, 0x00000000
  772 data8 0x3FD2C2E463739C2D
  773 data4 0x2265C9E2, 0x00000000
  774 data8 0x3FD34E36AFEEA48B
  775 data4 0x245C05EB, 0x00000000
  776 data8 0x3FD3DA317DBB35D1
  777 data4 0x24749F2D, 0x00000000
  778 data8 0x3FD466DA67321619
  779 data4 0x2462CECE, 0x00000000
  780 data8 0x3FD4F4371F94A4D5
  781 data4 0x246D0DF1, 0x00000000
  782 data8 0x3FD5824D740C3E6D
  783 data4 0x240A85B5, 0x00000000
  784 data8 0x3FD611234CB1E73D
  785 data4 0x23F96E33, 0x00000000
  786 data8 0x3FD6A0BEAD9EA64B
  787 data4 0x247C5393, 0x00000000
  788 data8 0x3FD73125B804FD01
  789 data4 0x241F3B29, 0x00000000
  790 data8 0x3FD7C25EAB53EE83
  791 data4 0x2479989B, 0x00000000
  792 data8 0x3FD8546FE6640EED
  793 data4 0x23B343BC, 0x00000000
  794 data8 0x3FD8E75FE8AF1892
  795 data4 0x241454D1, 0x00000000
  796 data8 0x3FD97B3553928BDA
  797 data4 0x238613D9, 0x00000000
  798 data8 0x3FDA0FF6EB9DE4DE
  799 data4 0x22859FA7, 0x00000000
  800 data8 0x3FDAA5AB99ECF92D
  801 data4 0x237A6D06, 0x00000000
  802 data8 0x3FDB3C5A6D8F1796
  803 data4 0x23952F6C, 0x00000000
  804 data8 0x3FDBD40A9CFB8BE4
  805 data4 0x2280FC95, 0x00000000
  806 data8 0x3FDC6CC387943100
  807 data4 0x245D2EC0, 0x00000000
  808 data8 0x3FDD068CB736C500
  809 data4 0x23C4AD7D, 0x00000000
  810 data8 0x3FDDA16DE1DDBC31
  811 data4 0x23D076E6, 0x00000000
  812 data8 0x3FDE3D6EEB515A93
  813 data4 0x244809A6, 0x00000000
  814 data8 0x3FDEDA97E6E9E5F1
  815 data4 0x220856C8, 0x00000000
  816 data8 0x3FDF78F11963CE69
  817 data4 0x244BE993, 0x00000000
  818 data8 0x3FE00C417D635BCE
  819 data4 0x23D21799, 0x00000000
  820 data8 0x3FE05CAB1C302CD3
  821 data4 0x248A1B1D, 0x00000000
  822 data8 0x3FE0ADB9DB6A1FA0
  823 data4 0x23D53E33, 0x00000000
  824 data8 0x3FE0FF724A20BA81
  825 data4 0x24DB9ED5, 0x00000000
  826 data8 0x3FE151D9153FA6F5 // Index 31: T_hi
  827 data4 0x24E9E451, 0x00000000 // Index 31: T_lo, pad
  828 LOCAL_OBJECT_END(tanl_table_tm2)
 829
 830 LOCAL_OBJECT_START(tanl_table_tm1)
 831 //  20 entries of 16 bytes each: data8 T_hi, data4 T_lo, then a zero data4.
 832 //  Entries T_hi   double-precision memory format
 833 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 834 //  Entries T_lo  single-precision memory format
 835 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 836 //  NOTE(review): T_hi + T_lo presumably approximates tan(B) - confirm.
 837 data8 0x3FE1CEC4BA1BE39E
 838 data4 0x24B60F9E, 0x00000000
 839 data8 0x3FE277E45ABD9B2D
 840 data4 0x248C2474, 0x00000000
 841 data8 0x3FE324180272B110
 842 data4 0x247B8311, 0x00000000
 843 data8 0x3FE3D38B890E2DF0
 844 data4 0x24C55751, 0x00000000
 845 data8 0x3FE4866D46236871
 846 data4 0x24E5BC34, 0x00000000
 847 data8 0x3FE53CEE45E044B0
 848 data4 0x24001BA4, 0x00000000
 849 data8 0x3FE5F74282EC06E4
 850 data4 0x24B973DC, 0x00000000
 851 data8 0x3FE6B5A125DF43F9
 852 data4 0x24895440, 0x00000000
 853 data8 0x3FE77844CAFD348C
 854 data4 0x240021CA, 0x00000000
 855 data8 0x3FE83F6BCEED6B92
 856 data4 0x24C45372, 0x00000000
 857 data8 0x3FE90B58A34F3665
 858 data4 0x240DAD33, 0x00000000
 859 data8 0x3FE9DC522C1E56B4
 860 data4 0x24F846CE, 0x00000000
 861 data8 0x3FEAB2A427041578
 862 data4 0x2323FB6E, 0x00000000
 863 data8 0x3FEB8E9F9DD8C373
 864 data4 0x24B3090B, 0x00000000
 865 data8 0x3FEC709B65C9AA7B
 866 data4 0x2449F611, 0x00000000
 867 data8 0x3FED58F4ACCF8435
 868 data4 0x23616A7E, 0x00000000
 869 data8 0x3FEE480F97635082
 870 data4 0x24C2FEAE, 0x00000000
 871 data8 0x3FEF3E57F0ACC544
 872 data4 0x242CE964, 0x00000000
 873 data8 0x3FF01E20F7E06E4B
 874 data4 0x2480D3EE, 0x00000000
 875 data8 0x3FF0A1258A798A69
 876 data4 0x24DB8967, 0x00000000
 877 LOCAL_OBJECT_END(tanl_table_tm1)
 878
 879 LOCAL_OBJECT_START(tanl_table_cm2)
 880 //  32 entries of 16 bytes each: data8 C_hi, data4 C_lo, then a zero data4.
 881 //  Entries C_hi   double-precision memory format
 882 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 883 //  Entries C_lo  single-precision memory format
 884 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 885 //  NOTE(review): C_hi + C_lo presumably approximates cot(B) - confirm.
 886 data8 0x400ED3E2E63EFBD0
 887 data4 0x259D94D4, 0x00000000
 888 data8 0x400DDDB4C515DAB5
 889 data4 0x245F0537, 0x00000000
 890 data8 0x400CF57ABE19A79F
 891 data4 0x25D4EA9F, 0x00000000
 892 data8 0x400C1A06D15298ED
 893 data4 0x24AE40A0, 0x00000000
 894 data8 0x400B4A4C164B2708
 895 data4 0x25A5AAB6, 0x00000000
 896 data8 0x400A855A5285B068
 897 data4 0x25524F18, 0x00000000
 898 data8 0x4009CA5A3FFA549F
 899 data4 0x24C999C0, 0x00000000
 900 data8 0x4009188A646AF623
 901 data4 0x254FD801, 0x00000000
 902 data8 0x40086F3C6084D0E7
 903 data4 0x2560F5FD, 0x00000000
 904 data8 0x4007CDD2A29A76EE
 905 data4 0x255B9D19, 0x00000000
 906 data8 0x400733BE6C8ECA95
 907 data4 0x25CB021B, 0x00000000
 908 data8 0x4006A07E1F8DDC52
 909 data4 0x24AB4722, 0x00000000
 910 data8 0x4006139BC298AD58
 911 data4 0x252764E2, 0x00000000
 912 data8 0x40058CABBAD7164B
 913 data4 0x24DAF5DB, 0x00000000
 914 data8 0x40050B4BAE31A5D3
 915 data4 0x25EA20F4, 0x00000000
 916 data8 0x40048F2189F85A8A
 917 data4 0x2583A3E8, 0x00000000
 918 data8 0x400417DAA862380D
 919 data4 0x25DCC4CC, 0x00000000
 920 data8 0x4003A52B1088FCFE
 921 data4 0x2430A492, 0x00000000
 922 data8 0x400336CCCD3527D5
 923 data4 0x255F77CF, 0x00000000
 924 data8 0x4002CC7F5760766D
 925 data4 0x25DA0BDA, 0x00000000
 926 data8 0x4002660711CE02E3
 927 data4 0x256FF4A2, 0x00000000
 928 data8 0x4002032CD37BBE04
 929 data4 0x25208AED, 0x00000000
 930 data8 0x4001A3BD7F050775
 931 data4 0x24B72DD6, 0x00000000
 932 data8 0x40014789A554848A
 933 data4 0x24AB4DAA, 0x00000000
 934 data8 0x4000EE65323E81B7
 935 data4 0x2584C440, 0x00000000
 936 data8 0x4000982721CF1293
 937 data4 0x25C9428D, 0x00000000
 938 data8 0x400044A93D415EEB
 939 data4 0x25DC8482, 0x00000000
 940 data8 0x3FFFE78FBD72C577
 941 data4 0x257F5070, 0x00000000
 942 data8 0x3FFF4AC375EFD28E
 943 data4 0x23EBBF7A, 0x00000000
 944 data8 0x3FFEB2AF60B52DDE
 945 data4 0x22EECA07, 0x00000000
 946 data8 0x3FFE1F1935204180
 947 data4 0x24191079, 0x00000000
 948 data8 0x3FFD8FCA54F7E60A
 949 data4 0x248D3058, 0x00000000
 950 LOCAL_OBJECT_END(tanl_table_cm2)
 951
 952 LOCAL_OBJECT_START(tanl_table_cm1)
 953 //  20 entries of 16 bytes each: data8 C_hi, data4 C_lo, then a zero data4.
 954 //  Entries C_hi   double-precision memory format
 955 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 956 //  Entries C_lo  single-precision memory format
 957 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 958 //  NOTE(review): C_hi + C_lo presumably approximates cot(B) - confirm.
 959 data8 0x3FFCC06A79F6FADE
 960 data4 0x239C7886, 0x00000000
 961 data8 0x3FFBB91F891662A6
 962 data4 0x250BD191, 0x00000000
 963 data8 0x3FFABFB6529F155D
 964 data4 0x256CC3E6, 0x00000000
 965 data8 0x3FF9D3002E964AE9
 966 data4 0x250843E3, 0x00000000
 967 data8 0x3FF8F1EF89DCB383
 968 data4 0x2277C87E, 0x00000000
 969 data8 0x3FF81B937C87DBD6
 970 data4 0x256DA6CF, 0x00000000
 971 data8 0x3FF74F141042EDE4
 972 data4 0x2573D28A, 0x00000000
 973 data8 0x3FF68BAF1784B360
 974 data4 0x242E489A, 0x00000000
 975 data8 0x3FF5D0B57C923C4C
 976 data4 0x2532D940, 0x00000000
 977 data8 0x3FF51D88F418EF20
 978 data4 0x253C7DD6, 0x00000000
 979 data8 0x3FF4719A02F88DAE
 980 data4 0x23DB59BF, 0x00000000
 981 data8 0x3FF3CC6649DA0788
 982 data4 0x252B4756, 0x00000000
 983 data8 0x3FF32D770B980DB8
 984 data4 0x23FE585F, 0x00000000
 985 data8 0x3FF2945FE56C987A
 986 data4 0x25378A63, 0x00000000
 987 data8 0x3FF200BDB16523F6
 988 data4 0x247BB2E0, 0x00000000
 989 data8 0x3FF172358CE27778
 990 data4 0x24446538, 0x00000000
 991 data8 0x3FF0E873FDEFE692
 992 data4 0x2514638F, 0x00000000
 993 data8 0x3FF0632C33154062
 994 data4 0x24A7FC27, 0x00000000
 995 data8 0x3FEFC42EB3EF115F
 996 data4 0x248FD0FE, 0x00000000
 997 data8 0x3FEEC9E8135D26F6
 998 data4 0x2385C719, 0x00000000
 999 LOCAL_OBJECT_END(tanl_table_cm1)
1000
1001 LOCAL_OBJECT_START(tanl_table_scim2)
1002 //  32 entries of 16 bytes each: two data8 words per entry.
1003 //  Entries SC_inv in Swapped IEEE format (extended)
1004 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
1005 //  NOTE(review): first word looks like the 64-bit significand, second the
//  sign/exponent; SC_inv presumably is 1/(sin(B)*cos(B)) - confirm.
1006 data8    0x839D6D4A1BF30C9E, 0x00004001
1007 data8    0x80092804554B0EB0, 0x00004001
1008 data8    0xF959F94CA1CF0DE9, 0x00004000
1009 data8    0xF3086BA077378677, 0x00004000
1010 data8    0xED154515CCD4723C, 0x00004000
1011 data8    0xE77909441C27CF25, 0x00004000
1012 data8    0xE22D037D8DDACB88, 0x00004000
1013 data8    0xDD2B2D8A89C73522, 0x00004000
1014 data8    0xD86E1A23BB2C1171, 0x00004000
1015 data8    0xD3F0E288DFF5E0F9, 0x00004000
1016 data8    0xCFAF16B1283BEBD5, 0x00004000
1017 data8    0xCBA4AFAA0D88DD53, 0x00004000
1018 data8    0xC7CE03CCCA67C43D, 0x00004000
1019 data8    0xC427BC820CA0DDB0, 0x00004000
1020 data8    0xC0AECD57F13D8CAB, 0x00004000
1021 data8    0xBD606C3871ECE6B1, 0x00004000
1022 data8    0xBA3A0A96A44C4929, 0x00004000
1023 data8    0xB7394F6FE5CCCEC1, 0x00004000
1024 data8    0xB45C12039637D8BC, 0x00004000
1025 data8    0xB1A0552892CB051B, 0x00004000
1026 data8    0xAF04432B6BA2FFD0, 0x00004000
1027 data8    0xAC862A237221235F, 0x00004000
1028 data8    0xAA2478AF5F00A9D1, 0x00004000
1029 data8    0xA7DDBB0C81E082BF, 0x00004000
1030 data8    0xA5B0987D45684FEE, 0x00004000
1031 data8    0xA39BD0F5627A8F53, 0x00004000
1032 data8    0xA19E3B036EC5C8B0, 0x00004000
1033 data8    0x9FB6C1F091CD7C66, 0x00004000
1034 data8    0x9DE464101FA3DF8A, 0x00004000
1035 data8    0x9C263139A8F6B888, 0x00004000
1036 data8    0x9A7B4968C27B0450, 0x00004000
1037 data8    0x98E2DB7E5EE614EE, 0x00004000
1038 LOCAL_OBJECT_END(tanl_table_scim2)
1039
1040 LOCAL_OBJECT_START(tanl_table_scim1)
1041 //  20 entries of 16 bytes each: two data8 words per entry.
1042 //  Entries SC_inv in Swapped IEEE format (extended)
1043 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
1044 //  NOTE(review): first word looks like the 64-bit significand, second the
//  sign/exponent; SC_inv presumably is 1/(sin(B)*cos(B)) - confirm.
1045 data8    0x969F335C13B2B5BA, 0x00004000
1046 data8    0x93D446D9D4C0F548, 0x00004000
1047 data8    0x9147094F61B798AF, 0x00004000
1048 data8    0x8EF317CC758787AC, 0x00004000
1049 data8    0x8CD498B3B99EEFDB, 0x00004000
1050 data8    0x8AE82A7DDFF8BC37, 0x00004000
1051 data8    0x892AD546E3C55D42, 0x00004000
1052 data8    0x8799FEA9D15573C1, 0x00004000
1053 data8    0x86335F88435A4B4C, 0x00004000
1054 data8    0x84F4FB6E3E93A87B, 0x00004000
1055 data8    0x83DD195280A382FB, 0x00004000
1056 data8    0x82EA3D7FA4CB8C9E, 0x00004000
1057 data8    0x821B247C6861D0A8, 0x00004000
1058 data8    0x816EBED163E8D244, 0x00004000
1059 data8    0x80E42D9127E4CFC6, 0x00004000
1060 data8    0x807ABF8D28E64AFD, 0x00004000
1061 data8    0x8031EF26863B4FD8, 0x00004000
1062 data8    0x800960ADAE8C11FD, 0x00004000
1063 data8    0x8000E1475FDBEC21, 0x00004000
1064 data8    0x80186650A07791FA, 0x00004000
1065 LOCAL_OBJECT_END(tanl_table_scim1)
1066
1067 Arg                 = f8        // Input x; f8 is shared by Save_Norm_Arg, Result and r
1068 Save_Norm_Arg       = f8        // For input to reduction routine
1069 Result              = f8        // Value produced by the final .s0 operation
1070 r                   = f8        // For output from reduction routine
1071 c                   = f9        // For output from reduction routine
1072 U_2                 = f10
1073 rsq                 = f11       // r * r
1074 C_hi                = f12
1075 C_lo                = f13
1076 T_hi                = f14
1077 T_lo                = f15
1078
1079 d_1                 = f33       // Table constant, loaded for 2^24 <= |x| < 2^63
1080 N_0                 = f34       // Norm_Arg * Inv_P_0, then its integer part
1081 tail                = f35
1082 tanx                = f36
1083 Cx                  = f37
1084 Sx                  = f38
1085 sgn_r               = f39
1086 CORR                = f40
1087 P                   = f41
1088 D                   = f42
1089 ArgPrime            = f43       // Norm_Arg - N_0 * P_0
1090 P_0                 = f44       // Table constant
1091
1092 P2_1                = f45
1093 P2_2                = f46
1094 P2_3                = f47
1095
1096 P1_1                = f45       // Same register as P2_1
1097 P1_2                = f46       // Same register as P2_2
1098 P1_3                = f47       // Same register as P2_3
1099
1100 P1_4                = f48
1101 P1_5                = f49
1102 P1_6                = f50
1103 P1_7                = f51
1104 P1_8                = f52
1105 P1_9                = f53
1106
1107 x                   = f56
1108 xsq                 = f57
1109 Tx                  = f58
1110 Tx1                 = f59
1111 Set                 = f60
1112 poly1               = f61
1113 poly2               = f62
1114 Poly                = f63
1115 Poly1               = f64
1116 Poly2               = f65
1117 r_to_the_8          = f66
1118 B                   = f67       // r ANDed with B_mask1
1119 SC_inv              = f68
1120 Pos_r               = f69
1121 N_0_fix             = f70       // fcvt.fx of N_0
1122 d_2                 = f71       // Table constant, loaded for 2^24 <= |x| < 2^63
1123 PI_BY_4             = f72       // pi/4, loaded from the constant table
1124 TWO_TO_NEG14        = f74       // 2^-14
1125 TWO_TO_NEG33        = f75       // 2^-33
1126 NEGTWO_TO_NEG14     = f76       // -2^-14
1127 NEGTWO_TO_NEG33     = f77       // -2^-33
1128 two_by_PI           = f78       // 2/pi, loaded from the constant table
1129 N                   = f79       // N_fix converted back to floating point
1130 N_fix               = f80       // Scaled product holding N in its low bits
1131 P_1                 = f81       // Table constant
1132 P_2                 = f82       // Table constant
1133 P_3                 = f83       // Table constant
1134 s_val               = f84       // Argument minus N * P_1
1135 w                   = f85
1136 B_mask1             = f86       // bmask1 = 0x7c00000000000000 as significand
1137 B_mask2             = f87       // bmask2 = 0x8200000000000000 as significand
1138 w2                  = f88       // N * P_3
1139 A                   = f89
1140 a                   = f90
1141 t                   = f91
1142 U_1                 = f92
1143 NEGTWO_TO_NEG2      = f93       // -2^-2, loaded from the constant table
1144 TWO_TO_NEG2         = f94       // 2^-2, loaded from the constant table
1145 Q1_1                = f95
1146 Q1_2                = f96
1147 Q1_3                = f97
1148 Q1_4                = f98
1149 Q1_5                = f99
1150 Q1_6                = f100
1151 Q1_7                = f101
1152 Q1_8                = f102
1153 S_hi                = f103
1154 S_lo                = f104
1155 V_hi                = f105
1156 V_lo                = f106
1157 U_hi                = f107
1158 U_lo                = f108
1159 U_hiabs             = f109      // |U_hi|
1160 V_hiabs             = f110      // |V_hi|
1161 V                   = f111
1162 Inv_P_0             = f112      // Table constant
1163
1164 FR_inv_pi_2to63     = f113      // Significand from GR_sig_inv_pi (setf.sig)
1165 FR_rshf_2to64       = f114      // Right-shift constant 1.1000 * 2^(63+64)
1166 FR_2tom64           = f115      // 2^-64, scales N_fix back down
1167 FR_rshf             = f116      // Right-shift constant 1.1000 * 2^63
1168 Norm_Arg            = f117      // Normalized x
1169 Abs_Arg             = f118      // |x|
1170 TWO_TO_NEG65        = f119
1171 fp_tmp              = f120      // All-ones significand; squared to raise inexact
1172 mOne                = f121      // -1.0, formed as 0*0 - 1
1173
1174 GR_sig_inv_pi  = r14              // Significand constant for the 2/pi scaling
1175 GR_rshf_2to64  = r15              // Bits of double 1.1000 * 2^(63+64)
1176 GR_exp_2tom64  = r16              // Biased exponent of 2^-64
1177 GR_rshf        = r17              // Bits of double 1.1000 * 2^63
1178 GR_exp_2_to_63 = r18              // Biased exponent of 2^63
1179 GR_exp_2_to_24 = r19              // Biased exponent of 2^24
1180 GR_signexp_x   = r20              // Sign and exponent of x
1181 GR_exp_x       = r21              // Exponent of x (sign masked off)
1182 GR_exp_mask    = r22              // 0x1ffff exponent mask
1183 GR_exp_2tom14  = r23              // Signexp of 2^-14
1184 GR_exp_m2tom14 = r24              // Signexp of -2^-14
1185 GR_exp_2tom33  = r25              // Signexp of 2^-33
1186 GR_exp_m2tom33 = r26              // Signexp of -2^-33
1187
1191 table_base     = r36              // Address of the constant table
1192 table_ptr1     = r37
1193 table_ptr2     = r38
1194 table_ptr3     = r39
1195 lookup         = r40
1196 N_fix_gr       = r41              // Integer N (plus cot_flag on the cotl path)
1197 GR_exp_2tom2   = r42              // Signexp of 2^-2
1198 GR_exp_2tom65  = r43
1199 exp_r          = r44              // Exponent of r
1200 sig_r          = r45              // Significand of r
1201 bmask1         = r46              // 0x7c00000000000000
1202 table_offset   = r47
1203 bmask2         = r48              // 0x8200000000000000
1204 gr_tmp         = r49              // -1, source for fp_tmp
1205 cot_flag       = r50              // 0 for tanl, 1 for cotl
1206
// The GR_SAVE_* names are defined exactly once, here (r51-r53). The entry
// code's alloc makes r32-r53 locals and r54-r57 outputs, which matches the
// GR_Parameter_* assignments below.
1207 GR_SAVE_B0                  = r51
1208 GR_SAVE_PFS                 = r52
1209 GR_SAVE_GP                  = r53
1210 GR_Parameter_X              = r54
1211 GR_Parameter_Y              = r55
1212 GR_Parameter_RESULT         = r56
1213 GR_Parameter_Tag            = r57
1214
1215
1216 .section .text
1217 .global __libm_tanl#
1218 .global __libm_cotl#
1219
1220 .proc __libm_cotl#
1221 __libm_cotl:
1222 .endp __libm_cotl#
1223 LOCAL_LIBM_ENTRY(cotl)
1224 // cotl entry stub: same setup as the tanl entry, but sets cot_flag = 1 and
// branches to COMMON_PATH, the code shared with tanl.
1225 { .mlx
1226       alloc r32 = ar.pfs, 0,22,4,0         // 0 inputs, 22 locals, 4 outputs, 0 rotating
1227       movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi
1228 }
1229 { .mlx
1230       mov GR_exp_mask = 0x1ffff            // Exponent mask
1231       movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)
1232 }
1233 ;;
1234
1235 //     Check for NatVals, Infs, NaNs, and Zeros (p6 is tested in COMMON_PATH)
1236 { .mfi
1237       getf.exp GR_signexp_x = Arg          // Get sign and exponent of x
1238       fclass.m  p6,p0 = Arg, 0x1E7         // Test for natval, nan, inf, zero
1239       mov cot_flag = 0x1                   // Mark this call as cotl for the shared code
1240 }
1241 { .mfb
1242       addl table_base = @ltoff(TANL_BASE_CONSTANTS), gp // Pointer to table ptr
1243       fnorm.s1 Norm_Arg = Arg              // Normalize x
1244       br.cond.sptk COMMON_PATH             // Continue in the path shared with tanl
1245 };;
1246
1247 LOCAL_LIBM_END(cotl)
1248
1249 .proc __libm_tanl#
1250 __libm_tanl:
1251 .endp __libm_tanl#
1252 GLOBAL_IEEE754_ENTRY(tanl)
1253
1254 { .mlx
1255       alloc r32 = ar.pfs, 0,22,4,0
1256       movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi
1257 }
1258 { .mlx
1259       mov GR_exp_mask = 0x1ffff            // Exponent mask
1260       movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)
1261 }
1262 ;;
1263
1264 //     Check for NatVals, Infs , NaNs, and Zeros
1265 { .mfi
1266       getf.exp GR_signexp_x = Arg          // Get sign and exponent of x
1267       fclass.m  p6,p0 = Arg, 0x1E7         // Test for natval, nan, inf, zero
1268       mov cot_flag = 0x0
1269 }
1270 { .mfi
1271       addl table_base = @ltoff(TANL_BASE_CONSTANTS), gp // Pointer to table ptr
1272       fnorm.s1 Norm_Arg = Arg              // Normalize x
1273       nop.i 0
1274 };;
1275
1276 // Common path for both tanl and cotl
1277 COMMON_PATH:
1278 { .mfi
1279       setf.sig FR_inv_pi_2to63 = GR_sig_inv_pi // Form 1/pi * 2^63
1280       fclass.m p9, p0 = Arg, 0x0b          // Test x denormal
1281       mov GR_exp_2tom64 = 0xffff - 64      // Scaling constant to compute N
1282 }
1283 { .mlx
1284       setf.d FR_rshf_2to64 = GR_rshf_2to64 // Form const 1.1000 * 2^(63+64)
1285       movl GR_rshf = 0x43e8000000000000    // Form const 1.1000 * 2^63
1286 }
1287 ;;
1288
1289 // Check for everything - if false, then must be pseudo-zero or pseudo-nan.
1290 // Branch out to deal with special values.
1291 { .mfi
1292       addl gr_tmp = -1,r0
1293       fclass.nm  p7,p0 = Arg, 0x1FF        // Test x unsupported
1294       mov GR_exp_2_to_63 = 0xffff + 63     // Exponent of 2^63
1295 }
1296 { .mfb
1297       ld8 table_base = [table_base]        // Get pointer to constant table
1298       fms.s1 mOne = f0, f0, f1
1299 (p6)  br.cond.spnt TANL_SPECIAL            // Branch if x natval, nan, inf, zero
1300 }
1301 ;;
1302
1303 { .mmb
1304       setf.sig fp_tmp = gr_tmp   // Make a constant so fmpy produces inexact
1305       mov GR_exp_2_to_24 = 0xffff + 24     // Exponent of 2^24
1306 (p9)  br.cond.spnt TANL_DENORMAL           // Branch if x denormal
1307 }
1308 ;;
1309
1310 TANL_COMMON:
1311 // Return to here if x denormal
1312 //
1313 // Do fcmp to generate Denormal exception
1314 //  - can't do FNORM (will generate Underflow when U is unmasked!)
1315 // Branch out to deal with unsupported values.
1316 { .mfi
1317       setf.exp FR_2tom64 = GR_exp_2tom64 // Form 2^-64 for scaling N_float
1318       fcmp.eq.s0 p0, p6 = Arg, f1        // Dummy to flag denormals
1319       add table_ptr1 = 0, table_base     // Point to tanl_table_1
1320 }
1321 { .mib
1322       setf.d FR_rshf = GR_rshf           // Form right shift const 1.1000 * 2^63
1323       add table_ptr2 = 80, table_base    // Point to tanl_table_2
1324 (p7)  br.cond.spnt TANL_UNSUPPORTED      // Branch if x unsupported type
1325 }
1326 ;;
1327
1328 { .mfi
1329       and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x
1330       fmpy.s1 Save_Norm_Arg = Norm_Arg, f1     // Save x if large arg reduction
1331       dep.z bmask1 = 0x7c, 56, 8               // Form mask to get 5 msb of r
1332                                                // bmask1 = 0x7c00000000000000
1333 }
1334 ;;
1335
1336 //
1337 //     Decide about the paths to take:
1338 //     Set PR_6 if |Arg| >= 2**63
1339 //     Set PR_9 if |Arg| < 2**24 - CASE 1 OR 2
1340 //     OTHERWISE Set PR_8 - CASE 3 OR 4
1341 //
1342 //     Branch out if the magnitude of the input argument is >= 2^63
1343 //     - do this branch before the next.
1344 { .mfi
1345       ldfe two_by_PI = [table_ptr1],16        // Load 2/pi
1346       nop.f 999
1347       dep.z bmask2 = 0x41, 57, 7              // Form mask to OR to produce B
1348                                               // bmask2 = 0x8200000000000000
1349 }
1350 { .mib
1351       ldfe PI_BY_4 = [table_ptr2],16          // Load pi/4
1352       cmp.ge p6,p0 = GR_exp_x, GR_exp_2_to_63 // Is |x| >= 2^63
1353 (p6)  br.cond.spnt TANL_ARG_TOO_LARGE         // Branch if |x| >= 2^63
1354 }
1355 ;;
1356
1357 { .mmi
1358       ldfe P_0 = [table_ptr1],16              // Load P_0
1359       ldfe Inv_P_0 = [table_ptr2],16          // Load Inv_P_0
1360       nop.i 999
1361 }
1362 ;;
1363
1364 { .mfi
1365       ldfe P_1 = [table_ptr1],16              // Load P_1
1366       fmerge.s Abs_Arg = f0, Norm_Arg         // Get |x|
1367       mov GR_exp_m2tom33 = 0x2ffff - 33       // Form signexp of -2^-33
1368 }
1369 { .mfi
1370       ldfe d_1 = [table_ptr2],16              // Load d_1 for 2^24 <= |x| < 2^63
1371       nop.f 999
1372       mov GR_exp_2tom33 = 0xffff - 33         // Form signexp of 2^-33
1373 }
1374 ;;
1375
1376 { .mmi
1377       ldfe P_2 = [table_ptr1],16              // Load P_2
1378       ldfe d_2 = [table_ptr2],16              // Load d_2 for 2^24 <= |x| < 2^63
1379       cmp.ge p8,p0 = GR_exp_x, GR_exp_2_to_24 // Is |x| >= 2^24
1380 }
1381 ;;
1382
1383 // Use special scaling to right shift so N=Arg * 2/pi is in rightmost bits
1384 // Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
1385 { .mfb
1386       ldfe   P_3 = [table_ptr1],16            // Load P_3
1387       fma.s1      N_fix = Norm_Arg, FR_inv_pi_2to63, FR_rshf_2to64
1388 (p8)  br.cond.spnt TANL_LARGER_ARG            // Branch if 2^24 <= |x| < 2^63
1389 }
1390 ;;
1391
1392 // Here if 0 < |x| < 2^24
1393 //     ARGUMENT REDUCTION CODE - CASE 1 and 2
1394 //
1395 { .mmf
1396       setf.exp TWO_TO_NEG33 = GR_exp_2tom33      // Form 2^-33
1397       setf.exp NEGTWO_TO_NEG33 = GR_exp_m2tom33  // Form -2^-33
1398       fmerge.s r = Norm_Arg,Norm_Arg          // Assume r=x, ok if |x| < pi/4
1399 }
1400 ;;
1401
1402 //
1403 // If |Arg| < pi/4,  set PR_8, else  pi/4 <=|Arg| < 2^24 - set PR_9.
1404 //
1405 //     Case 2: Convert integer N_fix back to normalized floating-point value.
1406 { .mfi
1407       getf.sig sig_r = Norm_Arg               // Get sig_r if 1/4 <= |x| < pi/4
1408       fcmp.lt.s1 p8,p9= Abs_Arg,PI_BY_4       // Test |x| < pi/4
1409       mov GR_exp_2tom2 = 0xffff - 2           // Form signexp of 2^-2
1410 }
1411 { .mfi
1412       ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2] // Load 2^-2, -2^-2
1413       fms.s1 N = N_fix, FR_2tom64, FR_rshf    // Use scaling to get N floated
1414       mov N_fix_gr = r0                       // Assume N=0, ok if |x| < pi/4
1415 }
1416 ;;
1417
1418 //
1419 //     Case 1: Is |r| < 2**(-2).
1420 //     Arg is the same as r in this case.
1421 //     r = Arg
1422 //     c = 0
1423 //
1424 //     Case 2: Place integer part of N in GP register.
1425 { .mfi
1426 (p9)  getf.sig N_fix_gr = N_fix
1427       fmerge.s c = f0, f0                     // Assume c=0, ok if |x| < pi/4
1428       cmp.lt p10, p0 = GR_exp_x, GR_exp_2tom2 // Test if |x| < 1/4
1429 }
1430 ;;
1431
1432 { .mfi
1433       setf.sig B_mask1 = bmask1               // Form mask to get 5 msb of r
1434       nop.f 999
1435       mov exp_r = GR_exp_x                    // Get exp_r if 1/4 <= |x| < pi/4
1436 }
1437 { .mbb
1438       setf.sig B_mask2 = bmask2               // Form mask to form B from r
1439 (p10) br.cond.spnt TANL_SMALL_R               // Branch if 0 < |x| < 1/4
1440 (p8)  br.cond.spnt TANL_NORMAL_R              // Branch if 1/4 <= |x| < pi/4
1441 }
1442 ;;
1443
1444 // Here if pi/4 <= |x| < 2^24
1445 //
1446 //     Case 1: PR_3 is only affected  when PR_1 is set.
1447 //
1448 //
1449 //     Case 2: w = N * P_2
1450 //     Case 2: s_val = -N * P_1  + Arg
1451 //
1452
1453 { .mfi
1454       nop.m 999
1455       fnma.s1 s_val = N, P_1, Norm_Arg
1456       nop.i 999
1457 }
1458 { .mfi
1459       nop.m 999
1460       fmpy.s1 w = N, P_2                     // w = N * P_2 for |s| >= 2^-33
1461       nop.i 999
1462 }
1463 ;;
1464
1465 //     Case 2_reduce: w = N * P_3 (change sign)
1466 { .mfi
1467       nop.m 999
1468       fmpy.s1 w2 = N, P_3                    // w = N * P_3 for |s| < 2^-33
1469       nop.i 999
1470 }
1471 ;;
1472
1473 //     Case 1_reduce: r = s + w (change sign)
1474 { .mfi
1475       nop.m 999
1476       fsub.s1 r = s_val, w                   // r = s_val - w for |s| >= 2^-33
1477       nop.i 999
1478 }
1479 ;;
1480
1481 //     Case 2_reduce: U_1 = N * P_2 + w
1482 { .mfi
1483       nop.m 999
1484       fma.s1  U_1 = N, P_2, w2              // U_1 = N * P_2 + w for |s| < 2^-33
1485       nop.i 999
1486 }
1487 ;;
1488
1489 //
1490 //     Decide between case_1 and case_2 reduce:
1491 //     Case 1_reduce:  |s| >= 2**(-33)
1492 //     Case 2_reduce:  |s| < 2**(-33)
1493 //
1494 { .mfi
1495       nop.m 999
1496       fcmp.lt.s1 p9, p8 = s_val, TWO_TO_NEG33
1497       nop.i 999
1498 }
1499 ;;
1500
1501 { .mfi
1502       nop.m 999
1503 (p9)  fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
1504       nop.i 999
1505 }
1506 ;;
1507
1508 //     Case 1_reduce: c = s - r
1509 { .mfi
1510       nop.m 999
1511       fsub.s1 c = s_val, r                     // c = s_val - r for |s| >= 2^-33
1512       nop.i 999
1513 }
1514 ;;
1515
1516 //     Case 2_reduce: r is complete here - continue to calculate c .
1517 //     r = s - U_1
1518 { .mfi
1519       nop.m 999
1520 (p9)  fsub.s1 r = s_val, U_1
1521       nop.i 999
1522 }
1523 { .mfi
1524       nop.m 999
1525 (p9)  fms.s1 U_2 = N, P_2, U_1
1526       nop.i 999
1527 }
1528 ;;
1529
1530 //
1531 //     Case 1_reduce: Is |r| < 2**(-2), if so set PR_10
1532 //     else set PR_13.
1533 //
1534
1535 { .mfi
1536       nop.m 999
1537       fand B = B_mask1, r
1538       nop.i 999
1539 }
1540 { .mfi
1541       nop.m 999
1542 (p8)  fcmp.lt.unc.s1 p10, p13 = r, TWO_TO_NEG2
1543       nop.i 999
1544 }
1545 ;;
1546
1547 { .mfi
1548 (p8)  getf.sig sig_r = r               // Get signif of r if |s| >= 2^-33
1549       nop.f 999
1550       nop.i 999
1551 }
1552 ;;
1553
1554 { .mfi
1555 (p8)  getf.exp exp_r = r               // Extract signexp of r if |s| >= 2^-33
1556 (p10) fcmp.gt.s1 p10, p13 = r, NEGTWO_TO_NEG2
1557       nop.i 999
1558 }
1559 ;;
1560
1561 //     Case 1_reduce: c is complete here.
1562 //     Case 1: Branch to SMALL_R or NORMAL_R.
1563 //     c = c + w (w has not been negated.)
1564 { .mfi
1565       nop.m 999
1566 (p8)  fsub.s1 c = c, w                         // c = c - w for |s| >= 2^-33
1567       nop.i 999
1568 }
1569 { .mbb
1570       nop.m 999
1571 (p10) br.cond.spnt TANL_SMALL_R     // Branch if pi/4 < |x| < 2^24 and |r|<1/4
1572 (p13) br.cond.sptk TANL_NORMAL_R_A  // Branch if pi/4 < |x| < 2^24 and |r|>=1/4
1573 }
1574 ;;
1575
1576
1577 // Here if pi/4 < |x| < 2^24 and |s| < 2^-33
1578 //
1579 //     Is i_1 = lsb of N_fix_gr even or odd?
1580 //     if i_1 == 0, set p11, else set p12.
1581 //
1582 { .mfi
1583       nop.m 999
1584       fsub.s1 s_val = s_val, r
1585       add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl)
1586 }
1587 { .mfi
1588       nop.m 999
1589 //
1590 //     Case 2_reduce:
1591 //     U_2 = N * P_2 - U_1
1592 //     Not needed until later.
1593 //
1594       fadd.s1 U_2 = U_2, w2
1595 //
1596 //     Case 2_reduce:
1597 //     s = s - r
1598 //     U_2 = U_2 + w
1599 //
1600       nop.i 999
1601 }
1602 ;;
1603
1604 //
1605 //     Case 2_reduce:
1606 //     c = c - U_2
1607 //     c is complete here
1608 //     Argument reduction ends here.
1609 //
1610 { .mfi
1611       nop.m 999
1612       fmpy.s1 rsq = r, r
1613       tbit.z p11, p12 = N_fix_gr, 0 ;;    // Set p11 if N even, p12 if odd
1614 }
1615
1616 { .mfi
1617       nop.m 999
1618 (p12) frcpa.s1 S_hi,p0 = f1, r
1619       nop.i 999
1620 }
1621 { .mfi
1622       nop.m 999
1623       fsub.s1 c = s_val, U_1
1624       nop.i 999
1625 }
1626 ;;
1627
1628 { .mmi
1629       add table_ptr1 = 160, table_base ;;  // Point to tanl_table_p1
1630       ldfe P1_1 = [table_ptr1],144
1631       nop.i 999 ;;
1632 }
1633 //
1634 //     Load P1_1 and point to Q1_1 .
1635 //
1636 { .mfi
1637       ldfe Q1_1 = [table_ptr1]
1638 //
1639 //     N even: rsq = r * Z
1640 //     N odd:  S_hi = frcpa(r)
1641 //
1642 (p12) fmerge.ns S_hi = S_hi, S_hi
1643       nop.i 999
1644 }
1645 { .mfi
1646       nop.m 999
1647 //
1648 //     Case 2_reduce:
1649 //     c = s - U_1
1650 //
1651 (p9)  fsub.s1 c = c, U_2
1652       nop.i 999 ;;
1653 }
1654 { .mfi
1655       nop.m 999
1656 (p12) fma.s1  poly1 = S_hi, r, f1
1657       nop.i 999 ;;
1658 }
1659 { .mfi
1660       nop.m 999
1661 //
1662 //     N odd:  Change sign of S_hi
1663 //
1664 (p11) fmpy.s1 rsq = rsq, P1_1
1665       nop.i 999 ;;
1666 }
1667 { .mfi
1668       nop.m 999
1669 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
1670       nop.i 999 ;;
1671 }
1672 { .mfi
1673       nop.m 999
1674 //
1675 //     N even: rsq = rsq * P1_1
1676 //     N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
1677 //
1678 (p11) fma.s1 Poly = r, rsq, c
1679       nop.i 999 ;;
1680 }
1681 { .mfi
1682       nop.m 999
1683 //
1684 //     N even: Poly = c  + r * rsq
1685 //     N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
1686 //
1687 (p12) fma.s1 poly1 = S_hi, r, f1
1688 (p11) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
1689 }
1690 { .mfi
1691       nop.m 999
1692 //
1693 //     N even: Result = Poly + r
1694 //     N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
1695 //
1696 (p14) fadd.s0 Result = r, Poly             // for tanl
1697       nop.i 999
1698 }
1699 { .mfi
1700       nop.m 999
1701 (p15) fms.s0 Result = r, mOne, Poly        // for cotl
1702       nop.i 999
1703 }
1704 ;;
1705
1706 { .mfi
1707       nop.m 999
1708 (p12) fma.s1  S_hi = S_hi, poly1, S_hi
1709       nop.i 999 ;;
1710 }
1711 { .mfi
1712       nop.m 999
1713 //
1714 //     N even: Result1 = Result + r
1715 //     N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
1716 //
1717 (p12) fma.s1 poly1 = S_hi, r, f1
1718       nop.i 999 ;;
1719 }
1720 { .mfi
1721       nop.m 999
1722 //
1723 //     N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
1724 //
1725 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
1726       nop.i 999 ;;
1727 }
1728 { .mfi
1729       nop.m 999
1730 //
1731 //     N odd:  poly1  =  S_hi * poly + 1.0    64 bits
1732 //
1733 (p12) fma.s1 poly1 = S_hi, r, f1
1734       nop.i 999 ;;
1735 }
1736 { .mfi
1737       nop.m 999
1738 //
1739 //     N odd:  poly1  =  S_hi * r + 1.0
1740 //
1741 (p12) fma.s1 poly1 = S_hi, c, poly1
1742       nop.i 999 ;;
1743 }
1744 { .mfi
1745       nop.m 999
1746 //
1747 //     N odd:  poly1  =  S_hi * c + poly1
1748 //
1749 (p12) fmpy.s1 S_lo = S_hi, poly1
1750       nop.i 999 ;;
1751 }
1752 { .mfi
1753       nop.m 999
1754 //
1755 //     N odd:  S_lo  =  S_hi *  poly1
1756 //
1757 (p12) fma.s1 S_lo = Q1_1, r, S_lo
1758 (p12) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
1759 }
1760 { .mfi
1761       nop.m 999
1762 //
1763 //     N odd:  Result =  S_hi + S_lo
1764 //
1765       fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
1766       nop.i 999 ;;
1767 }
1768 { .mfi
1769       nop.m 999
1770 //
1771 //     N odd:  S_lo  =  S_lo + Q1_1 * r
1772 //
1773 (p14) fadd.s0 Result = S_hi, S_lo          // for tanl
1774       nop.i 999
1775 }
1776 { .mfb
1777       nop.m 999
1778 (p15) fms.s0 Result = S_hi, mOne, S_lo     // for cotl
1779       br.ret.sptk b0 ;;          // Exit for pi/4 <= |x| < 2^24 and |s| < 2^-33
1780 }
1781
1782
1783 TANL_LARGER_ARG:
1784 // Here if 2^24 <= |x| < 2^63
1785 //
1786 // ARGUMENT REDUCTION CODE - CASE 3 and 4
1787 //
1788
1789 { .mmf
1790       mov GR_exp_2tom14 = 0xffff - 14          // Form signexp of 2^-14
1791       mov GR_exp_m2tom14 = 0x2ffff - 14        // Form signexp of -2^-14
1792       fmpy.s1 N_0 = Norm_Arg, Inv_P_0
1793 }
1794 ;;
1795
1796 { .mmi
1797       setf.exp TWO_TO_NEG14 = GR_exp_2tom14    // Form 2^-14
1798       setf.exp NEGTWO_TO_NEG14 = GR_exp_m2tom14// Form -2^-14
1799       nop.i 999
1800 }
1801 ;;
1802
1803
1804 //
1805 //    Point table_ptr2 at the 2^-2, -2^-2 pair (offset 144 from table_base).
1806 //    N_0 = Arg * Inv_P_0 (multiply issued above)
1807 //
1808 { .mmi
1809       add table_ptr2 = 144, table_base ;;     // Point to 2^-2
1810       ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2]
1811       nop.i 999
1812 }
1813 ;;
1814
1815 //
1816 //    N_0_fix  = integer part of N_0 .
1817 //
1818 //
1819 //    Make N_0 the integer part.
1820 //
1821 { .mfi
1822       nop.m 999
1823       fcvt.fx.s1 N_0_fix = N_0
1824       nop.i 999 ;;
1825 }
1826 { .mfi
1827       setf.sig B_mask1 = bmask1               // Form mask to get 5 msb of r
1828       fcvt.xf N_0 = N_0_fix
1829       nop.i 999 ;;
1830 }
1831 { .mfi
1832       setf.sig B_mask2 = bmask2               // Form mask to form B from r
1833       fnma.s1 ArgPrime = N_0, P_0, Norm_Arg
1834       nop.i 999
1835 }
1836 { .mfi
1837       nop.m 999
1838       fmpy.s1 w = N_0, d_1
1839       nop.i 999 ;;
1840 }
1841 //
1842 //    ArgPrime = -N_0 * P_0 + Arg
1843 //    w  = N_0 * d_1
1844 //
1845 //
1846 //    N = ArgPrime * 2/pi
1847 //
1848 //      fcvt.fx.s1 N_fix = N
1849 // Use special scaling to right shift so N=ArgPrime * 2/pi is in rightmost bits
1850 // (No branch here: this path is already Cases 3 or 4, 2^24 <= |x| < 2^63)
1851 { .mfi
1852       nop.m 999
1853       fma.s1      N_fix = ArgPrime, FR_inv_pi_2to63, FR_rshf_2to64
1854
1855       nop.i 999 ;;
1856 }
1857 //     Convert integer N_fix back to normalized floating-point value.
1858 { .mfi
1859       nop.m 999
1860       fms.s1 N = N_fix, FR_2tom64, FR_rshf    // Use scaling to get N floated
1861       nop.i 999
1862 }
1863 ;;
1864
1865 //
1866 //    N is the integer part of the reduced-reduced argument.
1867 //    Put the integer in a GP register.
1868 //
1869 { .mfi
1870       getf.sig N_fix_gr = N_fix
1871       nop.f 999
1872       nop.i 999
1873 }
1874 ;;
1875
1876 //
1877 //    s_val = -N*P_1 + ArgPrime
1878 //    w = -N*P_2 + w
1879 //
1880 { .mfi
1881       nop.m 999
1882       fnma.s1 s_val = N, P_1, ArgPrime
1883       nop.i 999
1884 }
1885 { .mfi
1886       nop.m 999
1887       fnma.s1 w = N, P_2, w
1888       nop.i 999
1889 }
1890 ;;
1891
1892 //    Case 4: V_hi = N * P_2
1893 //    Case 4: U_hi = N_0 * d_1
1894 { .mfi
1895       nop.m 999
1896       fmpy.s1 V_hi = N, P_2               // V_hi = N * P_2 for |s| < 2^-14
1897       nop.i 999
1898 }
1899 { .mfi
1900       nop.m 999
1901       fmpy.s1 U_hi = N_0, d_1             // U_hi = N_0 * d_1 for |s| < 2^-14
1902       nop.i 999
1903 }
1904 ;;
1905
1906 //    Case 3: r = s_val + w (Z complete)
1907 //    Case 4: w = N * P_3
1908 { .mfi
1909       nop.m 999
1910       fadd.s1 r = s_val, w                // r = s_val + w for |s| >= 2^-14
1911       nop.i 999
1912 }
1913 { .mfi
1914       nop.m 999
1915       fmpy.s1 w2 = N, P_3                 // w = N * P_3 for |s| < 2^-14
1916       nop.i 999
1917 }
1918 ;;
1919
1920 //    Case 4: A =  U_hi + V_hi
1921 //    Note: Worry about switched sign of V_hi, so subtract instead of add.
1922 //    Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
1923 //    Note: the (-) is still missing for V_hi.
1924 { .mfi
1925       nop.m 999
1926       fsub.s1 A = U_hi, V_hi           // A = U_hi - V_hi for |s| < 2^-14
1927       nop.i 999
1928 }
1929 { .mfi
1930       nop.m 999
1931       fnma.s1 V_lo = N, P_2, V_hi      // V_lo = V_hi - N * P_2 for |s| < 2^-14
1932       nop.i 999
1933 }
1934 ;;
1935
1936 //    Decide between case 3 and 4:
1937 //    Case 3:  |s| >= 2**(-14)     Set p10
1938 //    Case 4:  |s| <  2**(-14)     Set p11
1939 //
1940 //    Case 4: U_lo = N_0 * d_1 - U_hi
1941 { .mfi
1942       nop.m 999
1943       fms.s1 U_lo = N_0, d_1, U_hi     // U_lo = N_0*d_1 - U_hi for |s| < 2^-14
1944       nop.i 999
1945 }
1946 { .mfi
1947       nop.m 999
1948       fcmp.lt.s1 p11, p10 = s_val, TWO_TO_NEG14
1949       nop.i 999
1950 }
1951 ;;
1952
1953 //    Case 4: We need the absolute values of both U_hi and V_hi, so the
1954 //    switched sign of V_hi does not matter here.
1955 { .mfi
1956       nop.m 999
1957       fabs V_hiabs = V_hi              // |V_hi| for |s| < 2^-14
1958       nop.i 999
1959 }
1960 { .mfi
1961       nop.m 999
1962 (p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
1963       nop.i 999
1964 }
1965 ;;
1966
1967 //    Case 3: c = s_val - r
1968 { .mfi
1969       nop.m 999
1970       fabs U_hiabs = U_hi              // |U_hi| for |s| < 2^-14
1971       nop.i 999
1972 }
1973 { .mfi
1974       nop.m 999
1975       fsub.s1 c = s_val, r             // c = s_val - r    for |s| >= 2^-14
1976       nop.i 999
1977 }
1978 ;;
1979
1980 // For Case 3, |s| >= 2^-14, determine if |r| < 1/4
1981 //
1982 //    Case 4: C_hi = s_val + A
1983 //
1984 { .mfi
1985       nop.m 999
1986 (p11) fadd.s1 C_hi = s_val, A              // C_hi = s_val + A for |s| < 2^-14
1987       nop.i 999
1988 }
1989 { .mfi
1990       nop.m 999
1991 (p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2
1992       nop.i 999
1993 }
1994 ;;
1995
1996 { .mfi
1997       getf.sig sig_r = r               // Get signif of r if |s| >= 2^-33
1998       fand B = B_mask1, r
1999       nop.i 999
2000 }
2001 ;;
2002
2003 //    Case 4: t = U_lo + V_lo
2004 { .mfi
2005       getf.exp exp_r = r               // Extract signexp of r if |s| >= 2^-33
2006 (p11) fadd.s1 t = U_lo, V_lo               // t = U_lo + V_lo for |s| < 2^-14
2007       nop.i 999
2008 }
2009 { .mfi
2010       nop.m 999
2011 (p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2
2012       nop.i 999
2013 }
2014 ;;
2015
2016 //    Case 3: c = (s - r) + w (c complete)
2017 { .mfi
2018       nop.m 999
2019 (p10) fadd.s1 c = c, w              // c = c + w for |s| >= 2^-14
2020       nop.i 999
2021 }
2022 { .mbb
2023       nop.m 999
2024 (p14) br.cond.spnt TANL_SMALL_R     // Branch if 2^24 <= |x| < 2^63 and |r|< 1/4
2025 (p15) br.cond.sptk TANL_NORMAL_R_A  // Branch if 2^24 <= |x| < 2^63 and |r|>=1/4
2026 }
2027 ;;
2028
2029
2030 // Here if 2^24 <= |x| < 2^63 and |s| < 2^-14  >>>>>>>  Case 4.
2031 //
2032 //    Case 4: Set P_12 if U_hiabs >= V_hiabs
2033 //    Case 4: w = w + N_0 * d_2
2034 //    Note: the (-) is now incorporated in w .
2035 { .mfi
2036       add table_ptr1 = 160, table_base           // Point to tanl_table_p1
2037       fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs
2038       nop.i 999
2039 }
2040 { .mfi
2041       nop.m 999
2042       fms.s1 w2 = N_0, d_2, w2
2043       nop.i 999
2044 }
2045 ;;
2046
2047 //    Case 4: C_lo = s_val - C_hi
2048 { .mfi
2049       ldfe P1_1 = [table_ptr1], 16               // Load P1_1
2050       fsub.s1 C_lo = s_val, C_hi
2051       nop.i 999
2052 }
2053 ;;
2054
2055 //
2056 //    Case 4: a = U_hi - A
2057 //            a = V_hi - A (do an add to account for the missing (-) on V_hi)
2058 //
2059 { .mfi
2060       ldfe P1_2 = [table_ptr1], 128              // Load P1_2
2061 (p12) fsub.s1 a = U_hi, A
2062       nop.i 999
2063 }
2064 { .mfi
2065       nop.m 999
2066 (p13) fadd.s1 a = V_hi, A
2067       nop.i 999
2068 }
2069 ;;
2070
2071 //    Case 4: t = U_lo + V_lo  + w
2072 { .mfi
2073       ldfe Q1_1 = [table_ptr1], 16               // Load Q1_1
2074       fadd.s1 t = t, w2
2075       nop.i 999
2076 }
2077 ;;
2078
2079 //    Case 4: a = (U_hi - A)  + V_hi
2080 //            a = (V_hi - A)  + U_hi
2081 //    In each case account for the missing negative sign on V_hi.
2082 //
2083 { .mfi
2084       ldfe Q1_2 = [table_ptr1], 16               // Load Q1_2
2085 (p12) fsub.s1 a = a, V_hi
2086       nop.i 999
2087 }
2088 { .mfi
2089       nop.m 999
2090 (p13) fsub.s1 a = U_hi, a
2091       nop.i 999
2092 }
2093 ;;
2094
2095 //
2096 //    Case 4: C_lo = (s_val - C_hi) + A
2097 //
2098 { .mfi
2099       nop.m 999
2100       fadd.s1 C_lo = C_lo, A
2101       nop.i 999 ;;
2102 }
2103 //
2104 //    Case 4: t = t + a
2105 //
2106 { .mfi
2107       nop.m 999
2108       fadd.s1 t = t, a
2109       nop.i 999
2110 }
2111 ;;
2112
2113 //    Case 4: C_lo = C_lo + t
2114 //    Case 4: r = C_hi + C_lo
2115 { .mfi
2116       nop.m 999
2117       fadd.s1 C_lo = C_lo, t
2118       nop.i 999
2119 }
2120 ;;
2121
2122 { .mfi
2123       nop.m 999
2124       fadd.s1 r = C_hi, C_lo
2125       nop.i 999
2126 }
2127 ;;
2128
2129 //
2130 //    Case 4: c = C_hi - r
2131 //
2132 { .mfi
2133       nop.m 999
2134       fsub.s1 c = C_hi, r
2135       nop.i 999
2136 }
2137 { .mfi
2138       nop.m 999
2139       fmpy.s1 rsq = r, r
2140       add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl)
2141 }
2142 ;;
2143
2144 //    Case 4: c = c + C_lo  finished.
2145 //
2146 //    Is i_1 = lsb of N_fix_gr even or odd?
2147 //    if i_1 == 0, set PR_11, else set PR_12.
2148 //
2149 { .mfi
2150       nop.m 999
2151       fadd.s1 c = c , C_lo
2152       tbit.z p11, p12 =  N_fix_gr, 0
2153 }
2154 ;;
2155
2156 // r and c have been computed.
2157 { .mfi
2158       nop.m 999
2159 (p12) frcpa.s1 S_hi, p0 = f1, r
2160       nop.i 999
2161 }
2162 { .mfi
2163       nop.m 999
2164 //
2165 //    N odd: Change sign of S_hi
2166 //
2167 (p11) fma.s1 Poly = rsq, P1_2, P1_1
2168       nop.i 999 ;;
2169 }
2170 { .mfi
2171       nop.m 999
2172 (p12) fma.s1 P = rsq, Q1_2, Q1_1
2173       nop.i 999
2174 }
2175 { .mfi
2176       nop.m 999
2177 //
2178 //    N odd:  Result  =  S_hi + S_lo      (User supplied rounding mode for C1)
2179 //
2180        fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2181       nop.i 999 ;;
2182 }
2183 { .mfi
2184       nop.m 999
2185 //
2186 //    N even: rsq = r * r
2187 //    N odd:  S_hi = frcpa(r)
2188 //
2189 (p12) fmerge.ns S_hi = S_hi, S_hi
2190       nop.i 999
2191 }
2192 { .mfi
2193       nop.m 999
2194 //
2195 //    N even: rsq = rsq * P1_2 + P1_1
2196 //    N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
2197 //
2198 (p11) fmpy.s1 Poly = rsq, Poly
2199       nop.i 999 ;;
2200 }
2201 { .mfi
2202       nop.m 999
2203 (p12) fma.s1 poly1 = S_hi, r,f1
2204 (p11) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
2205 }
2206 { .mfi
2207       nop.m 999
2208 //
2209 //    N even: Poly =  Poly * rsq
2210 //    N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
2211 //
2212 (p11) fma.s1 Poly = r, Poly, c
2213       nop.i 999 ;;
2214 }
2215 { .mfi
2216       nop.m 999
2217 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2218       nop.i 999
2219 }
2220 { .mfi
2221       nop.m 999
2222 //
2223 //    N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
2224 //
2225 (p14) fadd.s0 Result = r, Poly          // for tanl
2226       nop.i 999 ;;
2227 }
2228
2229 .pred.rel "mutex",p15,p12
2230 { .mfi
2231       nop.m 999
2232 (p15) fms.s0 Result = r, mOne, Poly     // for cotl
2233       nop.i 999
2234 }
2235 { .mfi
2236       nop.m 999
2237 (p12) fma.s1 poly1 =  S_hi, r, f1
2238       nop.i 999 ;;
2239 }
2240 { .mfi
2241       nop.m 999
2242 //
2243 //    N even: Poly = Poly * r + c
2244 //    N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
2245 //
2246 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2247       nop.i 999 ;;
2248 }
2249 { .mfi
2250       nop.m 999
2251 (p12) fma.s1 poly1 = S_hi, r, f1
2252       nop.i 999 ;;
2253 }
2254 { .mfi
2255       nop.m 999
2256 //
2257 //    N even: Result = Poly + r  (Rounding mode S0)
2258 //    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
2259 //
2260 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2261       nop.i 999 ;;
2262 }
2263 { .mfi
2264       nop.m 999
2265 //
2266 //    N odd:  poly1  =  S_hi * poly + S_hi    64 bits
2267 //
2268 (p12) fma.s1 poly1 = S_hi, r, f1
2269       nop.i 999 ;;
2270 }
2271 { .mfi
2272       nop.m 999
2273 //
2274 //    N odd:  poly1  =  S_hi * r + 1.0
2275 //
2276 (p12) fma.s1 poly1 = S_hi, c, poly1
2277       nop.i 999 ;;
2278 }
2279 { .mfi
2280       nop.m 999
2281 //
2282 //    N odd:  poly1  =  S_hi * c + poly1
2283 //
2284 (p12) fmpy.s1 S_lo = S_hi, poly1
2285       nop.i 999 ;;
2286 }
2287 { .mfi
2288       nop.m 999
2289 //
2290 //    N odd:  S_lo  =  S_hi *  poly1
2291 //
2292 (p12) fma.s1 S_lo = P, r, S_lo
2293 (p12) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
2294 }
2295
2296 { .mfi
2297       nop.m 999
2298 (p14) fadd.s0 Result = S_hi, S_lo           // for tanl
2299       nop.i 999
2300 }
2301 { .mfb
2302       nop.m 999
2303 //
2304 //    N odd:  S_lo  =  S_lo + r * P
2305 //
2306 (p15) fms.s0 Result = S_hi, mOne, S_lo      // for cotl
2307       br.ret.sptk b0 ;;      // Exit for 2^24 <= |x| < 2^63 and |s| < 2^-14
2308 }
2309
2310
2311 TANL_SMALL_R:
2312 // Here if |r| < 1/4
2313 // r and c have been computed.
2314 // *****************************************************************
2315 // *****************************************************************
2316 // *****************************************************************
2317 //    N odd:  S_hi = frcpa(r)
2318 //    Get [i_1] - lsb of N_fix_gr.  Set p11 if N even, p12 if N odd.
2319 //    N even: rsq = r * r
2320 { .mfi
2321       add table_ptr1 = 160, table_base    // Point to tanl_table_p1
2322       frcpa.s1 S_hi, p0 = f1, r           // S_hi for N odd
2323       add N_fix_gr = N_fix_gr, cot_flag   // N = N + 1 (for cotl)
2324 }
2325 { .mfi
2326       add table_ptr2 = 400, table_base    // Point to Q1_7
2327       fmpy.s1 rsq = r, r
2328       nop.i 999
2329 }
2330 ;;
2331
2332 { .mmi
2333       ldfe P1_1 = [table_ptr1], 16
2334 ;;
2335       ldfe P1_2 = [table_ptr1], 16
2336       tbit.z p11, p12 = N_fix_gr, 0
2337 }
2338 ;;
2339
2340
2341 { .mfi
2342       ldfe P1_3 = [table_ptr1], 96
2343       nop.f 999
2344       nop.i 999
2345 }
2346 ;;
2347
2348 { .mfi
2349 (p11) ldfe P1_9 = [table_ptr1], -16
2350 (p12) fmerge.ns S_hi = S_hi, S_hi
2351       nop.i 999
2352 }
2353 { .mfi
2354       nop.m 999
2355 (p11) fmpy.s1 r_to_the_8 = rsq, rsq
2356       nop.i 999
2357 }
2358 ;;
2359
2360 //
2361 //    N even: Poly2 = P1_7 + Poly2 * rsq
2362 //    N odd:  poly2 = Q1_5 + poly2 * rsq
2363 //
2364 { .mfi
2365 (p11) ldfe P1_8 = [table_ptr1], -16
2366 (p11) fadd.s1 CORR = rsq, f1
2367       nop.i 999
2368 }
2369 ;;
2370
2371 //
2372 //    N even: Poly1 = P1_2 + P1_3 * rsq
2373 //    N odd:  poly1 =  1.0 +  S_hi * r
2374 //    16 bits partial  account for necessary (-1)
2375 //
2376 { .mmi
2377 (p11) ldfe P1_7 = [table_ptr1], -16
2378 ;;
2379 (p11) ldfe P1_6 = [table_ptr1], -16
2380       nop.i 999
2381 }
2382 ;;
2383
2384 //
2385 //    N even: Poly1 = P1_1 + Poly1 * rsq
2386 //    N odd:  S_hi  =  S_hi + S_hi * poly1)     16 bits account for necessary
2387 //
2388 //
2389 //    N even: Poly2 = P1_5 + Poly2 * rsq
2390 //    N odd:  poly2 = Q1_3 + poly2 * rsq
2391 //
2392 { .mfi
2393 (p11) ldfe P1_5 = [table_ptr1], -16
2394 (p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8
2395       nop.i 999
2396 }
2397 { .mfi
2398       nop.m 999
2399 (p12) fma.s1 poly1 =  S_hi, r, f1
2400       nop.i 999
2401 }
2402 ;;
2403
2404 //
2405 //    N even: Poly1 =  Poly1 * rsq
2406 //    N odd:  poly1  = 1.0 + S_hi * r         32 bits partial
2407 //
2408
2409 //
2410 //    N even: CORR =  CORR * c
2411 //    N odd:  S_hi  =  S_hi * poly1 + S_hi    32 bits
2412 //
2413
2414 //
2415 //    N even: Poly2 = P1_6 + Poly2 * rsq
2416 //    N odd:  poly2 = Q1_4 + poly2 * rsq
2417 //
2418
2419 { .mmf
2420 (p11) ldfe P1_4 = [table_ptr1], -16
2421       nop.m 999
2422 (p11) fmpy.s1 CORR =  CORR, c
2423 }
2424 ;;
2425
2426 { .mfi
2427       nop.m 999
2428 (p11) fma.s1 Poly1 = P1_3, rsq, P1_2
2429       nop.i 999 ;;
2430 }
2431 { .mfi
2432 (p12) ldfe Q1_7 = [table_ptr2], -16
2433 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2434       nop.i 999 ;;
2435 }
2436 { .mfi
2437 (p12) ldfe Q1_6 = [table_ptr2], -16
2438 (p11) fma.s1 Poly2 = P1_9, rsq, P1_8
2439       nop.i 999 ;;
2440 }
2441 { .mmi
2442 (p12) ldfe Q1_5 = [table_ptr2], -16 ;;
2443 (p12) ldfe Q1_4 = [table_ptr2], -16
2444       nop.i 999 ;;
2445 }
2446 { .mfi
2447 (p12) ldfe Q1_3 = [table_ptr2], -16
2448 //
2449 //    N even: Poly2 = P1_8 + P1_9 * rsq
2450 //    N odd:  poly2 = Q1_6 + Q1_7 * rsq
2451 //
2452 (p11) fma.s1 Poly1 = Poly1, rsq, P1_1
2453       nop.i 999 ;;
2454 }
2455 { .mfi
2456 (p12) ldfe Q1_2 = [table_ptr2], -16
2457 (p12) fma.s1 poly1 = S_hi, r, f1
2458       nop.i 999 ;;
2459 }
2460 { .mfi
2461 (p12) ldfe Q1_1 = [table_ptr2], -16
2462 (p11) fma.s1 Poly2 = Poly2, rsq, P1_7
2463       nop.i 999 ;;
2464 }
2465 { .mfi
2466       nop.m 999
2467 //
2468 //    N even: CORR =  rsq + 1
2469 //    N even: r_to_the_8 =  rsq * rsq
2470 //
2471 (p11) fmpy.s1 Poly1 = Poly1, rsq
2472       nop.i 999 ;;
2473 }
2474 { .mfi
2475       nop.m 999
2476 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2477       nop.i 999
2478 }
2479 { .mfi
2480       nop.m 999
2481 (p12) fma.s1 poly2 = Q1_7, rsq, Q1_6
2482       nop.i 999 ;;
2483 }
2484 { .mfi
2485       nop.m 999
2486 (p11) fma.s1 Poly2 = Poly2, rsq, P1_6
2487       nop.i 999 ;;
2488 }
2489 { .mfi
2490       nop.m 999
2491 (p12) fma.s1 poly1 = S_hi, r, f1
2492       nop.i 999
2493 }
2494 { .mfi
2495       nop.m 999
2496 (p12) fma.s1 poly2 = poly2, rsq, Q1_5
2497       nop.i 999 ;;
2498 }
2499 { .mfi
2500       nop.m 999
2501 (p11) fma.s1 Poly2= Poly2, rsq, P1_5
2502       nop.i 999 ;;
2503 }
2504 { .mfi
2505       nop.m 999
2506 (p12) fma.s1 S_hi =  S_hi, poly1, S_hi
2507       nop.i 999
2508 }
2509 { .mfi
2510       nop.m 999
2511 (p12) fma.s1 poly2 = poly2, rsq, Q1_4
2512       nop.i 999 ;;
2513 }
2514 { .mfi
2515       nop.m 999
2516 //
2517 //    N even: r_to_the_8 = r_to_the_8 * r_to_the_8
2518 //    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
2519 //
2520 (p11) fma.s1 Poly2 = Poly2, rsq, P1_4
2521       nop.i 999 ;;
2522 }
2523 { .mfi
2524       nop.m 999
2525 //
2526 //    N even: Poly = CORR + Poly * r
2527 //    N odd:  P = Q1_1 + poly2 * rsq
2528 //
2529 (p12) fma.s1 poly1 = S_hi, r, f1
2530       nop.i 999
2531 }
2532 { .mfi
2533       nop.m 999
2534 (p12) fma.s1 poly2 = poly2, rsq, Q1_3
2535       nop.i 999 ;;
2536 }
2537 { .mfi
2538       nop.m 999
2539 //
2540 //    N even: Poly2 = P1_4 + Poly2 * rsq
2541 //    N odd:  poly2 = Q1_2 + poly2 * rsq
2542 //
2543 (p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1
2544       nop.i 999 ;;
2545 }
2546 { .mfi
2547       nop.m 999
2548 (p12) fma.s1 poly1 = S_hi, c, poly1
2549       nop.i 999
2550 }
2551 { .mfi
2552       nop.m 999
2553 (p12) fma.s1 poly2 = poly2, rsq, Q1_2
2554       nop.i 999 ;;
2555 }
2556
2557 { .mfi
2558       nop.m 999
2559 //
2560 //    N even: Poly = Poly1 + Poly2 * r_to_the_8
2561 //    N odd:  S_hi =  S_hi * poly1 + S_hi    64 bits
2562 //
2563 (p11) fma.s1 Poly = Poly, r, CORR
2564       nop.i 999 ;;
2565 }
2566 { .mfi
2567       nop.m 999
2568 //
2569 //    N even: Result =  r + Poly  (User supplied rounding mode)
2570 //    N odd:  poly1  =  S_hi * c + poly1
2571 //
2572 (p12) fmpy.s1 S_lo = S_hi, poly1
2573 (p11) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
2574 }
2575 { .mfi
2576       nop.m 999
2577 (p12) fma.s1 P = poly2, rsq, Q1_1
2578       nop.i 999 ;;
2579 }
2580 { .mfi
2581       nop.m 999
2582 //
2583 //    N odd:  poly1  =  S_hi * r + 1.0
2584 //
2585 //
2586 //    N odd:  S_lo  =  S_hi *  poly1
2587 //
2588 (p14) fadd.s0 Result = Poly, r          // for tanl
2589       nop.i 999
2590 }
2591 { .mfi
2592       nop.m 999
2593 (p15) fms.s0 Result = Poly, mOne, r     // for cotl
2594       nop.i 999 ;;
2595 }
2596
2597 { .mfi
2598       nop.m 999
2599 //
2600 //    N odd:  S_lo  =  Q1_1 * c + S_lo
2601 //
2602 (p12) fma.s1 S_lo = Q1_1, c, S_lo
2603       nop.i 999
2604 }
2605 { .mfi
2606       nop.m 999
2607       fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2608       nop.i 999 ;;
2609 }
2610 { .mfi
2611       nop.m 999
2612 //
2613 //    N odd:  Result =  S_lo + r * P
2614 //
2615 (p12) fma.s1 Result = P, r, S_lo
2616 (p12) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
2617 }
2618
2619 //
2620 //    N odd:  Result = Result + S_hi  (user supplied rounding mode)
2621 //
2622 { .mfi
2623       nop.m 999
2624 (p14) fadd.s0 Result = Result, S_hi         // for tanl
2625       nop.i 999
2626 }
2627 { .mfb
2628       nop.m 999
2629 (p15) fms.s0 Result = Result, mOne, S_hi    // for cotl
2630       br.ret.sptk b0 ;;              // Exit |r| < 1/4 path
2631 }
2632
2633
2634 TANL_NORMAL_R:
2635 // Here if 1/4 <= |x| < pi/4  or  if |x| >= 2^63 and |r| >= 1/4
2636 // *******************************************************************
2637 // *******************************************************************
2638 // *******************************************************************
2639 //
2640 //    r and c have been computed.
2641 //
2642 { .mfi
2643       nop.m 999
2644       fand B = B_mask1, r
2645       nop.i 999
2646 }
2647 ;;
2648
2649 TANL_NORMAL_R_A:
2650 // Enter here if pi/4 <= |x| < 2^63 and |r| >= 1/4
2651 //    Get the 5 bits of r for the lookup.   1.xxxxx ....
2652 { .mmi
2653       add table_ptr1 = 416, table_base     // Point to tanl_table_p2
2654       mov GR_exp_2tom65 = 0xffff - 65      // Scaling constant for B
2655       extr.u lookup = sig_r, 58, 5
2656 }
2657 ;;
2658
2659 { .mmi
2660       ldfe P2_1 = [table_ptr1], 16
2661       setf.exp TWO_TO_NEG65 = GR_exp_2tom65  // 2^-65 for scaling B if exp_r=-2
2662       add N_fix_gr = N_fix_gr, cot_flag      // N = N + 1 (for cotl)
2663 }
2664 ;;
2665
2666 .pred.rel "mutex",p11,p12
2667 //    B =  2^63 * 1.xxxxx 100...0
2668 { .mfi
2669       ldfe P2_2 = [table_ptr1], 16
2670       for B = B_mask2, B
2671       mov table_offset = 512               // Assume table offset is 512
2672 }
2673 ;;
2674
2675 { .mfi
2676       ldfe P2_3 = [table_ptr1], 16
2677       fmerge.s  Pos_r = f1, r
2678       tbit.nz p8,p9 = exp_r, 0
2679 }
2680 ;;
2681
2682 //    Is  B = 2** -2 or  B= 2** -1? If 2**-1, then
2683 //    we want an offset of 512 for table addressing.
2684 { .mii
2685       add table_ptr2 = 1296, table_base     // Point to tanl_table_cm2
2686 (p9)  shladd table_offset = lookup, 4, table_offset
2687 (p8)  shladd table_offset = lookup, 4, r0
2688 }
2689 ;;
2690
2691 { .mmi
2692       add table_ptr1 = table_ptr1, table_offset  // Point to T_hi
2693       add table_ptr2 = table_ptr2, table_offset  // Point to C_hi
2694       add table_ptr3 = 2128, table_base     // Point to tanl_table_scim2
2695 }
2696 ;;
2697
2698 { .mmi
2699       ldfd T_hi = [table_ptr1], 8                // Load T_hi
2700 ;;
2701       ldfd C_hi = [table_ptr2], 8                // Load C_hi
2702       add table_ptr3 = table_ptr3, table_offset  // Point to SC_inv
2703 }
2704 ;;
2705
2706 //
2707 //    x = |r| - B
2708 //
2709 //   Convert B so it has the same exponent as Pos_r before subtracting
2710 { .mfi
2711       ldfs T_lo = [table_ptr1]                   // Load T_lo
2712 (p9)  fnma.s1 x = B, FR_2tom64, Pos_r
2713       nop.i 999
2714 }
2715 { .mfi
2716       nop.m 999
2717 (p8)  fnma.s1 x = B, TWO_TO_NEG65, Pos_r
2718       nop.i 999
2719 }
2720 ;;
2721
2722 { .mfi
2723       ldfs C_lo = [table_ptr2]                   // Load C_lo
2724       nop.f 999
2725       nop.i 999
2726 }
2727 ;;
2728
2729 { .mfi
2730       ldfe SC_inv = [table_ptr3]                 // Load SC_inv
2731       fmerge.s  sgn_r = r, f1
2732       tbit.z p11, p12 = N_fix_gr, 0              // p11 if N even, p12 if odd
2733
2734 }
2735 ;;
2736
2737 //
2738 //    xsq = x * x
2739 //    N even: Tx = T_hi * x
2740 //
2741 //    N even: Tx1 = Tx + 1
2742 //    N odd:  Cx1 = 1 - Cx
2743 //
2744
2745 { .mfi
2746       nop.m 999
2747       fmpy.s1 xsq = x, x
2748       nop.i 999
2749 }
2750 { .mfi
2751       nop.m 999
2752 (p11) fmpy.s1 Tx = T_hi, x
2753       nop.i 999
2754 }
2755 ;;
2756
2757 //
2758 //    N odd: Cx = C_hi * x
2759 //
2760 { .mfi
2761       nop.m 999
2762 (p12) fmpy.s1 Cx = C_hi, x
2763       nop.i 999
2764 }
2765 ;;
2766 //
2767 //    N even and odd: P = P2_3 + P2_2 * xsq
2768 //
2769 { .mfi
2770       nop.m 999
2771       fma.s1 P = P2_3, xsq, P2_2
2772       nop.i 999
2773 }
2774 { .mfi
2775       nop.m 999
2776 (p11) fadd.s1 Tx1 = Tx, f1
2777       nop.i 999 ;;
2778 }
2779 { .mfi
2780       nop.m 999
2781 //
2782 //    N even: D = C_hi - tanx
2783 //    N odd: D = T_hi + tanx
2784 //
2785 (p11) fmpy.s1 CORR = SC_inv, T_hi
2786       nop.i 999
2787 }
2788 { .mfi
2789       nop.m 999
2790       fmpy.s1 Sx = SC_inv, x
2791       nop.i 999 ;;
2792 }
2793 { .mfi
2794       nop.m 999
2795 (p12) fmpy.s1 CORR = SC_inv, C_hi
2796       nop.i 999 ;;
2797 }
2798 { .mfi
2799       nop.m 999
2800 (p12) fsub.s1 V_hi = f1, Cx
2801       nop.i 999 ;;
2802 }
2803 { .mfi
2804       nop.m 999
2805       fma.s1 P = P, xsq, P2_1
2806       nop.i 999
2807 }
2808 { .mfi
2809       nop.m 999
2810 //
2811 //    N even and odd: P = P2_1 + P * xsq
2812 //
2813 (p11) fma.s1 V_hi = Tx, Tx1, f1
2814       nop.i 999 ;;
2815 }
2816 { .mfi
2817       nop.m 999
2818 //
2819 //    N even: Result  = sgn_r * tail + T_hi (user rounding mode for C1)
2820 //    N odd:  Result  = sgn_r * tail + C_hi (user rounding mode for C1)
2821 //
2822       fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2823       nop.i 999 ;;
2824 }
2825 { .mfi
2826       nop.m 999
2827       fmpy.s1 CORR = CORR, c
2828       nop.i 999 ;;
2829 }
2830 { .mfi
2831       nop.m 999
2832 (p12) fnma.s1 V_hi = Cx,V_hi,f1
2833       nop.i 999 ;;
2834 }
2835 { .mfi
2836       nop.m 999
2837 //
2838 //    N even: V_hi = Tx * Tx1 + 1
2839 //    N odd: Cx1 = 1 - Cx * Cx1
2840 //
2841       fmpy.s1 P = P, xsq
2842       nop.i 999
2843 }
2844 { .mfi
2845       nop.m 999
2846 //
2847 //    N even and odd: P = P * xsq
2848 //
2849 (p11) fmpy.s1 V_hi = V_hi, T_hi
2850       nop.i 999 ;;
2851 }
2852 { .mfi
2853       nop.m 999
2854 //
2855 //    N even and odd: tail = P * tail + V_lo
2856 //
2857 (p11) fmpy.s1 T_hi = sgn_r, T_hi
2858       nop.i 999 ;;
2859 }
2860 { .mfi
2861       nop.m 999
2862       fmpy.s1 CORR = CORR, sgn_r
2863       nop.i 999 ;;
2864 }
2865 { .mfi
2866       nop.m 999
2867 (p12) fmpy.s1 V_hi = V_hi,C_hi
2868       nop.i 999 ;;
2869 }
2870 { .mfi
2871       nop.m 999
2872 //
2873 //    N even: V_hi = T_hi * V_hi
2874 //    N odd: V_hi  = C_hi * V_hi
2875 //
2876       fma.s1 tanx = P, x, x
2877       nop.i 999
2878 }
2879 { .mfi
2880       nop.m 999
2881 (p12) fnmpy.s1 C_hi = sgn_r, C_hi
2882       nop.i 999 ;;
2883 }
2884 { .mfi
2885       nop.m 999
2886 //
2887 //    N even: V_lo = 1 - V_hi + C_hi
2888 //    N odd: V_lo = 1 - V_hi + T_hi
2889 //
2890 (p11) fadd.s1 CORR = CORR, T_lo
2891       nop.i 999
2892 }
2893 { .mfi
2894       nop.m 999
2895 (p12) fsub.s1 CORR = CORR, C_lo
2896       nop.i 999 ;;
2897 }
2898 { .mfi
2899       nop.m 999
2900 //
2901 //    N even and odd: tanx = x + x * P
2902 //    N even and odd: Sx = SC_inv * x
2903 //
2904 (p11) fsub.s1 D = C_hi, tanx
2905       nop.i 999
2906 }
2907 { .mfi
2908       nop.m 999
2909 (p12) fadd.s1 D = T_hi, tanx
2910       nop.i 999 ;;
2911 }
2912 { .mfi
2913       nop.m 999
2914 //
2915 //    N odd: CORR = SC_inv * C_hi
2916 //    N even: CORR = SC_inv * T_hi
2917 //
2918       fnma.s1 D = V_hi, D, f1
2919       nop.i 999 ;;
2920 }
2921 { .mfi
2922       nop.m 999
2923 //
2924 //    N even and odd: D = 1 - V_hi * D
2925 //    N even and odd: CORR = CORR * c
2926 //
2927       fma.s1 V_hi = V_hi, D, V_hi
2928       nop.i 999 ;;
2929 }
2930 { .mfi
2931       nop.m 999
2932 //
2933 //    N even and odd: V_hi = V_hi + V_hi * D
2934 //    N even and odd: CORR = sgn_r * CORR
2935 //
2936 (p11) fnma.s1 V_lo = V_hi, C_hi, f1
2937       nop.i 999
2938 }
2939 { .mfi
2940       nop.m 999
2941 (p12) fnma.s1 V_lo = V_hi, T_hi, f1
2942       nop.i 999 ;;
2943 }
2944 { .mfi
2945       nop.m 999
2946 //
2947 //    N even: CORR = CORR + T_lo
2948 //    N odd: CORR = CORR - C_lo
2949 //
2950 (p11) fma.s1 V_lo = tanx, V_hi, V_lo
2951       tbit.nz p15, p0 = cot_flag, 0       // p15=1 if we compute cotl
2952 }
2953 { .mfi
2954       nop.m 999
2955 (p12) fnma.s1 V_lo = tanx, V_hi, V_lo
2956       nop.i 999 ;;
2957 }
2958
2959 { .mfi
2960       nop.m 999
2961 (p15) fms.s1 T_hi = f0, f0, T_hi        // to correct result's sign for cotl
2962       nop.i 999
2963 }
2964 { .mfi
2965       nop.m 999
2966 (p15) fms.s1 C_hi = f0, f0, C_hi        // to correct result's sign for cotl
2967       nop.i 999
2968 };;
2969
2970 { .mfi
2971       nop.m 999
2972 (p15) fms.s1 sgn_r = f0, f0, sgn_r      // to correct result's sign for cotl
2973       nop.i 999
2974 };;
2975
2976 { .mfi
2977       nop.m 999
2978 //
2979 //    N even: V_lo = V_lo + V_hi * tanx
2980 //    N odd: V_lo = V_lo - V_hi * tanx
2981 //
2982 (p11) fnma.s1 V_lo = C_lo, V_hi, V_lo
2983       nop.i 999
2984 }
2985 { .mfi
2986       nop.m 999
2987 (p12) fnma.s1 V_lo = T_lo, V_hi, V_lo
2988       nop.i 999 ;;
2989 }
2990 { .mfi
2991       nop.m 999
2992 //
2993 //    N  even: V_lo = V_lo - V_hi * C_lo
2994 //    N  odd: V_lo = V_lo - V_hi * T_lo
2995 //
2996       fmpy.s1 V_lo = V_hi, V_lo
2997       nop.i 999 ;;
2998 }
2999 { .mfi
3000       nop.m 999
3001 //
3002 //    N even and odd: V_lo = V_lo * V_hi
3003 //
3004       fadd.s1 tail = V_hi, V_lo
3005       nop.i 999 ;;
3006 }
3007 { .mfi
3008       nop.m 999
3009 //
3010 //    N even and odd: tail = V_hi + V_lo
3011 //
3012       fma.s1 tail = tail, P, V_lo
3013       nop.i 999 ;;
3014 }
3015 { .mfi
3016       nop.m 999
3017 //
3018 //    N even: T_hi = sgn_r * T_hi
3019 //    N odd : C_hi = -sgn_r * C_hi
3020 //
3021       fma.s1 tail = tail, Sx, CORR
3022       nop.i 999 ;;
3023 }
3024 { .mfi
3025       nop.m 999
3026 //
3027 //    N even and odd: tail = Sx * tail + CORR
3028 //
3029       fma.s1 tail = V_hi, Sx, tail
3030       nop.i 999 ;;
3031 }
3032 { .mfi
3033       nop.m 999
3034 //
3035 //    N even and odd: tail = Sx * V_hi + tail
3036 //
3037 (p11) fma.s0 Result = sgn_r, tail, T_hi
3038       nop.i 999
3039 }
3040 { .mfb
3041       nop.m 999
3042 (p12) fma.s0 Result = sgn_r, tail, C_hi
3043       br.ret.sptk b0 ;;                 // Exit for 1/4 <= |r| < pi/4
3044 }
3045
3046 TANL_DENORMAL:
3047 // Here if x denormal
3048 { .mfb
3049       getf.exp GR_signexp_x = Norm_Arg          // Get sign and exponent of x
3050       nop.f 999
3051       br.cond.sptk TANL_COMMON                  // Return to common code
3052 }
3053 ;;
3054
3055
3056 TANL_SPECIAL:
3057 TANL_UNSUPPORTED:
3058 //
3059 //     Code for NaNs, Unsupporteds, Infs, or +/- zero ?
3060 //     Invalid raised for Infs and SNaNs.
3061 //
3062
3063 { .mfi
3064       nop.m 999
3065       fmerge.s  f10 = f8, f8            // Save input for error call
3066       tbit.nz p6, p7 = cot_flag, 0      // p6=1 if we compute cotl
3067 }
3068 ;;
3069
3070 { .mfi
3071       nop.m 999
3072 (p6)  fclass.m p6, p7 = f8, 0x7         // Test for zero (cotl only)
3073       nop.i 999
3074 }
3075 ;;
3076
3077 .pred.rel "mutex", p6, p7
3078 { .mfi
3079 (p6)  mov GR_Parameter_Tag = 225        // (cotl)
3080 (p6)  frcpa.s0  f8, p0 = f1, f8         // cotl(+-0) = +-Inf
3081       nop.i 999
3082 }
3083 { .mfb
3084       nop.m 999
3085 (p7)  fmpy.s0 f8 = f8, f0
3086 (p7)  br.ret.sptk b0
3087 }
3088 ;;
3089
3090 GLOBAL_IEEE754_END(tanl)
3091
3092 LOCAL_LIBM_ENTRY(__libm_error_region)
3093 .prologue
3094 // Error stub: spill f10, f1 and f8 into a fresh 64-byte frame, call __libm_error_support, reload f8 from the frame, return.
3095 // (1) Allocate the frame and keep ar.pfs and gp in GR_SAVE_PFS / GR_SAVE_GP.
3096 { .mfi
3097       add           GR_Parameter_Y=-32,sp        // Parameter 2 address = old sp - 32 (= new sp + 32)
3098       nop.f         0
3099 .save   ar.pfs,GR_SAVE_PFS
3100       mov           GR_SAVE_PFS=ar.pfs           // Save ar.pfs
3101 }
3102 { .mfi
3103 .fframe 64
3104       add sp=-64,sp                              // Allocate the 64-byte frame
3105       nop.f 0
3106       mov GR_SAVE_GP=gp                          // Save gp
3107 };;
3108
3109 // (2) Spill parameter 2, form the parameter 1 address from the new sp, and save b0 before the call overwrites it.
3110 { .mmi
3111       stfe [GR_Parameter_Y] = f1,16              // Store f1 as parameter 2 at sp+32; post-increment leaves GR_Parameter_Y = sp+48
3112       add GR_Parameter_X = 16,sp                 // Parameter 1 address = sp+16
3113 .save   b0, GR_SAVE_B0
3114       mov GR_SAVE_B0=b0                          // Save b0
3115 };;
3116
3117 .body
3118 // (3) Spill parameter 1 (f10) and parameter 3 (f8), then call the error handler.
3119 { .mib
3120       stfe [GR_Parameter_X] = f10                // Store f10 as parameter 1 at sp+16
3121       add   GR_Parameter_RESULT = 0,GR_Parameter_Y  // Parameter 3 address = sp+48
3122       nop.b 0
3123 }
3124 { .mib
3125       stfe [GR_Parameter_Y] = f8                 // Store f8 as parameter 3 at sp+48
3126       add   GR_Parameter_Y = -16,GR_Parameter_Y  // Step GR_Parameter_Y back to parameter 2 (sp+32)
3127       br.call.sptk b0=__libm_error_support#      // Call error handling function
3128 };;
3129 { .mmi
3130       nop.m 0
3131       nop.m 0
3132       add   GR_Parameter_RESULT = 48,sp          // Recompute the parameter 3 address (sp+48) after the call
3133 };;
3134
3135 // (4) Reload the result from the parameter 3 slot, pop the frame, restore b0, gp and ar.pfs, and return.
3136 { .mmi
3137       ldfe  f8 = [GR_Parameter_RESULT]           // Get return result off stack
3138 .restore sp
3139       add   sp = 64,sp                           // Release the 64-byte frame
3140       mov   b0 = GR_SAVE_B0                      // Restore return address
3141 };;
3142 { .mib
3143       mov   gp = GR_SAVE_GP                      // Restore gp
3144       mov   ar.pfs = GR_SAVE_PFS                 // Restore ar.pfs
3145       br.ret.sptk     b0                         // Return
3146 };;
3147
3148 LOCAL_LIBM_END(__libm_error_region)
3149
3150 .type   __libm_error_support#,@function
3151 .global __libm_error_support#
3152
3153
3154 // *******************************************************************
3155 // *******************************************************************
3156 // *******************************************************************
3157 //
3158 //     Special Code to handle very large argument case.
3159 //     Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63
3160 //     The interface is custom:
3161 //       On input:
3162 //         (Arg or x) is in f8
3163 //       On output:
3164 //         r is in f8
3165 //         c is in f9
3166 //         N is in r8
3167 //     We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127.  We
3168 //     use this to eliminate save/restore of key fp registers in this calling
3169 //     function.
3170 //
3171 // *******************************************************************
3172 // *******************************************************************
3173 // *******************************************************************
3174
3175 LOCAL_LIBM_ENTRY(__libm_callout)
3176 TANL_ARG_TOO_LARGE:                        // Reduce x with __libm_pi_by_2_reduce, then dispatch on |r| to TANL_SMALL_R or TANL_NORMAL_R
3177 .prologue
3178 { .mfi
3179       add table_ptr2 = 144, table_base        // Point to 2^-2
3180       nop.f 999
3181 .save   ar.pfs,GR_SAVE_PFS
3182       mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
3183 }
3184 ;;
3185
3186 //     Load the pair 2^-2, -2^-2; start forming the B masks; save b0 before the call overwrites it
3187 { .mmi
3188       ldfps  TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2]
3189       setf.sig B_mask1 = bmask1               // Form mask to get 5 msb of r
3190 .save   b0, GR_SAVE_B0
3191       mov GR_SAVE_B0=b0                       // Save b0
3192 };;
3193
3194 .body
3195 //
3196 //     Call argument reduction with x in f8
3197 //     Returns with N in r8, r in f8, c in f9
3198 //     Assumes f71-127 are preserved across the call
3199 //
3200 { .mib
3201       setf.sig B_mask2 = bmask2               // Form mask to form B from r
3202       mov GR_SAVE_GP=gp                       // Save gp
3203       br.call.sptk b0=__libm_pi_by_2_reduce#  // Results come back as described in the note above
3204 }
3205 ;;
3206
3207 //
3208 //     Is |r| < 2**(-2)?  First half of the test: p6 = (r < 2^-2)
3209 //
3210 { .mfi
3211       getf.sig sig_r = r                     // Extract significand of r (read by the lookup at TANL_NORMAL_R_A)
3212       fcmp.lt.s1  p6, p0 = r, TWO_TO_NEG2
3213       mov   gp = GR_SAVE_GP                  // Restore gp
3214 }
3215 ;;
3216
3217 { .mfi
3218       getf.exp exp_r = r                     // Extract signexp of r (its bit 0 is tested at TANL_NORMAL_R_A)
3219       nop.f 999
3220       mov    b0 = GR_SAVE_B0                 // Restore return address
3221 }
3222 ;;
3223
3224 //
3225 //     Get N_fix_gr from r8; second half of the test leaves p6 = (-2^-2 < r < 2^-2)
3226 //
3227 { .mfi
3228       mov   N_fix_gr = r8
3229 (p6)  fcmp.gt.unc.s1  p6, p0 = r, NEGTWO_TO_NEG2
3230       mov   ar.pfs = GR_SAVE_PFS             // Restore pfs
3231 }
3232 ;;
3233
3234 { .mbb
3235       nop.m 999
3236 (p6)  br.cond.spnt TANL_SMALL_R              // Branch if |r| < 1/4
3237       br.cond.sptk TANL_NORMAL_R             // Branch if 1/4 <= |r| < pi/4
3238 }
3239 ;;
3240
3241 LOCAL_LIBM_END(__libm_callout)
3242
3243 .type __libm_pi_by_2_reduce#,@function
3244 .global __libm_pi_by_2_reduce#