sysdeps/ia64/fpu/s_cosl.S

   1 .file "sincosl.s"
   2
   3 // Copyright (C) 2000, 2001, Intel Corporation
   4 // All rights reserved.
   5 //
   6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
   7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
   8 //
   9 // Redistribution and use in source and binary forms, with or without
  10 // modification, are permitted provided that the following conditions are
  11 // met:
  12 //
  13 // * Redistributions of source code must retain the above copyright
  14 // notice, this list of conditions and the following disclaimer.
  15 //
  16 // * Redistributions in binary form must reproduce the above copyright
  17 // notice, this list of conditions and the following disclaimer in the
  18 // documentation and/or other materials provided with the distribution.
  19 //
  20 // * The name of Intel Corporation may not be used to endorse or promote
  21 // products derived from this software without specific prior written
  22 // permission.
  23 //
  24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35 //
  36 // Intel Corporation is the author of this code, and requests that all
  37 // problem reports or change requests be submitted to it directly at
  38 // http://developer.intel.com/opensource.
  39 //
  40 // *********************************************************************
  41 //
  42 // History:
  43 // 2/02/2000 (hand-optimized)
  44 // 4/04/00  Unwind support added
  45 //
  46 // *********************************************************************
  47 //
  48 // Function:   Combined sinl(x) and cosl(x), where
  49 //
  50 //             sinl(x) = sine(x), for double-extended precision x values
  51 //             cosl(x) = cosine(x), for double-extended precision x values
  52 //
  53 // *********************************************************************
  54 //
  55 // Resources Used:
  56 //
  57 //    Floating-Point Registers: f8 (Input and Return Value)
  58 //                              f32-f99
  59 //
  60 //    General Purpose Registers:
  61 //      r32-r43
  62 //      r44-r45 (Used to pass arguments to pi_by_2 reduce routine)
  63 //
  64 //    Predicate Registers:      p6-p13
  65 //
  66 // *********************************************************************
  67 //
  68 //  IEEE Special Conditions:
  69 //
  70 //    Denormal  fault raised on denormal inputs
  71 //    Overflow exceptions do not occur
  72 //    Underflow exceptions raised when appropriate for sin
  73 //    (No specialized error handling for this routine)
  74 //    Inexact raised when appropriate by algorithm
  75 //
  76 //    sinl(SNaN) = QNaN
  77 //    sinl(QNaN) = QNaN
  78 //    sinl(inf) = QNaN
  79 //    sinl(+/-0) = +/-0
  80 //    cosl(inf) = QNaN
  81 //    cosl(SNaN) = QNaN
  82 //    cosl(QNaN) = QNaN
  83 //    cosl(0) = 1
  84 //
  85 // *********************************************************************
  86 //
  87 //  Mathematical Description
  88 //  ========================
  89 //
  90 //  The computation of FSIN and FCOS is best handled in one piece of
  91 //  code. The main reason is that given any argument Arg, computation
  92 //  of trigonometric functions first calculate N and an approximation
  93 //  to alpha where
  94 //
  95 //  Arg = N pi/2 + alpha, |alpha| <= pi/4.
  96 //
  97 //  Since
  98 //
  99 //  cosl( Arg ) = sinl( (N+1) pi/2 + alpha ),
 100 //
 101 //  therefore, the code for computing sine will produce cosine as long
 102 //  as 1 is added to N immediately after the argument reduction
 103 //  process.
 104 //
 105 //  Let M = N if sine
 106 //      N+1 if cosine.
 107 //
 108 //  Now, given
 109 //
 110 //  Arg = M pi/2  + alpha, |alpha| <= pi/4,
 111 //
 112 //  let I = M mod 4, or I be the two lsb of M when M is represented
 113 //  as 2's complement. I = [i_0 i_1]. Then
 114 //
 115 //  sinl( Arg ) = (-1)^i_0  sinl( alpha )       if i_1 = 0,
 116 //             = (-1)^i_0  cosl( alpha )     if i_1 = 1.
 117 //
 118 //  For example:
 119 //       if M = -1, I = 11
 120 //         sin ((-pi/2 + alpha) = (-1) cos (alpha)
 121 //       if M = 0, I = 00
 122 //         sin (alpha) = sin (alpha)
 123 //       if M = 1, I = 01
 124 //         sin (pi/2 + alpha) = cos (alpha)
 125 //       if M = 2, I = 10
 126 //         sin (pi + alpha) = (-1) sin (alpha)
 127 //       if M = 3, I = 11
 128 //         sin ((3/2)pi + alpha) = (-1) cos (alpha)
 129 //
 130 //  The value of alpha is obtained by argument reduction and
 131 //  represented by two working precision numbers r and c where
 132 //
 133 //  alpha =  r  +  c     accurately.
 134 //
 135 //  The reduction method is described in a previous write up.
 136 //  The argument reduction scheme identifies 4 cases. For Cases 2
 137 //  and 4, because |alpha| is small, sinl(r+c) and cosl(r+c) can be
 138 //  computed very easily by 2 or 3 terms of the Taylor series
 139 //  expansion as follows:
 140 //
 141 //  Case 2:
 142 //  -------
 143 //
 144 //  sinl(r + c) = r + c - r^3/6 accurately
 145 //  cosl(r + c) = 1 - 2^(-67)   accurately
 146 //
 147 //  Case 4:
 148 //  -------
 149 //
 150 //  sinl(r + c) = r + c - r^3/6 + r^5/120       accurately
 151 //  cosl(r + c) = 1 - r^2/2 + r^4/24            accurately
 152 //
 153 //  The only cases left are Cases 1 and 3 of the argument reduction
 154 //  procedure. These two cases will be merged since after the
 155 //  argument is reduced in either cases, we have the reduced argument
 156 //  represented as r + c and that the magnitude |r + c| is not small
 157 //  enough to allow the usage of a very short approximation.
 158 //
 159 //  The required calculation is either
 160 //
 161 //  sinl(r + c)  =  sinl(r)  +  correction,  or
 162 //  cosl(r + c)  =  cosl(r)  +  correction.
 163 //
 164 //  Specifically,
 165 //
 166 //      sinl(r + c) = sinl(r) + c sin'(r) + O(c^2)
 167 //                 = sinl(r) + c cos (r) + O(c^2)
 168 //                 = sinl(r) + c(1 - r^2/2)  accurately.
 169 //  Similarly,
 170 //
 171 //      cosl(r + c) = cosl(r) - c sinl(r) + O(c^2)
 172 //                 = cosl(r) - c(r - r^3/6)  accurately.
 173 //
 174 //  We therefore concentrate on accurately calculating sinl(r) and
 175 //  cosl(r) for a working-precision number r, |r| <= pi/4 to within
 176 //  0.1% or so.
 177 //
 178 //  The greatest challenge of this task is that the second terms of
 179 //  the Taylor series
 180 //
 181 //      r - r^3/3! + r^r/5! - ...
 182 //
 183 //  and
 184 //
 185 //      1 - r^2/2! + r^4/4! - ...
 186 //
 187 //  are not very small when |r| is close to pi/4 and the rounding
 188 //  errors will be a concern if simple polynomial accumulation is
 189 //  used. When |r| < 2^-3, however, the second terms will be small
 190 //  enough (6 bits or so of right shift) that a normal Horner
 191 //  recurrence suffices. Hence there are two cases that we consider
 192 //  in the accurate computation of sinl(r) and cosl(r), |r| <= pi/4.
 193 //
 194 //  Case small_r: |r| < 2^(-3)
 195 //  --------------------------
 196 //
 197 //  Since Arg = M pi/4 + r + c accurately, and M mod 4 is [i_0 i_1],
 198 //  we have
 199 //
 200 //      sinl(Arg) = (-1)^i_0 * sinl(r + c)      if i_1 = 0
 201 //               = (-1)^i_0 * cosl(r + c)       if i_1 = 1
 202 //
 203 //  can be accurately approximated by
 204 //
 205 //  sinl(Arg) = (-1)^i_0 * [sinl(r) + c]        if i_1 = 0
 206 //           = (-1)^i_0 * [cosl(r) - c*r] if i_1 = 1
 207 //
 208 //  because |r| is small and thus the second terms in the correction
 209 //  are unneccessary.
 210 //
 211 //  Finally, sinl(r) and cosl(r) are approximated by polynomials of
 212 //  moderate lengths.
 213 //
 214 //  sinl(r) =  r + S_1 r^3 + S_2 r^5 + ... + S_5 r^11
 215 //  cosl(r) =  1 + C_1 r^2 + C_2 r^4 + ... + C_5 r^10
 216 //
 217 //  We can make use of predicates to selectively calculate
 218 //  sinl(r) or cosl(r) based on i_1.
 219 //
 220 //  Case normal_r: 2^(-3) <= |r| <= pi/4
 221 //  ------------------------------------
 222 //
 223 //  This case is more likely than the previous one if one considers
 224 //  r to be uniformly distributed in [-pi/4 pi/4]. Again,
 225 //
 226 //  sinl(Arg) = (-1)^i_0 * sinl(r + c)  if i_1 = 0
 227 //           = (-1)^i_0 * cosl(r + c)   if i_1 = 1.
 228 //
 229 //  Because |r| is now larger, we need one extra term in the
 230 //  correction. sinl(Arg) can be accurately approximated by
 231 //
 232 //  sinl(Arg) = (-1)^i_0 * [sinl(r) + c(1-r^2/2)]      if i_1 = 0
 233 //           = (-1)^i_0 * [cosl(r) - c*r*(1 - r^2/6)]    i_1 = 1.
 234 //
 235 //  Finally, sinl(r) and cosl(r) are approximated by polynomials of
 236 //  moderate lengths.
 237 //
 238 //      sinl(r) =  r + PP_1_hi r^3 + PP_1_lo r^3 +
 239 //                    PP_2 r^5 + ... + PP_8 r^17
 240 //
 241 //      cosl(r) =  1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16
 242 //
 243 //  where PP_1_hi is only about 16 bits long and QQ_1 is -1/2.
 244 //  The crux in accurate computation is to calculate
 245 //
 246 //  r + PP_1_hi r^3   or  1 + QQ_1 r^2
 247 //
 248 //  accurately as two pieces: U_hi and U_lo. The way to achieve this
 249 //  is to obtain r_hi as a 10 sig. bit number that approximates r to
 250 //  roughly 8 bits or so of accuracy. (One convenient way is
 251 //
 252 //  r_hi := frcpa( frcpa( r ) ).)
 253 //
 254 //  This way,
 255 //
 256 //      r + PP_1_hi r^3 =  r + PP_1_hi r_hi^3 +
 257 //                              PP_1_hi (r^3 - r_hi^3)
 258 //                      =  [r + PP_1_hi r_hi^3]  +
 259 //                         [PP_1_hi (r - r_hi)
 260 //                            (r^2 + r_hi r + r_hi^2) ]
 261 //                      =  U_hi  +  U_lo
 262 //
 263 //  Since r_hi is only 10 bit long and PP_1_hi is only 16 bit long,
 264 //  PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed
 265 //  exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign
 266 //  and that there is no more than 8 bit shift off between r and
 267 //  PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus
 268 //  calculated without any error. Finally, the fact that
 269 //
 270 //      |U_lo| <= 2^(-8) |U_hi|
 271 //
 272 //  says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly
 273 //  8 extra bits of accuracy.
 274 //
 275 //  Similarly,
 276 //
 277 //      1 + QQ_1 r^2  =  [1 + QQ_1 r_hi^2]  +
 278 //                          [QQ_1 (r - r_hi)(r + r_hi)]
 279 //                    =  U_hi  +  U_lo.
 280 //
 281 //  Summarizing, we calculate r_hi = frcpa( frcpa( r ) ).
 282 //
 283 //  If i_1 = 0, then
 284 //
 285 //    U_hi := r + PP_1_hi * r_hi^3
 286 //    U_lo := PP_1_hi * (r - r_hi) * (r^2 + r*r_hi + r_hi^2)
 287 //    poly := PP_1_lo r^3 + PP_2 r^5 + ... + PP_8 r^17
 288 //    correction := c * ( 1 + C_1 r^2 )
 289 //
 290 //  Else ...i_1 = 1
 291 //
 292 //    U_hi := 1 + QQ_1 * r_hi * r_hi
 293 //    U_lo := QQ_1 * (r - r_hi) * (r + r_hi)
 294 //    poly := QQ_2 * r^4 + QQ_3 * r^6 + ... + QQ_8 r^16
 295 //    correction := -c * r * (1 + S_1 * r^2)
 296 //
 297 //  End
 298 //
 299 //  Finally,
 300 //
 301 //      V := poly + ( U_lo + correction )
 302 //
 303 //                 /    U_hi  +  V         if i_0 = 0
 304 //      result := |
 305 //                 \  (-U_hi) -  V         if i_0 = 1
 306 //
 307 //  It is important that in the last step, negation of U_hi is
 308 //  performed prior to the subtraction which is to be performed in
 309 //  the user-set rounding mode.
 310 //
 311 //
 312 //  Algorithmic Description
 313 //  =======================
 314 //
 315 //  The argument reduction algorithm is tightly integrated into FSIN
 316 //  and FCOS which share the same code. The following is complete and
 317 //  self-contained. The argument reduction description given
 318 //  previously is repeated below.
 319 //
 320 //
 321 //  Step 0. Initialization.
 322 //
 323 //   If FSIN is invoked, set N_inc := 0; else if FCOS is invoked,
 324 //   set N_inc := 1.
 325 //
 326 //  Step 1. Check for exceptional and special cases.
 327 //
 328 //   * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special
 329 //     handling.
 330 //   * If |Arg| < 2^24, go to Step 2 for reduction of moderate
 331 //     arguments. This is the most likely case.
 332 //   * If |Arg| < 2^63, go to Step 8 for pre-reduction of large
 333 //     arguments.
 334 //   * If |Arg| >= 2^63, go to Step 10 for special handling.
 335 //
 336 //  Step 2. Reduction of moderate arguments.
 337 //
 338 //  If |Arg| < pi/4     ...quick branch
 339 //     N_fix := N_inc   (integer)
 340 //     r     := Arg
 341 //     c     := 0.0
 342 //     Branch to Step 4, Case_1_complete
 343 //  Else                ...cf. argument reduction
 344 //     N     := Arg * two_by_PI (fp)
 345 //     N_fix := fcvt.fx( N )    (int)
 346 //     N     := fcvt.xf( N_fix )
 347 //     N_fix := N_fix + N_inc
 348 //     s     := Arg - N * P_1   (first piece of pi/2)
 349 //     w     := -N * P_2        (second piece of pi/2)
 350 //
 351 //     If |s| >= 2^(-33)
 352 //        go to Step 3, Case_1_reduce
 353 //     Else
 354 //        go to Step 7, Case_2_reduce
 355 //     Endif
 356 //  Endif
 357 //
 358 //  Step 3. Case_1_reduce.
 359 //
 360 //  r := s + w
 361 //  c := (s - r) + w    ...observe order
 362 //
 363 //  Step 4. Case_1_complete
 364 //
 365 //  ...At this point, the reduced argument alpha is
 366 //  ...accurately represented as r + c.
 367 //  If |r| < 2^(-3), go to Step 6, small_r.
 368 //
 369 //  Step 5. Normal_r.
 370 //
 371 //  Let [i_0 i_1] by the 2 lsb of N_fix.
 372 //  FR_rsq  := r * r
 373 //  r_hi := frcpa( frcpa( r ) )
 374 //  r_lo := r - r_hi
 375 //
 376 //  If i_1 = 0, then
 377 //    poly := r*FR_rsq*(PP_1_lo + FR_rsq*(PP_2 + ... FR_rsq*PP_8))
 378 //    U_hi := r + PP_1_hi*r_hi*r_hi*r_hi        ...any order
 379 //    U_lo := PP_1_hi*r_lo*(r*r + r*r_hi + r_hi*r_hi)
 380 //    correction := c + c*C_1*FR_rsq            ...any order
 381 //  Else
 382 //    poly := FR_rsq*FR_rsq*(QQ_2 + FR_rsq*(QQ_3 + ... + FR_rsq*QQ_8))
 383 //    U_hi := 1 + QQ_1 * r_hi * r_hi            ...any order
 384 //    U_lo := QQ_1 * r_lo * (r + r_hi)
 385 //    correction := -c*(r + S_1*FR_rsq*r)       ...any order
 386 //  Endif
 387 //
 388 //  V := poly + (U_lo + correction)     ...observe order
 389 //
 390 //  result := (i_0 == 0?   1.0 : -1.0)
 391 //
 392 //  Last instruction in user-set rounding mode
 393 //
 394 //  result := (i_0 == 0?   result*U_hi + V :
 395 //                        result*U_hi - V)
 396 //
 397 //  Return
 398 //
 399 //  Step 6. Small_r.
 400 //
 401 //  ...Use flush to zero mode without causing exception
 402 //    Let [i_0 i_1] be the two lsb of N_fix.
 403 //
 404 //  FR_rsq := r * r
 405 //
 406 //  If i_1 = 0 then
 407 //     z := FR_rsq*FR_rsq; z := FR_rsq*z *r
 408 //     poly_lo := S_3 + FR_rsq*(S_4 + FR_rsq*S_5)
 409 //     poly_hi := r*FR_rsq*(S_1 + FR_rsq*S_2)
 410 //     correction := c
 411 //     result := r
 412 //  Else
 413 //     z := FR_rsq*FR_rsq; z := FR_rsq*z
 414 //     poly_lo := C_3 + FR_rsq*(C_4 + FR_rsq*C_5)
 415 //     poly_hi := FR_rsq*(C_1 + FR_rsq*C_2)
 416 //     correction := -c*r
 417 //     result := 1
 418 //  Endif
 419 //
 420 //  poly := poly_hi + (z * poly_lo + correction)
 421 //
 422 //  If i_0 = 1, result := -result
 423 //
 424 //  Last operation. Perform in user-set rounding mode
 425 //
 426 //  result := (i_0 == 0?     result + poly :
 427 //                          result - poly )
 428 //  Return
 429 //
 430 //  Step 7. Case_2_reduce.
 431 //
 432 //  ...Refer to the write up for argument reduction for
 433 //  ...rationale. The reduction algorithm below is taken from
 434 //  ...argument reduction description and integrated this.
 435 //
 436 //  w := N*P_3
 437 //  U_1 := N*P_2 + w            ...FMA
 438 //  U_2 := (N*P_2 - U_1) + w    ...2 FMA
 439 //  ...U_1 + U_2 is  N*(P_2+P_3) accurately
 440 //
 441 //  r := s - U_1
 442 //  c := ( (s - r) - U_1 ) - U_2
 443 //
 444 //  ...The mathematical sum r + c approximates the reduced
 445 //  ...argument accurately. Note that although compared to
 446 //  ...Case 1, this case requires much more work to reduce
 447 //  ...the argument, the subsequent calculation needed for
 448 //  ...any of the trigonometric function is very little because
 449 //  ...|alpha| < 1.01*2^(-33) and thus two terms of the
 450 //  ...Taylor series expansion suffices.
 451 //
 452 //  If i_1 = 0 then
 453 //     poly := c + S_1 * r * r * r      ...any order
 454 //     result := r
 455 //  Else
 456 //     poly := -2^(-67)
 457 //     result := 1.0
 458 //  Endif
 459 //
 460 //  If i_0 = 1, result := -result
 461 //
 462 //  Last operation. Perform in user-set rounding mode
 463 //
 464 //  result := (i_0 == 0?     result + poly :
 465 //                           result - poly )
 466 //
 467 //  Return
 468 //
 469 //
 470 //  Step 8. Pre-reduction of large arguments.
 471 //
 472 //  ...Again, the following reduction procedure was described
 473 //  ...in the separate write up for argument reduction, which
 474 //  ...is tightly integrated here.
 475
 476 //  N_0 := Arg * Inv_P_0
 477 //  N_0_fix := fcvt.fx( N_0 )
 478 //  N_0 := fcvt.xf( N_0_fix)
 479
 480 //  Arg' := Arg - N_0 * P_0
 481 //  w := N_0 * d_1
 482 //  N := Arg' * two_by_PI
 483 //  N_fix := fcvt.fx( N )
 484 //  N := fcvt.xf( N_fix )
 485 //  N_fix := N_fix + N_inc
 486 //
 487 //  s := Arg' - N * P_1
 488 //  w := w - N * P_2
 489 //
 490 //  If |s| >= 2^(-14)
 491 //     go to Step 3
 492 //  Else
 493 //     go to Step 9
 494 //  Endif
 495 //
 496 //  Step 9. Case_4_reduce.
 497 //
 498 //    ...first obtain N_0*d_1 and -N*P_2 accurately
 499 //   U_hi := N_0 * d_1          V_hi := -N*P_2
 500 //   U_lo := N_0 * d_1 - U_hi   V_lo := -N*P_2 - U_hi   ...FMAs
 501 //
 502 //   ...compute the contribution from N_0*d_1 and -N*P_3
 503 //   w := -N*P_3
 504 //   w := w + N_0*d_2
 505 //   t := U_lo + V_lo + w               ...any order
 506 //
 507 //   ...at this point, the mathematical value
 508 //   ...s + U_hi + V_hi  + t approximates the true reduced argument
 509 //   ...accurately. Just need to compute this accurately.
 510 //
 511 //   ...Calculate U_hi + V_hi accurately:
 512 //   A := U_hi + V_hi
 513 //   if |U_hi| >= |V_hi| then
 514 //      a := (U_hi - A) + V_hi
 515 //   else
 516 //      a := (V_hi - A) + U_hi
 517 //   endif
 518 //   ...order in computing "a" must be observed. This branch is
 519 //   ...best implemented by predicates.
 520 //   ...A + a  is U_hi + V_hi accurately. Moreover, "a" is
 521 //   ...much smaller than A: |a| <= (1/2)ulp(A).
 522 //
 523 //   ...Just need to calculate   s + A + a + t
 524 //   C_hi := s + A              t := t + a
 525 //   C_lo := (s - C_hi) + A
 526 //   C_lo := C_lo + t
 527 //
 528 //   ...Final steps for reduction
 529 //   r := C_hi + C_lo
 530 //   c := (C_hi - r) + C_lo
 531 //
 532 //   ...At this point, we have r and c
 533 //   ...And all we need is a couple of terms of the corresponding
 534 //   ...Taylor series.
 535 //
 536 //   If i_1 = 0
 537 //      poly := c + r*FR_rsq*(S_1 + FR_rsq*S_2)
 538 //      result := r
 539 //   Else
 540 //      poly := FR_rsq*(C_1 + FR_rsq*C_2)
 541 //      result := 1
 542 //   Endif
 543 //
 544 //   If i_0 = 1, result := -result
 545 //
 546 //   Last operation. Perform in user-set rounding mode
 547 //
 548 //   result := (i_0 == 0?     result + poly :
 549 //                            result - poly )
 550 //   Return
 551 //
 552 //   Large Arguments: For arguments above 2**63, a Payne-Hanek
 553 //   style argument reduction is used and pi_by_2 reduce is called.
 554 //
 555
 556 #include "libm_support.h"
 557
 558 #ifdef _LIBC
 559 .rodata
 560 #else
 561 .data
 562 #endif
 563 .align 64
 564
 565 FSINCOSL_CONSTANTS:
 566 ASM_TYPE_DIRECTIVE(FSINCOSL_CONSTANTS,@object)
 567 data4 0x4B800000, 0xCB800000, 0x00000000,0x00000000 // two**24, -two**24
 568 data4 0x4E44152A, 0xA2F9836E, 0x00003FFE,0x00000000 // Inv_pi_by_2
 569 data4 0xCE81B9F1, 0xC84D32B0, 0x00004016,0x00000000 // P_0
 570 data4 0x2168C235, 0xC90FDAA2, 0x00003FFF,0x00000000 // P_1
 571 data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD,0x00000000 // P_2
 572 data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C,0x00000000 // P_3
 573 data4 0x5F000000, 0xDF000000, 0x00000000,0x00000000 // two_to_63, -two_to_63
 574 data4 0x6EC6B45A, 0xA397E504, 0x00003FE7,0x00000000 // Inv_P_0
 575 data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF,0x00000000 // d_1
 576 data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C,0x00000000 // d_2
 577 data4 0x2168C234, 0xC90FDAA2, 0x00003FFE,0x00000000 // pi_by_4
 578 data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE,0x00000000 // neg_pi_by_4
 579 data4 0x3E000000, 0xBE000000, 0x00000000,0x00000000 // two**-3, -two**-3
 580 data4 0x2F000000, 0xAF000000, 0x9E000000,0x00000000 // two**-33, -two**-33, -two**-67
 581 data4 0xA21C0BC9, 0xCC8ABEBC, 0x00003FCE,0x00000000 // PP_8
 582 data4 0x720221DA, 0xD7468A05, 0x0000BFD6,0x00000000 // PP_7
 583 data4 0x640AD517, 0xB092382F, 0x00003FDE,0x00000000 // PP_6
 584 data4 0xD1EB75A4, 0xD7322B47, 0x0000BFE5,0x00000000 // PP_5
 585 data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1
 586 data4 0x00000000, 0xAAAA0000, 0x0000BFFC,0x00000000 // PP_1_hi
 587 data4 0xBAF69EEA, 0xB8EF1D2A, 0x00003FEC,0x00000000 // PP_4
 588 data4 0x0D03BB69, 0xD00D00D0, 0x0000BFF2,0x00000000 // PP_3
 589 data4 0x88888962, 0x88888888, 0x00003FF8,0x00000000 // PP_2
 590 data4 0xAAAB0000, 0xAAAAAAAA, 0x0000BFEC,0x00000000 // PP_1_lo
 591 data4 0xC2B0FE52, 0xD56232EF, 0x00003FD2,0x00000000 // QQ_8
 592 data4 0x2B48DCA6, 0xC9C99ABA, 0x0000BFDA,0x00000000 // QQ_7
 593 data4 0x9C716658, 0x8F76C650, 0x00003FE2,0x00000000 // QQ_6
 594 data4 0xFDA8D0FC, 0x93F27DBA, 0x0000BFE9,0x00000000 // QQ_5
 595 data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1
 596 data4 0x00000000, 0x80000000, 0x0000BFFE,0x00000000 // QQ_1
 597 data4 0x0C6E5041, 0xD00D00D0, 0x00003FEF,0x00000000 // QQ_4
 598 data4 0x0B607F60, 0xB60B60B6, 0x0000BFF5,0x00000000 // QQ_3
 599 data4 0xAAAAAA9B, 0xAAAAAAAA, 0x00003FFA,0x00000000 // QQ_2
 600 data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1
 601 data4 0xAAAA719F, 0xAAAAAAAA, 0x00003FFA,0x00000000 // C_2
 602 data4 0x0356F994, 0xB60B60B6, 0x0000BFF5,0x00000000 // C_3
 603 data4 0xB2385EA9, 0xD00CFFD5, 0x00003FEF,0x00000000 // C_4
 604 data4 0x292A14CD, 0x93E4BD18, 0x0000BFE9,0x00000000 // C_5
 605 data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1
 606 data4 0x888868DB, 0x88888888, 0x00003FF8,0x00000000 // S_2
 607 data4 0x055EFD4B, 0xD00D00D0, 0x0000BFF2,0x00000000 // S_3
 608 data4 0x839730B9, 0xB8EF1C5D, 0x00003FEC,0x00000000 // S_4
 609 data4 0xE5B3F492, 0xD71EA3A4, 0x0000BFE5,0x00000000 // S_5
 610 data4 0x38800000, 0xB8800000, 0x00000000            // two**-14, -two**-14
 611 ASM_SIZE_DIRECTIVE(FSINCOSL_CONSTANTS)
 612
 613 FR_Input_X        = f8
 614 FR_Neg_Two_to_M3  = f32
 615 FR_Two_to_63      = f32
 616 FR_Two_to_24      = f33
 617 FR_Pi_by_4        = f33
 618 FR_Two_to_M14     = f34
 619 FR_Two_to_M33     = f35
 620 FR_Neg_Two_to_24  = f36
 621 FR_Neg_Pi_by_4    = f36
 622 FR_Neg_Two_to_M14 = f37
 623 FR_Neg_Two_to_M33 = f38
 624 FR_Neg_Two_to_M67 = f39
 625 FR_Inv_pi_by_2    = f40
 626 FR_N_float        = f41
 627 FR_N_fix          = f42
 628 FR_P_1            = f43
 629 FR_P_2            = f44
 630 FR_P_3            = f45
 631 FR_s              = f46
 632 FR_w              = f47
 633 FR_c              = f48
 634 FR_r              = f49
 635 FR_Z              = f50
 636 FR_A              = f51
 637 FR_a              = f52
 638 FR_t              = f53
 639 FR_U_1            = f54
 640 FR_U_2            = f55
 641 FR_C_1            = f56
 642 FR_C_2            = f57
 643 FR_C_3            = f58
 644 FR_C_4            = f59
 645 FR_C_5            = f60
 646 FR_S_1            = f61
 647 FR_S_2            = f62
 648 FR_S_3            = f63
 649 FR_S_4            = f64
 650 FR_S_5            = f65
 651 FR_poly_hi        = f66
 652 FR_poly_lo        = f67
 653 FR_r_hi           = f68
 654 FR_r_lo           = f69
 655 FR_rsq            = f70
 656 FR_r_cubed        = f71
 657 FR_C_hi           = f72
 658 FR_N_0            = f73
 659 FR_d_1            = f74
 660 FR_V              = f75
 661 FR_V_hi           = f75
 662 FR_V_lo           = f76
 663 FR_U_hi           = f77
 664 FR_U_lo           = f78
 665 FR_U_hiabs        = f79
 666 FR_V_hiabs        = f80
 667 FR_PP_8           = f81
 668 FR_QQ_8           = f81
 669 FR_PP_7           = f82
 670 FR_QQ_7           = f82
 671 FR_PP_6           = f83
 672 FR_QQ_6           = f83
 673 FR_PP_5           = f84
 674 FR_QQ_5           = f84
 675 FR_PP_4           = f85
 676 FR_QQ_4           = f85
 677 FR_PP_3           = f86
 678 FR_QQ_3           = f86
 679 FR_PP_2           = f87
 680 FR_QQ_2           = f87
 681 FR_QQ_1           = f88
 682 FR_N_0_fix        = f89
 683 FR_Inv_P_0        = f90
 684 FR_corr           = f91
 685 FR_poly           = f92
 686 FR_d_2            = f93
 687 FR_Two_to_M3      = f94
 688 FR_Neg_Two_to_63  = f94
 689 FR_P_0            = f95
 690 FR_C_lo           = f96
 691 FR_PP_1           = f97
 692 FR_PP_1_lo        = f98
 693 FR_ArgPrime       = f99
 694
 695 GR_Table_Base  = r32
 696 GR_Table_Base1 = r33
 697 GR_i_0         = r34
 698 GR_i_1         = r35
 699 GR_N_Inc       = r36
 700 GR_Sin_or_Cos  = r37
 701
 702 // Added for unwind support
 703
 704 GR_SAVE_B0     = r39
 705 GR_SAVE_GP     = r40
 706 GR_SAVE_PFS    = r41
 707
 708
 709 .global sinl#
 710 .global cosl#
 711 #ifdef _LIBC
 712 .global __sinl#
 713 .global __cosl#
 714 #endif
 715
 716 .section .text
 717 .proc sinl#
 718 #ifdef _LIBC
 719 .proc __sinl#
 720 #endif
 721 .align 64
 722 sinl:
 723 #ifdef _LIBC
 724 __sinl:
 725 #endif
 726 { .mlx
 727 alloc GR_Table_Base = ar.pfs,0,12,2,0
 728 (p0)   movl GR_Sin_or_Cos = 0x0 ;;
 729 }
 730
 731 { .mmi
 732       nop.m 999
 733 (p0)  addl           GR_Table_Base   = @ltoff(FSINCOSL_CONSTANTS#), gp
 734       nop.i 999
 735 }
 736 ;;
 737
 738 { .mmb
 739       ld8 GR_Table_Base = [GR_Table_Base]
 740       nop.m 999
 741 (p0)   br.cond.sptk L(SINCOSL_CONTINUE) ;;
 742 }
 743 ;;
 744
 745
 746 .endp sinl#
 747 ASM_SIZE_DIRECTIVE(sinl#)
 748
 749 .section .text
 750 .proc cosl#
 751 cosl:
 752 #ifdef _LIBC
 753 .proc __cosl#
 754 __cosl:
 755 #endif
 756 { .mlx
 757 alloc GR_Table_Base= ar.pfs,0,12,2,0
 758 (p0)   movl GR_Sin_or_Cos = 0x1 ;;
 759 }
 760 ;;
 761
 762 { .mmi
 763       nop.m 999
 764 (p0)  addl           GR_Table_Base   = @ltoff(FSINCOSL_CONSTANTS#), gp
 765       nop.i 999
 766 }
 767 ;;
 768
 769 { .mmb
 770       ld8 GR_Table_Base = [GR_Table_Base]
 771       nop.m 999
 772       nop.b 999
 773 }
 774 ;;
 775
 776
 777
 778 //
 779 //     Load Table Address
 780 //
 781
 782 L(SINCOSL_CONTINUE):
 783 { .mmi
 784 (p0)   add GR_Table_Base1 = 96, GR_Table_Base
 785 (p0)   ldfs     FR_Two_to_24 = [GR_Table_Base], 4
 786 // GR_Sin_or_Cos denotes
 787 (p0)   mov   r39 = b0 ;;
 788 }
 789 { .mmi
 790        nop.m 0
 791 //
 792 //     Load 2**24, load 2**63.
 793 //
 794 (p0)   ldfs     FR_Neg_Two_to_24 = [GR_Table_Base], 12
 795        nop.i 0
 796 }
 797 { .mfi
 798 (p0)   ldfs     FR_Two_to_63 = [GR_Table_Base1], 4
 799 //
 800 //     Check for unnormals - unsupported operands. We do not want
 801 //     to generate denormal exception
 802 //     Check for NatVals, QNaNs, SNaNs, +/-Infs
 803 //     Check for EM unsupporteds
 804 //     Check for Zero
 805 //
 806 (p0)   fclass.m.unc  p6, p0 =  FR_Input_X, 0x1E3
 807        nop.i 0
 808 };;
 809 { .mmf
 810         nop.m 999
 811 (p0)   ldfs     FR_Neg_Two_to_63 = [GR_Table_Base1], 12
 812 (p0)   fclass.nm.unc p8, p0 =  FR_Input_X, 0x1FF
 813 }
 814 { .mfb
 815         nop.m 999
 816 (p0)   fclass.m.unc p10, p0 = FR_Input_X, 0x007
 817 (p6)   br.cond.spnt L(SINCOSL_SPECIAL) ;;
 818 }
 819 { .mib
 820         nop.m 999
 821         nop.i 999
 822 (p8)   br.cond.spnt L(SINCOSL_SPECIAL) ;;
 823 }
 824 { .mib
 825         nop.m 999
 826         nop.i 999
 827 //
 828 //     Branch if +/- NaN, Inf.
 829 //     Load -2**24, load -2**63.
 830 //
 831 (p10)  br.cond.spnt L(SINCOSL_ZERO) ;;
 832 }
 833 { .mmb
 834 (p0)   ldfe     FR_Inv_pi_by_2 = [GR_Table_Base], 16
 835 (p0)   ldfe     FR_Inv_P_0 = [GR_Table_Base1], 16
 836         nop.b 999 ;;
 837 }
 838 { .mmb
 839 (p0)   ldfe             FR_d_1 = [GR_Table_Base1], 16
 840 //
 841 //     Raise possible denormal operand flag with useful fcmp
 842 //     Is x <= -2**63
 843 //     Load Inv_P_0 for pre-reduction
 844 //     Load Inv_pi_by_2
 845 //
 846 (p0)   ldfe             FR_P_0 = [GR_Table_Base], 16
 847         nop.b 999 ;;
 848 }
 849 { .mmb
 850 (p0)   ldfe     FR_d_2 = [GR_Table_Base1], 16
 851 //
 852 //     Load P_0
 853 //     Load d_1
 854 //     Is x >= 2**63
 855 //     Is x <= -2**24?
 856 //
 857 (p0)   ldfe     FR_P_1 = [GR_Table_Base], 16
 858         nop.b 999 ;;
 859 }
 860 //
 861 //     Load P_1
 862 //     Load d_2
 863 //     Is x >= 2**24?
 864 //
 865 { .mfi
 866 (p0)   ldfe     FR_P_2 = [GR_Table_Base], 16
 867 (p0)   fcmp.le.unc.s1   p7, p8 = FR_Input_X, FR_Neg_Two_to_24
 868         nop.i 999 ;;
 869 }
 870 { .mbb
 871 (p0)   ldfe     FR_P_3 = [GR_Table_Base], 16
 872         nop.b 999
 873         nop.b 999 ;;
 874 }
 875 { .mfi
 876         nop.m 999
 877 (p8)   fcmp.ge.s1 p7, p0 = FR_Input_X, FR_Two_to_24
 878         nop.i 999
 879 }
 880 { .mfi
 881 (p0)   ldfe     FR_Pi_by_4 = [GR_Table_Base1], 16
 882 //
 883 //     Branch if +/- zero.
 884 //     Decide about the paths to take:
 885 //     If -2**24 < FR_Input_X < 2**24 - CASE 1 OR 2
 886 //     OTHERWISE - CASE 3 OR 4
 887 //
 888 (p0)   fcmp.le.unc.s0   p10, p11 = FR_Input_X, FR_Neg_Two_to_63
 889         nop.i 999 ;;
 890 }
 891 { .mmi
 892 (p0)   ldfe     FR_Neg_Pi_by_4 = [GR_Table_Base1], 16 ;;
 893 (p0)   ldfs     FR_Two_to_M3 = [GR_Table_Base1], 4
 894         nop.i 999
 895 }
 896 { .mfi
 897         nop.m 999
 898 (p11)  fcmp.ge.s1       p10, p0 = FR_Input_X, FR_Two_to_63
 899         nop.i 999 ;;
 900 }
 901 { .mib
 902 (p0)   ldfs     FR_Neg_Two_to_M3 = [GR_Table_Base1], 12
 903         nop.i 999
 904 //
 905 //     Load P_2
 906 //     Load P_3
 907 //     Load pi_by_4
 908 //     Load neg_pi_by_4
 909 //     Load 2**(-3)
 910 //     Load -2**(-3).
 911 //
 912 (p10)  br.cond.spnt L(SINCOSL_ARG_TOO_LARGE) ;;
 913 }
 914 { .mib
 915         nop.m 999
 916         nop.i 999
 917 //
 918 //     Branch out if x >= 2**63. Use Payne-Hanek Reduction
 919 //
 920 (p7)   br.cond.spnt L(SINCOSL_LARGER_ARG) ;;
 921 }
 922 { .mfi
 923         nop.m 999
 924 //
 925 //     Branch if Arg <= -2**24 or Arg >= 2**24 and use pre-reduction.
 926 //
 927 (p0)   fma.s1   FR_N_float = FR_Input_X, FR_Inv_pi_by_2, f0
 928         nop.i 999 ;;
 929 }
 930 { .mfi
 931         nop.m 999
 932 (p0)   fcmp.lt.unc.s1   p6, p7 = FR_Input_X, FR_Pi_by_4
 933         nop.i 999 ;;
 934 }
 935 { .mfi
 936         nop.m 999
 937 //
 938 //     Select the case when |Arg| < pi/4
 939 //     Else Select the case when |Arg| >= pi/4
 940 //
 941 (p0)   fcvt.fx.s1 FR_N_fix = FR_N_float
 942         nop.i 999 ;;
 943 }
 944 { .mfi
 945         nop.m 999
 946 //
 947 //     N  = Arg * 2/pi
 948 //     Check if Arg < pi/4
 949 //
 950 (p6)   fcmp.gt.s1 p6, p7 = FR_Input_X, FR_Neg_Pi_by_4
 951         nop.i 999 ;;
 952 }
 953 //
 954 //     Case 2: Convert integer N_fix back to normalized floating-point value.
 955 //     Case 1: p8 is only affected  when p6 is set
 956 //
 957 { .mfi
 958 (p7)   ldfs FR_Two_to_M33 = [GR_Table_Base1], 4
 959 //
 960 //     Grab the integer part of N and call it N_fix
 961 //
 962 (p6)   fmerge.se FR_r = FR_Input_X, FR_Input_X
 963 //     If |x| < pi/4, r = x and c = 0
 964 //     lf |x| < pi/4, is x < 2**(-3).
 965 //     r = Arg
 966 //     c = 0
 967 (p6)   mov GR_N_Inc = GR_Sin_or_Cos ;;
 968 }
 969 { .mmf
 970         nop.m 999
 971 (p7)   ldfs FR_Neg_Two_to_M33 = [GR_Table_Base1], 4
 972 (p6)   fmerge.se FR_c = f0, f0
 973 }
 974 { .mfi
 975         nop.m 999
 976 (p6)   fcmp.lt.unc.s1   p8, p9 = FR_Input_X, FR_Two_to_M3
 977         nop.i 999 ;;
 978 }
 979 { .mfi
 980         nop.m 999
 981 //
 982 //     lf |x| < pi/4, is -2**(-3)< x < 2**(-3) - set p8.
 983 //     If |x| >= pi/4,
 984 //     Create the right N for |x| < pi/4 and otherwise
 985 //     Case 2: Place integer part of N in GP register
 986 //
 987 (p7)   fcvt.xf FR_N_float = FR_N_fix
 988         nop.i 999 ;;
 989 }
 990 { .mmf
 991         nop.m 999
 992 (p7)   getf.sig GR_N_Inc = FR_N_fix
 993 (p8)   fcmp.gt.s1 p8, p0 = FR_Input_X, FR_Neg_Two_to_M3 ;;
 994 }
 995 { .mib
 996         nop.m 999
 997         nop.i 999
 998 //
 999 //     Load 2**(-33), -2**(-33)
1000 //
1001 (p8)   br.cond.spnt L(SINCOSL_SMALL_R) ;;
1002 }
1003 { .mib
1004         nop.m 999
1005         nop.i 999
1006 (p6)   br.cond.sptk L(SINCOSL_NORMAL_R) ;;
1007 }
1008 //
1009 //     if |x| < pi/4, branch based on |x| < 2**(-3) or otherwise.
1010 //
1011 //
1012 //     In this branch, |x| >= pi/4.
1013 //
1014 { .mfi
1015 (p0)   ldfs FR_Neg_Two_to_M67 = [GR_Table_Base1], 8
1016 //
1017 //     Load -2**(-67)
1018 //
1019 (p0)   fnma.s1  FR_s = FR_N_float, FR_P_1, FR_Input_X
1020 //
1021 //     w = N * P_2
1022 //     s = -N * P_1  + Arg
1023 //
1024 (p0)   add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos
1025 }
1026 { .mfi
1027         nop.m 999
1028 (p0)   fma.s1   FR_w = FR_N_float, FR_P_2, f0
1029         nop.i 999 ;;
1030 }
1031 { .mfi
1032         nop.m 999
1033 //
1034 //     Adjust N_fix by N_inc to determine whether sine or
1035 //     cosine is being calculated
1036 //
1037 (p0)   fcmp.lt.unc.s1 p7, p6 = FR_s, FR_Two_to_M33
1038         nop.i 999 ;;
1039 }
1040 { .mfi
1041         nop.m 999
1042 (p7)   fcmp.gt.s1 p7, p6 = FR_s, FR_Neg_Two_to_M33
1043         nop.i 999 ;;
1044 }
1045 { .mfi
1046         nop.m 999
1047 //     Remember x >= pi/4.
1048 //     Is s <= -2**(-33) or s >= 2**(-33) (p6)
1049 //     or -2**(-33) < s < 2**(-33) (p7)
1050 (p6)   fms.s1 FR_r = FR_s, f1, FR_w
1051         nop.i 999
1052 }
1053 { .mfi
1054         nop.m 999
1055 (p7)   fma.s1 FR_w = FR_N_float, FR_P_3, f0
1056         nop.i 999 ;;
1057 }
1058 { .mfi
1059         nop.m 999
1060 (p7)   fma.s1 FR_U_1 = FR_N_float, FR_P_2, FR_w
1061         nop.i 999
1062 }
1063 { .mfi
1064         nop.m 999
1065 (p6)   fms.s1 FR_c = FR_s, f1, FR_r
1066         nop.i 999 ;;
1067 }
1068 { .mfi
1069         nop.m 999
1070 //
1071 //     For big s: r = s - w: No futher reduction is necessary
1072 //     For small s: w = N * P_3 (change sign) More reduction
1073 //
1074 (p6)   fcmp.lt.unc.s1 p8, p9 = FR_r, FR_Two_to_M3
1075         nop.i 999 ;;
1076 }
1077 { .mfi
1078         nop.m 999
1079 (p8)   fcmp.gt.s1 p8, p9 = FR_r, FR_Neg_Two_to_M3
1080         nop.i 999 ;;
1081 }
1082 { .mfi
1083         nop.m 999
1084 (p7)   fms.s1 FR_r = FR_s, f1, FR_U_1
1085         nop.i 999
1086 }
1087 { .mfb
1088         nop.m 999
1089 //
1090 //     For big s: Is |r| < 2**(-3)?
1091 //     For big s: c = S - r
1092 //     For small s: U_1 = N * P_2 + w
1093 //
1094 //     If p8 is set, prepare to branch to Small_R.
1095 //     If p9 is set, prepare to branch to Normal_R.
1096 //     For big s,  r is complete here.
1097 //
1098 (p6)   fms.s1 FR_c = FR_c, f1, FR_w
1099 //
1100 //     For big s: c = c + w (w has not been negated.)
1101 //     For small s: r = S - U_1
1102 //
1103 (p8)   br.cond.spnt     L(SINCOSL_SMALL_R) ;;
1104 }
1105 { .mib
1106         nop.m 999
1107         nop.i 999
1108 (p9)   br.cond.sptk     L(SINCOSL_NORMAL_R) ;;
1109 }
1110 { .mfi
1111 (p7)   add GR_Table_Base1 = 224, GR_Table_Base1
1112 //
1113 //     Branch to SINCOSL_SMALL_R or SINCOSL_NORMAL_R
1114 //
1115 (p7)   fms.s1 FR_U_2 = FR_N_float, FR_P_2, FR_U_1
1116 //
1117 //     c = S - U_1
1118 //     r = S_1 * r
1119 //
1120 //
1121 (p7)   extr.u   GR_i_1 = GR_N_Inc, 0, 1 ;;
1122 }
1123 { .mmi
1124         nop.m 999
1125 //
1126 //     Get [i_0,i_1] - two lsb of N_fix_gr.
1127 //     Do dummy fmpy so inexact is always set.
1128 //
1129 (p7)   cmp.eq.unc p9, p10 = 0x0, GR_i_1
1130 (p7)   extr.u   GR_i_0 = GR_N_Inc, 1, 1 ;;
1131 }
1132 //
1133 //     For small s: U_2 = N * P_2 - U_1
1134 //     S_1 stored constant - grab the one stored with the
1135 //     coefficients.
1136 //
1137 { .mfi
1138 (p7)   ldfe FR_S_1 = [GR_Table_Base1], 16
1139 //
1140 //     Check if i_1 and i_0  != 0
1141 //
1142 (p10)  fma.s1   FR_poly = f0, f1, FR_Neg_Two_to_M67
1143 (p7)   cmp.eq.unc p11, p12 = 0x0, GR_i_0 ;;
1144 }
1145 { .mfi
1146         nop.m 999
1147 (p7)   fms.s1   FR_s = FR_s, f1, FR_r
1148         nop.i 999
1149 }
1150 { .mfi
1151         nop.m 999
1152 //
1153 //     S = S - r
1154 //     U_2 = U_2 + w
1155 //     load S_1
1156 //
1157 (p7)   fma.s1   FR_rsq = FR_r, FR_r, f0
1158         nop.i 999 ;;
1159 }
1160 { .mfi
1161         nop.m 999
1162 (p7)   fma.s1   FR_U_2 = FR_U_2, f1, FR_w
1163         nop.i 999
1164 }
1165 { .mfi
1166         nop.m 999
1167 (p7)   fmerge.se FR_Input_X = FR_r, FR_r
1168         nop.i 999 ;;
1169 }
1170 { .mfi
1171         nop.m 999
1172 (p10)  fma.s1 FR_Input_X = f0, f1, f1
1173         nop.i 999 ;;
1174 }
1175 { .mfi
1176         nop.m 999
1177 //
1178 //     FR_rsq = r * r
1179 //     Save r as the result.
1180 //
1181 (p7)   fms.s1   FR_c = FR_s, f1, FR_U_1
1182         nop.i 999 ;;
1183 }
1184 { .mfi
1185         nop.m 999
1186 //
1187 //     if ( i_1 ==0) poly = c + S_1*r*r*r
1188 //     else Result = 1
1189 //
1190 (p12)  fnma.s1 FR_Input_X = FR_Input_X, f1, f0
1191         nop.i 999
1192 }
1193 { .mfi
1194         nop.m 999
1195 (p7)   fma.s1   FR_r = FR_S_1, FR_r, f0
1196         nop.i 999 ;;
1197 }
1198 { .mfi
1199         nop.m 999
1200 (p7)   fma.s0   FR_S_1 = FR_S_1, FR_S_1, f0
1201         nop.i 999 ;;
1202 }
1203 { .mfi
1204         nop.m 999
1205 //
1206 //     If i_1 != 0, poly = 2**(-67)
1207 //
1208 (p7)   fms.s1 FR_c = FR_c, f1, FR_U_2
1209         nop.i 999 ;;
1210 }
1211 { .mfi
1212         nop.m 999
1213 //
1214 //     c = c - U_2
1215 //
1216 (p9)   fma.s1 FR_poly = FR_r, FR_rsq, FR_c
1217         nop.i 999 ;;
1218 }
1219 { .mfi
1220         nop.m 999
1221 //
1222 //     i_0 != 0, so Result = -Result
1223 //
1224 (p11)  fma.s0 FR_Input_X = FR_Input_X, f1, FR_poly
1225         nop.i 999 ;;
1226 }
1227 { .mfb
1228         nop.m 999
1229 (p12)  fms.s0 FR_Input_X = FR_Input_X, f1, FR_poly
1230 //
1231 //     if (i_0 == 0),  Result = Result + poly
1232 //     else            Result = Result - poly
1233 //
1234 (p0)    br.ret.sptk   b0 ;;
1235 }
1236 L(SINCOSL_LARGER_ARG):
1237 { .mfi
1238         nop.m 999
1239 (p0)   fma.s1 FR_N_0 = FR_Input_X, FR_Inv_P_0, f0
1240         nop.i 999
1241 }
1242 ;;
1243
1244 //     This path for argument > 2*24
1245 //     Adjust table_ptr1 to beginning of table.
1246 //
1247
1248 { .mmi
1249       nop.m 999
1250 (p0)  addl           GR_Table_Base   = @ltoff(FSINCOSL_CONSTANTS#), gp
1251       nop.i 999
1252 }
1253 ;;
1254
1255 { .mmi
1256       ld8 GR_Table_Base = [GR_Table_Base]
1257       nop.m 999
1258       nop.i 999
1259 }
1260 ;;
1261
1262
1263 //
1264 //     Point to  2*-14
1265 //     N_0 = Arg * Inv_P_0
1266 //
1267 { .mmi
1268 (p0)   add GR_Table_Base = 688, GR_Table_Base ;;
1269 (p0)   ldfs FR_Two_to_M14 = [GR_Table_Base], 4
1270         nop.i 999 ;;
1271 }
1272 { .mfi
1273 (p0)   ldfs FR_Neg_Two_to_M14 = [GR_Table_Base], 0
1274         nop.f 999
1275         nop.i 999 ;;
1276 }
1277 { .mfi
1278         nop.m 999
1279 //
1280 //     Load values 2**(-14) and -2**(-14)
1281 //
1282 (p0)   fcvt.fx.s1 FR_N_0_fix = FR_N_0
1283         nop.i 999 ;;
1284 }
1285 { .mfi
1286         nop.m 999
1287 //
1288 //     N_0_fix  = integer part of N_0
1289 //
1290 (p0)   fcvt.xf FR_N_0 = FR_N_0_fix
1291         nop.i 999 ;;
1292 }
1293 { .mfi
1294         nop.m 999
1295 //
1296 //     Make N_0 the integer part
1297 //
1298 (p0)   fnma.s1 FR_ArgPrime = FR_N_0, FR_P_0, FR_Input_X
1299         nop.i 999
1300 }
1301 { .mfi
1302         nop.m 999
1303 (p0)   fma.s1 FR_w = FR_N_0, FR_d_1, f0
1304         nop.i 999 ;;
1305 }
1306 { .mfi
1307         nop.m 999
1308 //
1309 //     Arg' = -N_0 * P_0 + Arg
1310 //     w  = N_0 * d_1
1311 //
1312 (p0)   fma.s1 FR_N_float = FR_ArgPrime, FR_Inv_pi_by_2, f0
1313         nop.i 999 ;;
1314 }
1315 { .mfi
1316         nop.m 999
1317 //
1318 //     N = A' * 2/pi
1319 //
1320 (p0)   fcvt.fx.s1 FR_N_fix = FR_N_float
1321         nop.i 999 ;;
1322 }
1323 { .mfi
1324         nop.m 999
1325 //
1326 //     N_fix is the integer part
1327 //
1328 (p0)   fcvt.xf FR_N_float = FR_N_fix
1329         nop.i 999 ;;
1330 }
1331 { .mfi
1332 (p0)   getf.sig GR_N_Inc = FR_N_fix
1333         nop.f 999
1334         nop.i 999 ;;
1335 }
1336 { .mii
1337         nop.m 999
1338         nop.i 999 ;;
1339 (p0)   add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos ;;
1340 }
1341 { .mfi
1342         nop.m 999
1343 //
1344 //     N is the integer part of the reduced-reduced argument.
1345 //     Put the integer in a GP register
1346 //
1347 (p0)   fnma.s1 FR_s = FR_N_float, FR_P_1, FR_ArgPrime
1348         nop.i 999
1349 }
1350 { .mfi
1351         nop.m 999
1352 (p0)   fnma.s1 FR_w = FR_N_float, FR_P_2, FR_w
1353         nop.i 999 ;;
1354 }
1355 { .mfi
1356         nop.m 999
1357 //
1358 //     s = -N*P_1 + Arg'
1359 //     w = -N*P_2 + w
1360 //     N_fix_gr = N_fix_gr + N_inc
1361 //
1362 (p0)   fcmp.lt.unc.s1 p9, p8 = FR_s, FR_Two_to_M14
1363         nop.i 999 ;;
1364 }
1365 { .mfi
1366         nop.m 999
1367 (p9)   fcmp.gt.s1 p9, p8 = FR_s, FR_Neg_Two_to_M14
1368         nop.i 999 ;;
1369 }
1370 { .mfi
1371         nop.m 999
1372 //
1373 //     For |s|  > 2**(-14) r = S + w (r complete)
1374 //     Else       U_hi = N_0 * d_1
1375 //
1376 (p9)   fma.s1 FR_V_hi = FR_N_float, FR_P_2, f0
1377         nop.i 999
1378 }
1379 { .mfi
1380         nop.m 999
1381 (p9)   fma.s1 FR_U_hi = FR_N_0, FR_d_1, f0
1382         nop.i 999 ;;
1383 }
1384 { .mfi
1385         nop.m 999
1386 //
1387 //     Either S <= -2**(-14) or S >= 2**(-14)
1388 //     or -2**(-14) < s < 2**(-14)
1389 //
1390 (p8)   fma.s1 FR_r = FR_s, f1, FR_w
1391         nop.i 999
1392 }
1393 { .mfi
1394         nop.m 999
1395 (p9)   fma.s1 FR_w = FR_N_float, FR_P_3, f0
1396         nop.i 999 ;;
1397 }
1398 { .mfi
1399         nop.m 999
1400 //
1401 //     We need abs of both U_hi and V_hi - don't
1402 //     worry about switched sign of V_hi.
1403 //
1404 (p9)   fms.s1 FR_A = FR_U_hi, f1, FR_V_hi
1405         nop.i 999
1406 }
1407 { .mfi
1408         nop.m 999
1409 //
1410 //     Big s: finish up c = (S - r) + w (c complete)
1411 //     Case 4: A =  U_hi + V_hi
1412 //     Note: Worry about switched sign of V_hi, so subtract instead of add.
1413 //
1414 (p9)   fnma.s1 FR_V_lo = FR_N_float, FR_P_2, FR_V_hi
1415         nop.i 999 ;;
1416 }
1417 { .mmf
1418         nop.m 999
1419         nop.m 999
1420 (p9)   fms.s1 FR_U_lo = FR_N_0, FR_d_1, FR_U_hi
1421 }
1422 { .mfi
1423         nop.m 999
1424 (p9)   fmerge.s FR_V_hiabs = f0, FR_V_hi
1425         nop.i 999 ;;
1426 }
1427 { .mfi
1428         nop.m 999
1429 //     For big s: c = S - r
1430 //     For small s do more work: U_lo = N_0 * d_1 - U_hi
1431 //
1432 (p9)   fmerge.s FR_U_hiabs = f0, FR_U_hi
1433         nop.i 999
1434 }
1435 { .mfi
1436         nop.m 999
1437 //
1438 //     For big s: Is |r| < 2**(-3)
1439 //     For big s: if p12 set, prepare to branch to Small_R.
1440 //     For big s: If p13 set, prepare to branch to Normal_R.
1441 //
1442 (p8)   fms.s1 FR_c = FR_s, f1, FR_r
1443         nop.i 999 ;;
1444 }
1445 { .mfi
1446         nop.m 999
1447 //
1448 //     For small S: V_hi = N * P_2
1449 //                  w = N * P_3
1450 //     Note the product does not include the (-) as in the writeup
1451 //     so (-) missing for V_hi and w.
1452 //
1453 (p8)   fcmp.lt.unc.s1 p12, p13 = FR_r, FR_Two_to_M3
1454         nop.i 999 ;;
1455 }
1456 { .mfi
1457         nop.m 999
1458 (p12)  fcmp.gt.s1 p12, p13 = FR_r, FR_Neg_Two_to_M3
1459         nop.i 999 ;;
1460 }
1461 { .mfi
1462         nop.m 999
1463 (p8)   fma.s1 FR_c = FR_c, f1, FR_w
1464         nop.i 999
1465 }
1466 { .mfb
1467         nop.m 999
1468 (p9)   fms.s1 FR_w = FR_N_0, FR_d_2, FR_w
1469 (p12)  br.cond.spnt L(SINCOSL_SMALL_R) ;;
1470 }
1471 { .mib
1472         nop.m 999
1473         nop.i 999
1474 (p13)  br.cond.sptk L(SINCOSL_NORMAL_R) ;;
1475 }
1476 { .mfi
1477         nop.m 999
1478 //
1479 //     Big s: Vector off when |r| < 2**(-3).  Recall that p8 will be true.
1480 //     The remaining stuff is for Case 4.
1481 //     Small s: V_lo = N * P_2 + U_hi (U_hi is in place of V_hi in writeup)
1482 //     Note: the (-) is still missing for V_lo.
1483 //     Small s: w = w + N_0 * d_2
1484 //     Note: the (-) is now incorporated in w.
1485 //
1486 (p9)   fcmp.ge.unc.s1 p10, p11 = FR_U_hiabs, FR_V_hiabs
1487 (p0)   extr.u   GR_i_1 = GR_N_Inc, 0, 1
1488 }
1489 { .mfi
1490         nop.m 999
1491 //
1492 //     C_hi = S + A
1493 //
1494 (p9)   fma.s1 FR_t = FR_U_lo, f1, FR_V_lo
1495 (p0)   extr.u   GR_i_0 = GR_N_Inc, 1, 1 ;;
1496 }
1497 { .mfi
1498         nop.m 999
1499 //
1500 //     t = U_lo + V_lo
1501 //
1502 //
1503 (p10)  fms.s1 FR_a = FR_U_hi, f1, FR_A
1504         nop.i 999 ;;
1505 }
1506 { .mfi
1507         nop.m 999
1508 (p11)  fma.s1 FR_a = FR_V_hi, f1, FR_A
1509         nop.i 999
1510 }
1511 ;;
1512
1513 { .mmi
1514       nop.m 999
1515 (p0)  addl           GR_Table_Base   = @ltoff(FSINCOSL_CONSTANTS#), gp
1516       nop.i 999
1517 }
1518 ;;
1519
1520 { .mmi
1521       ld8 GR_Table_Base = [GR_Table_Base]
1522       nop.m 999
1523       nop.i 999
1524 }
1525 ;;
1526
1527
1528 { .mfi
1529 (p0)   add GR_Table_Base = 528, GR_Table_Base
1530 //
1531 //     Is U_hiabs >= V_hiabs?
1532 //
1533 (p9)   fma.s1 FR_C_hi = FR_s, f1, FR_A
1534         nop.i 999 ;;
1535 }
1536 { .mmi
1537 (p0)   ldfe FR_C_1 = [GR_Table_Base], 16 ;;
1538 (p0)   ldfe FR_C_2 = [GR_Table_Base], 64
1539         nop.i 999 ;;
1540 }
1541 //
1542 //     c = c + C_lo  finished.
1543 //     Load  C_2
1544 //
1545 { .mfi
1546 (p0)   ldfe     FR_S_1 = [GR_Table_Base], 16
1547 //
1548 //     C_lo = S - C_hi
1549 //
1550 (p0)   fma.s1 FR_t = FR_t, f1, FR_w
1551         nop.i 999 ;;
1552 }
1553 //
1554 //     r and c have been computed.
1555 //     Make sure ftz mode is set - should be automatic when using wre
1556 //     |r| < 2**(-3)
1557 //     Get [i_0,i_1] - two lsb of N_fix.
1558 //     Load S_1
1559 //
1560 { .mfi
1561 (p0)   ldfe FR_S_2 = [GR_Table_Base], 64
1562 //
1563 //     t = t + w
1564 //
1565 (p10)  fms.s1 FR_a = FR_a, f1, FR_V_hi
1566 (p0)   cmp.eq.unc p9, p10 = 0x0, GR_i_0 ;;
1567 }
1568 { .mfi
1569         nop.m 999
1570 //
1571 //     For larger u than v: a = U_hi - A
1572 //     Else a = V_hi - A (do an add to account for missing (-) on V_hi
1573 //
1574 (p0)   fms.s1 FR_C_lo = FR_s, f1, FR_C_hi
1575         nop.i 999 ;;
1576 }
1577 { .mfi
1578         nop.m 999
1579 (p11)  fms.s1 FR_a = FR_U_hi, f1, FR_a
1580 (p0)   cmp.eq.unc p11, p12 = 0x0, GR_i_1 ;;
1581 }
1582 { .mfi
1583         nop.m 999
1584 //
1585 //     If u > v: a = (U_hi - A)  + V_hi
1586 //     Else      a = (V_hi - A)  + U_hi
1587 //     In each case account for negative missing from V_hi.
1588 //
1589 (p0)   fma.s1 FR_C_lo = FR_C_lo, f1, FR_A
1590         nop.i 999 ;;
1591 }
1592 { .mfi
1593         nop.m 999
1594 //
1595 //     C_lo = (S - C_hi) + A
1596 //
1597 (p0)   fma.s1 FR_t = FR_t, f1, FR_a
1598         nop.i 999 ;;
1599 }
1600 { .mfi
1601         nop.m 999
1602 //
1603 //     t = t + a
1604 //
1605 (p0)   fma.s1 FR_C_lo = FR_C_lo, f1, FR_t
1606         nop.i 999 ;;
1607 }
1608 { .mfi
1609         nop.m 999
1610 //
1611 //     C_lo = C_lo + t
1612 //     Adjust Table_Base to beginning of table
1613 //
1614 (p0)   fma.s1 FR_r = FR_C_hi, f1, FR_C_lo
1615         nop.i 999 ;;
1616 }
1617 { .mfi
1618         nop.m 999
1619 //
1620 //     Load S_2
1621 //
1622 (p0)   fma.s1 FR_rsq = FR_r, FR_r, f0
1623         nop.i 999
1624 }
1625 { .mfi
1626         nop.m 999
1627 //
1628 //     Table_Base points to C_1
1629 //     r = C_hi + C_lo
1630 //
1631 (p0)   fms.s1 FR_c = FR_C_hi, f1, FR_r
1632         nop.i 999 ;;
1633 }
1634 { .mfi
1635         nop.m 999
1636 //
1637 //     if i_1 ==0: poly = S_2 * FR_rsq + S_1
1638 //     else        poly = C_2 * FR_rsq + C_1
1639 //
1640 (p11)  fma.s1 FR_Input_X = f0, f1, FR_r
1641         nop.i 999 ;;
1642 }
1643 { .mfi
1644         nop.m 999
1645 (p12)  fma.s1 FR_Input_X = f0, f1, f1
1646         nop.i 999 ;;
1647 }
1648 { .mfi
1649         nop.m 999
1650 //
1651 //     Compute r_cube = FR_rsq * r
1652 //
1653 (p11)  fma.s1 FR_poly = FR_rsq, FR_S_2, FR_S_1
1654         nop.i 999 ;;
1655 }
1656 { .mfi
1657         nop.m 999
1658 (p12)  fma.s1 FR_poly = FR_rsq, FR_C_2, FR_C_1
1659         nop.i 999
1660 }
1661 { .mfi
1662         nop.m 999
1663 //
1664 //     Compute FR_rsq = r * r
1665 //     Is i_1 == 0 ?
1666 //
1667 (p0)   fma.s1 FR_r_cubed = FR_rsq, FR_r, f0
1668         nop.i 999 ;;
1669 }
1670 { .mfi
1671         nop.m 999
1672 //
1673 //     c = C_hi - r
1674 //     Load  C_1
1675 //
1676 (p0)   fma.s1 FR_c = FR_c, f1, FR_C_lo
1677         nop.i 999
1678 }
1679 { .mfi
1680         nop.m 999
1681 //
1682 //     if i_1 ==0: poly = r_cube * poly + c
1683 //     else        poly = FR_rsq * poly
1684 //
1685 (p10)  fms.s1 FR_Input_X = f0, f1, FR_Input_X
1686         nop.i 999 ;;
1687 }
1688 { .mfi
1689         nop.m 999
1690 //
1691 //     if i_1 ==0: Result = r
1692 //     else        Result = 1.0
1693 //
1694 (p11)  fma.s1 FR_poly = FR_r_cubed, FR_poly, FR_c
1695         nop.i 999 ;;
1696 }
1697 { .mfi
1698         nop.m 999
1699 (p12)  fma.s1 FR_poly = FR_rsq, FR_poly, f0
1700         nop.i 999 ;;
1701 }
1702 { .mfi
1703         nop.m 999
1704 //
1705 //     if i_0 !=0: Result = -Result
1706 //
1707 (p9)   fma.s0 FR_Input_X = FR_Input_X, f1, FR_poly
1708         nop.i 999 ;;
1709 }
1710 { .mfb
1711         nop.m 999
1712 (p10)  fms.s0 FR_Input_X = FR_Input_X, f1, FR_poly
1713 //
1714 //     if i_0 == 0: Result = Result + poly
1715 //     else         Result = Result - poly
1716 //
1717 (p0)    br.ret.sptk   b0 ;;
1718 }
1719 L(SINCOSL_SMALL_R):
1720 { .mii
1721         nop.m 999
1722 (p0)    extr.u  GR_i_1 = GR_N_Inc, 0, 1 ;;
1723 //
1724 //
1725 //      Compare both i_1 and i_0 with 0.
1726 //      if i_1 == 0, set p9.
1727 //      if i_0 == 0, set p11.
1728 //
1729 (p0)    cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;;
1730 }
1731 { .mfi
1732         nop.m 999
1733 (p0)    fma.s1 FR_rsq = FR_r, FR_r, f0
1734 (p0)    extr.u  GR_i_0 = GR_N_Inc, 1, 1 ;;
1735 }
1736 { .mfi
1737         nop.m 999
1738 //
1739 //      Z = Z * FR_rsq
1740 //
1741 (p10)   fnma.s1 FR_c = FR_c, FR_r, f0
1742 (p0)    cmp.eq.unc p11, p12 = 0x0, GR_i_0
1743 }
1744 ;;
1745
1746 // ******************************************************************
1747 // ******************************************************************
1748 // ******************************************************************
1749 //      r and c have been computed.
1750 //      We know whether this is the sine or cosine routine.
1751 //      Make sure ftz mode is set - should be automatic when using wre
1752 //      |r| < 2**(-3)
1753 //
1754 //      Set table_ptr1 to beginning of constant table.
1755 //      Get [i_0,i_1] - two lsb of N_fix_gr.
1756 //
1757
1758 { .mmi
1759       nop.m 999
1760 (p0)  addl           GR_Table_Base   = @ltoff(FSINCOSL_CONSTANTS#), gp
1761       nop.i 999
1762 }
1763 ;;
1764
1765 { .mmi
1766       ld8 GR_Table_Base = [GR_Table_Base]
1767       nop.m 999
1768       nop.i 999
1769 }
1770 ;;
1771
1772
1773 //
1774 //      Set table_ptr1 to point to S_5.
1775 //      Set table_ptr1 to point to C_5.
1776 //      Compute FR_rsq = r * r
1777 //
1778 { .mfi
1779 (p9)    add GR_Table_Base = 672, GR_Table_Base
1780 (p10)   fmerge.s FR_r = f1, f1
1781 (p10)   add GR_Table_Base = 592, GR_Table_Base ;;
1782 }
1783 //
1784 //      Set table_ptr1 to point to S_5.
1785 //      Set table_ptr1 to point to C_5.
1786 //
1787 { .mmi
1788 (p9)    ldfe FR_S_5 = [GR_Table_Base], -16 ;;
1789 //
1790 //      if (i_1 == 0) load S_5
1791 //      if (i_1 != 0) load C_5
1792 //
1793 (p9)    ldfe FR_S_4 = [GR_Table_Base], -16
1794         nop.i 999 ;;
1795 }
1796 { .mmf
1797 (p10)   ldfe FR_C_5 = [GR_Table_Base], -16
1798 //
1799 //      Z = FR_rsq * FR_rsq
1800 //
1801 (p9)    ldfe FR_S_3 = [GR_Table_Base], -16
1802 //
1803 //      Compute FR_rsq = r * r
1804 //      if (i_1 == 0) load S_4
1805 //      if (i_1 != 0) load C_4
1806 //
1807 (p0)    fma.s1 FR_Z = FR_rsq, FR_rsq, f0 ;;
1808 }
1809 //
1810 //      if (i_1 == 0) load S_3
1811 //      if (i_1 != 0) load C_3
1812 //
1813 { .mmi
1814 (p9)    ldfe FR_S_2 = [GR_Table_Base], -16 ;;
1815 //
1816 //      if (i_1 == 0) load S_2
1817 //      if (i_1 != 0) load C_2
1818 //
1819 (p9)    ldfe FR_S_1 = [GR_Table_Base], -16
1820         nop.i 999
1821 }
1822 { .mmi
1823 (p10)   ldfe FR_C_4 = [GR_Table_Base], -16 ;;
1824 (p10)   ldfe FR_C_3 = [GR_Table_Base], -16
1825         nop.i 999 ;;
1826 }
1827 { .mmi
1828 (p10)   ldfe FR_C_2 = [GR_Table_Base], -16 ;;
1829 (p10)   ldfe FR_C_1 = [GR_Table_Base], -16
1830         nop.i 999
1831 }
1832 { .mfi
1833         nop.m 999
1834 //
1835 //      if (i_1 != 0):
1836 //      poly_lo = FR_rsq * C_5 + C_4
1837 //      poly_hi = FR_rsq * C_2 + C_1
1838 //
1839 (p9)    fma.s1 FR_Z = FR_Z, FR_r, f0
1840         nop.i 999 ;;
1841 }
1842 { .mfi
1843         nop.m 999
1844 //
1845 //      if (i_1 == 0) load S_1
1846 //      if (i_1 != 0) load C_1
1847 //
1848 (p9)    fma.s1 FR_poly_lo = FR_rsq, FR_S_5, FR_S_4
1849         nop.i 999
1850 }
1851 { .mfi
1852         nop.m 999
1853 //
1854 //      c = -c * r
1855 //      dummy fmpy's to flag inexact.
1856 //
1857 (p9)    fma.s0 FR_S_4 = FR_S_4, FR_S_4, f0
1858         nop.i 999 ;;
1859 }
1860 { .mfi
1861         nop.m 999
1862 //
1863 //      poly_lo = FR_rsq * poly_lo + C_3
1864 //      poly_hi = FR_rsq * poly_hi
1865 //
1866 (p0)    fma.s1  FR_Z = FR_Z, FR_rsq, f0
1867         nop.i 999 ;;
1868 }
1869 { .mfi
1870         nop.m 999
1871 (p9)    fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1
1872         nop.i 999
1873 }
1874 { .mfi
1875         nop.m 999
1876 //
1877 //      if (i_1 == 0):
1878 //      poly_lo = FR_rsq * S_5 + S_4
1879 //      poly_hi = FR_rsq * S_2 + S_1
1880 //
1881 (p10)   fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4
1882         nop.i 999 ;;
1883 }
1884 { .mfi
1885         nop.m 999
1886 //
1887 //      if (i_1 == 0):
1888 //      Z = Z * r  for only one of the small r cases - not there
1889 //      in original implementation notes.
1890 //
1891 (p9)    fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3
1892         nop.i 999 ;;
1893 }
1894 { .mfi
1895         nop.m 999
1896 (p10)   fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1
1897         nop.i 999
1898 }
1899 { .mfi
1900         nop.m 999
1901 (p10)   fma.s0 FR_C_1 = FR_C_1, FR_C_1, f0
1902         nop.i 999 ;;
1903 }
1904 { .mfi
1905         nop.m 999
1906 (p9)    fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0
1907         nop.i 999
1908 }
1909 { .mfi
1910         nop.m 999
1911 //
1912 //      poly_lo = FR_rsq * poly_lo + S_3
1913 //      poly_hi = FR_rsq * poly_hi
1914 //
1915 (p10)   fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3
1916         nop.i 999 ;;
1917 }
1918 { .mfi
1919         nop.m 999
1920 (p10)   fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0
1921         nop.i 999 ;;
1922 }
1923 { .mfi
1924         nop.m 999
1925 //
1926 //      if (i_1 == 0): dummy fmpy's to flag inexact
1927 //      r = 1
1928 //
1929 (p9)    fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0
1930         nop.i 999
1931 }
1932 { .mfi
1933         nop.m 999
1934 //
1935 //      poly_hi = r * poly_hi
1936 //
1937 (p0)    fma.s1  FR_poly = FR_Z, FR_poly_lo, FR_c
1938         nop.i 999 ;;
1939 }
1940 { .mfi
1941         nop.m 999
1942 (p12)   fms.s1  FR_r = f0, f1, FR_r
1943         nop.i 999 ;;
1944 }
1945 { .mfi
1946         nop.m 999
1947 //
1948 //      poly_hi = Z * poly_lo + c
1949 //      if i_0 == 1: r = -r
1950 //
1951 (p0)    fma.s1  FR_poly = FR_poly, f1, FR_poly_hi
1952         nop.i 999 ;;
1953 }
1954 { .mfi
1955         nop.m 999
1956 (p12)   fms.s0 FR_Input_X = FR_r, f1, FR_poly
1957         nop.i 999
1958 }
1959 { .mfb
1960         nop.m 999
1961 //
1962 //      poly = poly + poly_hi
1963 //
1964 (p11)   fma.s0 FR_Input_X = FR_r, f1, FR_poly
1965 //
1966 //      if (i_0 == 0) Result = r + poly
1967 //      if (i_0 != 0) Result = r - poly
1968 //
1969 (p0)    br.ret.sptk   b0 ;;
1970 }
1971 L(SINCOSL_NORMAL_R):
1972 { .mii
1973         nop.m 999
1974 (p0)    extr.u  GR_i_1 = GR_N_Inc, 0, 1 ;;
1975 //
1976 //      Set table_ptr1 and table_ptr2 to base address of
1977 //      constant table.
1978 (p0)    cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;;
1979 }
1980 { .mfi
1981         nop.m 999
1982 (p0)    fma.s1  FR_rsq = FR_r, FR_r, f0
1983 (p0)    extr.u  GR_i_0 = GR_N_Inc, 1, 1 ;;
1984 }
1985 { .mfi
1986         nop.m 999
1987 (p0)    frcpa.s1 FR_r_hi, p6 = f1, FR_r
1988 (p0)    cmp.eq.unc p11, p12 = 0x0, GR_i_0
1989 }
1990 ;;
1991
1992 // ******************************************************************
1993 // ******************************************************************
1994 // ******************************************************************
1995 //
1996 //      r and c have been computed.
1997 //      We known whether this is the sine or cosine routine.
1998 //      Make sure ftz mode is set - should be automatic when using wre
1999 //      Get [i_0,i_1] - two lsb of N_fix_gr alone.
2000 //
2001
2002 { .mmi
2003       nop.m 999
2004 (p0)  addl           GR_Table_Base   = @ltoff(FSINCOSL_CONSTANTS#), gp
2005       nop.i 999
2006 }
2007 ;;
2008
2009 { .mmi
2010       ld8 GR_Table_Base = [GR_Table_Base]
2011       nop.m 999
2012       nop.i 999
2013 }
2014 ;;
2015
2016
2017 { .mfi
2018 (p10)   add GR_Table_Base = 384, GR_Table_Base
2019 (p12)   fms.s1 FR_Input_X = f0, f1, f1
2020 (p9)    add GR_Table_Base = 224, GR_Table_Base ;;
2021 }
2022 { .mfi
2023 (p10)   ldfe FR_QQ_8 = [GR_Table_Base], 16
2024 //
2025 //      if (i_1==0) poly = poly * FR_rsq + PP_1_lo
2026 //      else        poly = FR_rsq * poly
2027 //
2028 (p11)   fma.s1 FR_Input_X = f0, f1, f1
2029         nop.i 999 ;;
2030 }
2031 { .mmb
2032 (p10)   ldfe FR_QQ_7 = [GR_Table_Base], 16
2033 //
2034 //      Adjust table pointers based on i_0
2035 //      Compute rsq = r * r
2036 //
2037 (p9)    ldfe FR_PP_8 = [GR_Table_Base], 16
2038         nop.b 999 ;;
2039 }
2040 { .mfi
2041         nop.m 999
2042 (p0)    fma.s1 FR_r_cubed = FR_r, FR_rsq, f0
2043         nop.i 999 ;;
2044 }
2045 { .mmf
2046 (p9)    ldfe FR_PP_7 = [GR_Table_Base], 16
2047 (p10)   ldfe FR_QQ_6 = [GR_Table_Base], 16
2048 //
2049 //      Load PP_8 and QQ_8; PP_7 and QQ_7
2050 //
2051 (p0)    frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi ;;
2052 }
2053 //
2054 //      if (i_1==0) poly =   PP_7 + FR_rsq * PP_8.
2055 //      else        poly =   QQ_7 + FR_rsq * QQ_8.
2056 //
2057 { .mmb
2058 (p9)    ldfe FR_PP_6 = [GR_Table_Base], 16
2059 (p10)   ldfe FR_QQ_5 = [GR_Table_Base], 16
2060         nop.b 999 ;;
2061 }
2062 { .mmb
2063 (p9)    ldfe FR_PP_5 = [GR_Table_Base], 16
2064 (p10)   ldfe FR_S_1 = [GR_Table_Base], 16
2065         nop.b 999 ;;
2066 }
2067 { .mmb
2068 (p10)   ldfe FR_QQ_1 = [GR_Table_Base], 16
2069 (p9)    ldfe FR_C_1 = [GR_Table_Base], 16
2070         nop.b 999 ;;
2071 }
2072 { .mmb
2073 (p10)   ldfe FR_QQ_4 = [GR_Table_Base], 16
2074 (p9)    ldfe FR_PP_1 = [GR_Table_Base], 16
2075         nop.b 999 ;;
2076 }
2077 { .mmb
2078 (p10)   ldfe FR_QQ_3 = [GR_Table_Base], 16
2079 //
2080 //      if (i_1=0) corr = corr + c*c
2081 //      else       corr = corr * c
2082 //
2083 (p9)    ldfe FR_PP_4 = [GR_Table_Base], 16
2084         nop.b 999 ;;
2085 }
2086 { .mfi
2087         nop.m 999
2088 (p10)   fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7
2089         nop.i 999 ;;
2090 }
2091 //
2092 //      if (i_1=0) poly = rsq * poly + PP_5
2093 //      else       poly = rsq * poly + QQ_5
2094 //      Load PP_4 or QQ_4
2095 //
2096 { .mmi
2097 (p9)    ldfe FR_PP_3 = [GR_Table_Base], 16 ;;
2098 (p10)   ldfe FR_QQ_2 = [GR_Table_Base], 16
2099         nop.i 999
2100 }
2101 { .mfi
2102         nop.m 999
2103 //
2104 //      r_hi =   frcpa(frcpa(r)).
2105 //      r_cube = r * FR_rsq.
2106 //
2107 (p9)    fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7
2108         nop.i 999 ;;
2109 }
2110 //
2111 //      Do dummy multiplies so inexact is always set.
2112 //
2113 { .mfi
2114 (p9)    ldfe FR_PP_2 = [GR_Table_Base], 16
2115 //
2116 //      r_lo = r - r_hi
2117 //
2118 (p9)    fma.s1 FR_U_lo = FR_r_hi, FR_r_hi, f0
2119         nop.i 999 ;;
2120 }
2121 { .mbb
2122 (p9)    ldfe FR_PP_1_lo = [GR_Table_Base], 16
2123         nop.b 999
2124         nop.b 999 ;;
2125 }
2126 { .mfi
2127         nop.m 999
2128 (p10)   fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r
2129         nop.i 999
2130 }
2131 { .mfi
2132         nop.m 999
2133 (p10)   fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6
2134         nop.i 999 ;;
2135 }
2136 { .mfi
2137         nop.m 999
2138 //
2139 //      if (i_1=0) U_lo = r_hi * r_hi
2140 //      else       U_lo = r_hi + r
2141 //
2142 (p9)    fma.s1 FR_corr = FR_C_1, FR_rsq, f0
2143         nop.i 999 ;;
2144 }
2145 { .mfi
2146         nop.m 999
2147 //
2148 //      if (i_1=0) corr = C_1 * rsq
2149 //      else       corr = S_1 * r_cubed + r
2150 //
2151 (p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6
2152         nop.i 999 ;;
2153 }
2154 { .mfi
2155         nop.m 999
2156 (p10)   fma.s1 FR_U_lo = FR_r_hi, f1, FR_r
2157         nop.i 999
2158 }
2159 { .mfi
2160         nop.m 999
2161 //
2162 //      if (i_1=0) U_hi = r_hi + U_hi
2163 //      else       U_hi = QQ_1 * U_hi + 1
2164 //
2165 (p9)    fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_U_lo
2166         nop.i 999 ;;
2167 }
2168 { .mfi
2169         nop.m 999
2170 //
2171 //      U_hi = r_hi * r_hi
2172 //
2173 (p0)    fms.s1 FR_r_lo = FR_r, f1, FR_r_hi
2174         nop.i 999
2175 }
2176 { .mfi
2177         nop.m 999
2178 //
2179 //      Load PP_1, PP_6, PP_5, and C_1
2180 //      Load QQ_1, QQ_6, QQ_5, and S_1
2181 //
2182 (p0)    fma.s1 FR_U_hi = FR_r_hi, FR_r_hi, f0
2183         nop.i 999 ;;
2184 }
2185 { .mfi
2186         nop.m 999
2187 (p10)   fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5
2188         nop.i 999
2189 }
2190 { .mfi
2191         nop.m 999
2192 (p10)   fnma.s1 FR_corr = FR_corr, FR_c, f0
2193         nop.i 999 ;;
2194 }
2195 { .mfi
2196         nop.m 999
2197 //
2198 //      if (i_1=0) U_lo = r * r_hi + U_lo
2199 //      else       U_lo = r_lo * U_lo
2200 //
2201 (p9)    fma.s1 FR_corr = FR_corr, FR_c, FR_c
2202         nop.i 999 ;;
2203 }
2204 { .mfi
2205         nop.m 999
2206 (p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5
2207         nop.i 999
2208 }
2209 { .mfi
2210         nop.m 999
2211 //
2212 //      if (i_1 =0) U_hi = r + U_hi
2213 //      if (i_1 =0) U_lo = r_lo * U_lo
2214 //
2215 //
2216 (p9)    fma.s0 FR_PP_5 = FR_PP_5, FR_PP_4, f0
2217         nop.i 999 ;;
2218 }
2219 { .mfi
2220         nop.m 999
2221 (p9)    fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo
2222         nop.i 999 ;;
2223 }
2224 { .mfi
2225         nop.m 999
2226 (p10)   fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0
2227         nop.i 999 ;;
2228 }
2229 { .mfi
2230         nop.m 999
2231 //
2232 //      if (i_1=0) poly = poly * rsq + PP_6
2233 //      else       poly = poly * rsq + QQ_6
2234 //
2235 (p9)    fma.s1 FR_U_hi = FR_r_hi, FR_U_hi, f0
2236         nop.i 999
2237 }
2238 { .mfi
2239         nop.m 999
2240 (p10)   fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4
2241         nop.i 999 ;;
2242 }
2243 { .mfi
2244         nop.m 999
2245 (p10)   fma.s1 FR_U_hi = FR_QQ_1, FR_U_hi, f1
2246         nop.i 999
2247 }
2248 { .mfi
2249         nop.m 999
2250 (p10)   fma.s0 FR_QQ_5 = FR_QQ_5, FR_QQ_5, f0
2251         nop.i 999 ;;
2252 }
2253 { .mfi
2254         nop.m 999
2255 //
2256 //      if (i_1!=0) U_hi = PP_1 * U_hi
2257 //      if (i_1!=0) U_lo = r * r  + U_lo
2258 //      Load PP_3 or QQ_3
2259 //
2260 (p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4
2261         nop.i 999 ;;
2262 }
2263 { .mfi
2264         nop.m 999
2265 (p9)    fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0
2266         nop.i 999 ;;
2267 }
2268 { .mfi
2269         nop.m 999
2270 (p10)   fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0
2271         nop.i 999 ;;
2272 }
2273 { .mfi
2274         nop.m 999
2275 (p9)    fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0
2276         nop.i 999
2277 }
2278 { .mfi
2279         nop.m 999
2280 (p10)   fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3
2281         nop.i 999 ;;
2282 }
2283 { .mfi
2284         nop.m 999
2285 //
2286 //      Load PP_2, QQ_2
2287 //
2288 (p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3
2289         nop.i 999 ;;
2290 }
2291 { .mfi
2292         nop.m 999
2293 //
2294 //      if (i_1==0) poly = FR_rsq * poly  + PP_3
2295 //      else        poly = FR_rsq * poly  + QQ_3
2296 //      Load PP_1_lo
2297 //
2298 (p9)    fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0
2299         nop.i 999 ;;
2300 }
2301 { .mfi
2302         nop.m 999
2303 //
2304 //      if (i_1 =0) poly = poly * rsq + pp_r4
2305 //      else        poly = poly * rsq + qq_r4
2306 //
2307 (p9)    fma.s1 FR_U_hi = FR_r, f1, FR_U_hi
2308         nop.i 999
2309 }
2310 { .mfi
2311         nop.m 999
2312 (p10)   fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2
2313         nop.i 999 ;;
2314 }
2315 { .mfi
2316         nop.m 999
2317 //
2318 //      if (i_1==0) U_lo =  PP_1_hi * U_lo
2319 //      else        U_lo =  QQ_1 * U_lo
2320 //
2321 (p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2
2322         nop.i 999 ;;
2323 }
2324 { .mfi
2325         nop.m 999
2326 //
2327 //      if (i_0==0)  Result = 1
2328 //      else         Result = -1
2329 //
2330 (p0)    fma.s1 FR_V = FR_U_lo, f1, FR_corr
2331         nop.i 999 ;;
2332 }
2333 { .mfi
2334         nop.m 999
2335 (p10)   fma.s1 FR_poly = FR_rsq, FR_poly, f0
2336         nop.i 999 ;;
2337 }
2338 { .mfi
2339         nop.m 999
2340 //
2341 //      if (i_1==0) poly =  FR_rsq * poly + PP_2
2342 //      else poly =  FR_rsq * poly + QQ_2
2343 //
2344 (p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo
2345         nop.i 999 ;;
2346 }
2347 { .mfi
2348         nop.m 999
2349 (p10)   fma.s1 FR_poly = FR_rsq, FR_poly, f0
2350         nop.i 999 ;;
2351 }
2352 { .mfi
2353         nop.m 999
2354 //
2355 //      V = U_lo + corr
2356 //
2357 (p9)    fma.s1 FR_poly = FR_r_cubed, FR_poly, f0
2358         nop.i 999 ;;
2359 }
2360 { .mfi
2361         nop.m 999
2362 //
2363 //      if (i_1==0) poly = r_cube * poly
2364 //      else        poly = FR_rsq * poly
2365 //
2366 (p0)    fma.s1  FR_V = FR_poly, f1, FR_V
2367         nop.i 999 ;;
2368 }
2369 { .mfi
2370         nop.m 999
2371 (p12)   fms.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V
2372         nop.i 999
2373 }
2374 { .mfb
2375         nop.m 999
2376 //
2377 //      V = V + poly
2378 //
2379 (p11)   fma.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V
2380 //
2381 //      if (i_0==0) Result = Result * U_hi + V
2382 //      else        Result = Result * U_hi - V
2383 //
2384 (p0)    br.ret.sptk   b0
2385 };;
2386
2387 //
2388 //      If cosine, FR_Input_X = 1
2389 //      If sine, FR_Input_X = +/-Zero (Input FR_Input_X)
2390 //      Results are exact, no exceptions
2391 //
2392
2393 L(SINCOSL_ZERO):
2394 { .mbb
2395 (p0)    cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos
2396         nop.b 999
2397         nop.b 999 ;;
2398 }
2399 { .mfi
2400         nop.m 999
2401 (p7)    fmerge.s FR_Input_X = FR_Input_X, FR_Input_X
2402         nop.i 999
2403 }
2404 { .mfb
2405         nop.m 999
2406 (p6)    fmerge.s FR_Input_X = f1, f1
2407 (p0)    br.ret.sptk   b0 ;;
2408 }
2409 L(SINCOSL_SPECIAL):
2410 { .mfb
2411         nop.m 999
2412 //
2413 //      Path for Arg = +/- QNaN, SNaN, Inf
2414 //      Invalid can be raised. SNaNs
2415 //      become QNaNs
2416 //
2417 (p0)    fmpy.s0 FR_Input_X = FR_Input_X, f0
2418 (p0)    br.ret.sptk   b0 ;;
2419 }
2420 .endp cosl#
2421 ASM_SIZE_DIRECTIVE(cosl#)
2422
2423 //      Call int pi_by_2_reduce(double* x, double *y)
2424 //      for |arguments| >= 2**63
2425 //      Address to save r and c as double
2426 //
2427 //             sp+32  -> f0
2428 //      r45    sp+16  -> f0
2429 //      r44 -> sp     -> InputX
2430 //
2431
2432 .proc __libm_callout
2433 __libm_callout:
2434 L(SINCOSL_ARG_TOO_LARGE):
2435 .prologue
2436 { .mfi
2437         add   r45=-32,sp                        // Parameter: r address
2438         nop.f 0
2439 .save   ar.pfs,GR_SAVE_PFS
2440         mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
2441 }
2442 { .mfi
2443 .fframe 64
2444         add sp=-64,sp                           // Create new stack
2445         nop.f 0
2446         mov GR_SAVE_GP=gp                       // Save gp
2447 };;
2448 { .mmi
2449         stfe [r45] = f0,16                      // Clear Parameter r on stack
2450         add  r44 = 16,sp                        // Parameter x address
2451 .save   b0, GR_SAVE_B0
2452         mov GR_SAVE_B0=b0                       // Save b0
2453 };;
2454 .body
2455 { .mib
2456         stfe [r45] = f0,-16                     // Clear Parameter c on stack
2457         nop.i 0
2458         nop.b 0
2459 }
2460 { .mib
2461         stfe [r44] = FR_Input_X                 // Store Parameter x on stack
2462         nop.i 0
2463 (p0)    br.call.sptk b0=__libm_pi_by_2_reduce# ;;
2464 };;
2465 { .mii
2466 (p0)    ldfe  FR_Input_X =[r44],16
2467 //
2468 //      Get r and c off stack
2469 //
2470 (p0)    adds  GR_Table_Base1 = -16, GR_Table_Base1
2471 //
2472 //      Get r and c off stack
2473 //
2474 (p0)    add   GR_N_Inc = GR_Sin_or_Cos,r8 ;;
2475 }
2476 { .mmb
2477 (p0)    ldfe  FR_r =[r45],16
2478 //
2479 //      Get X off the stack
2480 //      Readjust Table ptr
2481 //
2482 (p0)    ldfs FR_Two_to_M3 = [GR_Table_Base1],4
2483         nop.b 999 ;;
2484 }
2485 { .mmb
2486 (p0)    ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1],0
2487 (p0)    ldfe  FR_c =[r45]
2488         nop.b 999 ;;
2489 }
2490 { .mfi
2491 .restore sp
2492         add   sp = 64,sp                       // Restore stack pointer
2493 (p0)    fcmp.lt.unc.s1  p6, p0 = FR_r, FR_Two_to_M3
2494         mov   b0 = GR_SAVE_B0                  // Restore return address
2495 };;
2496 { .mib
2497         mov   gp = GR_SAVE_GP                  // Restore gp
2498         mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
2499         nop.b 0
2500 };;
2501 { .mfi
2502         nop.m 999
2503 (p6)    fcmp.gt.unc.s1  p6, p0 = FR_r, FR_Neg_Two_to_M3
2504         nop.i 999 ;;
2505 }
2506 { .mib
2507         nop.m 999
2508         nop.i 999
2509 (p6)    br.cond.spnt L(SINCOSL_SMALL_R) ;;
2510 }
2511 { .mib
2512         nop.m 999
2513         nop.i 999
2514 (p0)    br.cond.sptk L(SINCOSL_NORMAL_R) ;;
2515 }
2516 .endp __libm_callout
2517 ASM_SIZE_DIRECTIVE(__libm_callout)
2518 .type   __libm_pi_by_2_reduce#,@function
2519 .global __libm_pi_by_2_reduce#