sysdeps/ia64/fpu/libm_reduce.S

   1 .file "libm_reduce.s"
   2
   3 // Copyright (C) 2000, 2001, Intel Corporation
   4 // All rights reserved.
   5 //
   6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
   7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
   8 //
   9 // Redistribution and use in source and binary forms, with or without
  10 // modification, are permitted provided that the following conditions are
  11 // met:
  12 //
  13 // * Redistributions of source code must retain the above copyright
  14 // notice, this list of conditions and the following disclaimer.
  15 //
  16 // * Redistributions in binary form must reproduce the above copyright
  17 // notice, this list of conditions and the following disclaimer in the
  18 // documentation and/or other materials provided with the distribution.
  19 //
  20 // * The name of Intel Corporation may not be used to endorse or promote
  21 // products derived from this software without specific prior written
  22 // permission.
  23 //
  24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35 //
  36 // Intel Corporation is the author of this code, and requests that all
  37 // problem reports or change requests be submitted to it directly at
  38 // http://developer.intel.com/opensource.
  39 //
  40 // History:  02/02/00 Initial Version
  41 //
  42 // *********************************************************************
  43 // *********************************************************************
  44 //
  45 // Function:   __libm_pi_by_2_reduce(x) returns r, c, and N where
  46 //             x = N * pi/2 + (r+c), where |r+c| <= pi/4.
  47 //             This function is not designed to be used by the
  48 //             general user.
  49 //
  50 // *********************************************************************
  51 //
  52 // Accuracy:       Returns double-precision values
  53 //
  54 // *********************************************************************
  55 //
  56 // Resources Used:
  57 //
  58 //    Floating-Point Registers: f32-f70
  59 //
  60 //    General Purpose Registers:
  61 //      r8  = return value N
  62 //      r32 = Address of x
  63 //      r33 = Address of where to place r and then c
  64 //      r34-r64
  65 //
  66 //    Predicate Registers:      p6-p14
  67 //
  68 // *********************************************************************
  69 //
  70 // IEEE Special Conditions:
  71 //
  72 //    No conditions should be raised.
  73 //
  74 // *********************************************************************
  75 //
  76 // I. Introduction
  77 // ===============
  78 //
  79 // For the forward trigonometric functions sin, cos, sincos, and
  80 // tan, the original algorithms for IA 64 handle arguments up to
  81 // 1 ulp less than 2^63 in magnitude. For double-extended arguments x,
  82 // |x| >= 2^63, this routine returns CASE, N and r_hi, r_lo where
  83 //
  84 //    x  is accurately approximated by
  85 //    2*K*pi  +  N * pi/2  +  r_hi + r_lo,  |r_hi+r_lo| <= pi/4.
  86 //    CASE = 1 or 2.
  87 //    CASE is 1 unless |r_hi + r_lo| < 2^(-33).
  88 //
  89 // The exact value of K is not determined, but that information is
  90 // not required in trigonometric function computations.
  91 //
  92 // We first assume the argument x in question satisfies x >= 2^(63).
  93 // In particular, it is positive. Negative x can be handled by symmetry:
  94 //
  95 //   -x  is accurately approximated by
  96 //         -2*K*pi  +  (-N) * pi/2  -  (r_hi + r_lo),  |r_hi+r_lo| <= pi/4.
  97 //
  98 // The idea of the reduction is that
  99 //
 100 //      x  *  2/pi   =   N_big  +  N  +  f,     |f| <= 1/2
 101 //
 102 // Moreover, for double extended x, |f| >= 2^(-75). (This is a
 103 // non-obvious fact found by enumeration using a special algorithm
 104 // involving continued fractions.) The algorithm described below
 105 // calculates N and an accurate approximation of f.
 106 //
 107 // Roughly speaking, an appropriate 256-bit (4 X 64) portion of
 108 // 2/pi is multiplied with x to give the desired information.
 109 //
 110 // II. Representation of 2/PI
 111 // ==========================
 112 //
 113 // The value of 2/pi in binary fixed-point is
 114 //
 115 //            .101000101111100110......
 116 //
 117 // We store 2/pi in a table, starting at the position corresponding
 118 // to bit position 63
 119 //
 120 //   bit position  63 62 ... 0   -1 -2 -3 -4 -5 -6 -7  ....  -16576
 121 //
 122 //              0  0  ... 0  . 1  0  1  0  0  0  1  ....    X
 123 //
 124 //                              ^
 125 //                           |__ implied binary pt
 126 //
 127 // III. Algorithm
 128 // ==============
 129 //
 130 // This describes the algorithm in the most natural way using
 131 // unsigned integer multiplication. The implementation section
 132 // describes how the integer arithmetic is simulated.
 133 //
 134 // STEP 0. Initialization
 135 // ----------------------
 136 //
 137 // Let the input argument x be
 138 //
 139 //     x = 2^m * ( 1. b_1 b_2 b_3 ... b_63 ),  63 <= m <= 16383.
 140 //
 141 // The first crucial step is to fetch four 64-bit portions of 2/pi.
 142 // To fulfill this goal, we calculate the bit position L of the
 143 // beginning of these 256-bit quantity by
 144 //
 145 //     L :=  62 - m.
 146 //
 147 // Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that
 148 // the storage of 2/pi is adequate.
 149 //
 150 // Fetch P_1, P_2, P_3, P_4 beginning at bit position L thus:
 151 //
 152 //      bit position  L  L-1  L-2    ...  L-63
 153 //
 154 //      P_1    =      b   b    b     ...    b
 155 //
 156 // each b can be 0 or 1. Also, let P_0 be the two bits corresponding to
 157 // bit positions L+2 and L+1. So, when each of the P_j is interpreted
 158 // with appropriate scaling, we have
 159 //
 160 //      2/pi  =  P_big  + P_0 + (P_1 + P_2 + P_3 + P_4)  +  P_small
 161 //
 162 // Note that P_big and P_small can be ignored. The reasons are as follows.
 163 // First, consider P_big. If P_big = 0, we can certainly ignore it.
 164 // Otherwise, P_big >= 2^(L+3). Now,
 165 //
 166 //        P_big * ulp(x) >=  2^(L+3) * 2^(m-63)
 167 //                    >=  2^(65-m  +  m-63 )
 168 //                    >=  2^2
 169 //
 170 // Thus, P_big * x is an integer of the form 4*K. So
 171 //
 172 //      x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2)
 173 //                + x*P_small*(pi/2).
 174 //
 175 // Hence, P_big*x corresponds to information that can be ignored for
 176 // trigonometric function evaluation.
 177 //
 178 // Next, we must estimate the effect of ignoring P_small. The absolute
 179 // error made by ignoring P_small is bounded by
 180 //
 181 //       |P_small * x|  <=  ulp(P_4) * x
 182 //                   <=  2^(L-255) * 2^(m+1)
 183 //                   <=  2^(62-m-255 + m + 1)
 184 //                   <=  2^(-192)
 185 //
 186 // Since for double-extended precision, x * 2/pi = integer + f,
 187 // 0.5 >= |f| >= 2^(-75), the relative error introduced by ignoring
 188 // P_small is bounded by 2^(-192+75) <= 2^(-117), which is acceptable.
 189 //
 190 // Further note that if x is split into x_hi + x_lo where x_lo is the
 191 // two bits corresponding to bit positions 2^(m-62) and 2^(m-63); then
 192 //
 193 //      P_0 * x_hi
 194 //
 195 // is also an integer of the form 4*K; and thus can also be ignored.
 196 // Let M := P_0 * x_lo which is a small integer. The main part of the
 197 // calculation is really the multiplication of x with the four pieces
 198 // P_1, P_2, P_3, and P_4.
 199 //
 200 // Unless the reduced argument is extremely small in magnitude, it
 201 // suffices to carry out the multiplication of x with P_1, P_2, and
 202 // P_3. x*P_4 will be carried out and added on as a correction only
 203 // when it is found to be needed. Note also that x*P_4 need not be
 204 // computed exactly. A straightforward multiplication suffices since
 205 // the rounding error thus produced would be bounded by 2^(-3*64),
 206 // that is 2^(-192) which is small enough as the reduced argument
 207 // is bounded from below by 2^(-75).
 208 //
 209 // Now that we have four 64-bit data representing 2/pi and a
 210 // 64-bit x. We first need to calculate a highly accurate product
 211 // of x and P_1, P_2, P_3. This is best understood as integer
 212 // multiplication.
 213 //
 214 //
 215 // STEP 1. Multiplication
 216 // ----------------------
 217 //
 218 //
 219 //                     ---------   ---------   ---------
 220 //                   |  P_1  |   |  P_2  |   |  P_3  |
 221 //                   ---------   ---------   ---------
 222 //
 223 //                                            ---------
 224 //            X                              |   X   |
 225 //                                           ---------
 226 //      ----------------------------------------------------
 227 //
 228 //                                 ---------   ---------
 229 //                               |  A_hi |   |  A_lo |
 230 //                               ---------   ---------
 231 //
 232 //
 233 //                    ---------   ---------
 234 //                   |  B_hi |   |  B_lo |
 235 //                   ---------   ---------
 236 //
 237 //
 238 //        ---------   ---------
 239 //       |  C_hi |   |  C_lo |
 240 //       ---------   ---------
 241 //
 242 //      ====================================================
 243 //       ---------   ---------   ---------   ---------
 244 //       |  S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
 245 //       ---------   ---------   ---------   ---------
 246 //
 247 //
 248 //
 249 // STEP 2. Get N and f
 250 // -------------------
 251 //
 252 // Conceptually, after the individual pieces S_0, S_1, ..., are obtained,
 253 // we have to sum them and obtain an integer part, N, and a fraction, f.
 254 // Here, |f| <= 1/2, and N is an integer. Note also that N need only to
 255 // be known modulo 2^k, k >= 2. In the case when |f| is small enough,
 256 // we would need to add in the value x*P_4.
 257 //
 258 //
 259 // STEP 3. Get reduced argument
 260 // ----------------------------
 261 //
 262 // The value f is not yet the reduced argument that we seek. The
 263 // equation
 264 //
 265 //      x * 2/pi = 4K  + N  + f
 266 //
 267 // says that
 268 //
 269 //         x   =  2*K*pi  + N * pi/2  +  f * (pi/2).
 270 //
 271 // Thus, the reduced argument is given by
 272 //
 273 //      reduced argument =  f * pi/2.
 274 //
 275 // This multiplication must be performed to extra precision.
 276 //
 277 // IV. Implementation
 278 // ==================
 279 //
 280 // Step 0. Initialization
 281 // ----------------------
 282 //
 283 // Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
 284 //
 285 // In memory, 2/pi is stored contiguously as
 286 //
 287 //  0x00000000 0x00000000 0xA2F....
 288 //                       ^
 289 //                       |__ implied binary bit
 290 //
 291 // Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. Thus
 292 // -16321 <= L <= -1. We fetch from memory 5 integer pieces of data.
 293 //
 294 // P_0 is the two bits corresponding to bit positions L+2 and L+1
 295 // P_1 is the 64-bit starting at bit position  L
 296 // P_2 is the 64-bit starting at bit position  L-64
 297 // P_3 is the 64-bit starting at bit position  L-128
 298 // P_4 is the 64-bit starting at bit position  L-192
 299 //
 300 // For example, if m = 63, P_0 would be 0 and P_1 would look like
 301 // 0xA2F...
 302 //
 303 // If m = 65, P_0 would be the two msb of 0xA, thus, P_0 is 10 in binary.
 304 // P_1 in binary would be  1 0 0 0 1 0 1 1 1 1 ....
 305 //
 306 // Step 1. Multiplication
 307 // ----------------------
 308 //
 309 // At this point, P_1, P_2, P_3, P_4 are integers. They are
 310 // supposed to be interpreted as
 311 //
 312 //  2^(L-63)     * P_1;
 313 //  2^(L-63-64)  * P_2;
 314 //  2^(L-63-128) * P_3;
 315 // 2^(L-63-192) * P_4;
 316 //
 317 // Since each of them need to be multiplied to x, we would scale
 318 // both x and the P_j's by some convenient factors: scale each
 319 // of P_j's up by 2^(63-L), and scale x down by 2^(L-63).
 320 //
 321 //   p_1 := fcvt.xf ( P_1 )
 322 //   p_2 := fcvt.xf ( P_2 ) * 2^(-64)
 323 //   p_3 := fcvt.xf ( P_3 ) * 2^(-128)
 324 //   p_4 := fcvt.xf ( P_4 ) * 2^(-192)
 325 //   x   := replace exponent of x by -1
 326 //          because 2^m    * 1.xxxx...xxx  * 2^(L-63)
 327 //          is      2^(-1) * 1.xxxx...xxx
 328 //
 329 // We are now faced with the task of computing the following
 330 //
 331 //                     ---------   ---------   ---------
 332 //                   |  P_1  |   |  P_2  |   |  P_3  |
 333 //                   ---------   ---------   ---------
 334 //
 335 //                                             ---------
 336 //            X                              |   X   |
 337 //                                           ---------
 338 //       ----------------------------------------------------
 339 //
 340 //                                 ---------   ---------
 341 //                               |  A_hi |   |  A_lo |
 342 //                               ---------   ---------
 343 //
 344 //                     ---------   ---------
 345 //                   |  B_hi |   |  B_lo |
 346 //                   ---------   ---------
 347 //
 348 //         ---------   ---------
 349 //       |  C_hi |   |  C_lo |
 350 //       ---------   ---------
 351 //
 352 //      ====================================================
 353 //       -----------   ---------   ---------   ---------
 354 //       |    S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
 355 //       -----------   ---------   ---------   ---------
 356 //        ^          ^
 357 //        |          |___ binary point
 358 //        |
 359 //        |___ possibly one more bit
 360 //
 361 // Let FPSR3 be set to round towards zero with widest precision
 362 // and exponent range. Unless an explicit FPSR is given,
 363 // round-to-nearest with widest precision and exponent range is
 364 // used.
 365 //
 366 // Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65).
 367 //
 368 // Tmp_C := fmpy.fpsr3( x, p_1 );
 369 // If Tmp_C >= sigma_C then
 370 //    C_hi := Tmp_C;
 371 //    C_lo := x*p_1 - C_hi ...fma, exact
 372 // Else
 373 //    C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C
 374 //                      ...subtraction is exact, regardless
 375 //                      ...of rounding direction
 376 //    C_lo := x*p_1 - C_hi ...fma, exact
 377 // End If
 378 //
 379 // Tmp_B := fmpy.fpsr3( x, p_2 );
 380 // If Tmp_B >= sigma_B then
 381 //    B_hi := Tmp_B;
 382 //    B_lo := x*p_2 - B_hi ...fma, exact
 383 // Else
 384 //    B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B
 385 //                      ...subtraction is exact, regardless
 386 //                      ...of rounding direction
 387 //    B_lo := x*p_2 - B_hi ...fma, exact
 388 // End If
 389 //
 390 // Tmp_A := fmpy.fpsr3( x, p_3 );
 391 // If Tmp_A >= sigma_A then
 392 //    A_hi := Tmp_A;
 393 //    A_lo := x*p_3 - A_hi ...fma, exact
 394 // Else
 395 //    A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A
 396 //                      ...subtraction is exact, regardless
 397 //                      ...of rounding direction
 398 //    A_lo := x*p_3 - A_hi ...fma, exact
 399 // End If
 400 //
 401 // ...Note that C_hi is of integer value. We need only the
 402 // ...last few bits. Thus we can ensure C_hi is never a big
 403 // ...integer, freeing us from overflow worry.
 404 //
 405 // Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
 406 // ...Tmp_C is the upper portion of C_hi
 407 // C_hi := C_hi - Tmp_C
 408 // ...0 <= C_hi < 2^7
 409 //
 410 // Step 2. Get N and f
 411 // -------------------
 412 //
 413 // At this point, we have all the components to obtain
 414 // S_0, S_1, S_2, S_3 and thus N and f. We start by adding
 415 // C_lo and B_hi. This sum together with C_hi gives a good
 416 // estimation of N and f.
 417 //
 418 // A := fadd.fpsr3( B_hi, C_lo )
 419 // B := max( B_hi, C_lo )
 420 // b := min( B_hi, C_lo )
 421 //
 422 // a := (B - A) + b     ...exact. Note that a is either 0
 423 //                      ...or 2^(-64).
 424 //
 425 // N := round_to_nearest_integer_value( A );
 426 // f := A - N;          ...exact because lsb(A) >= 2^(-64)
 427 //                      ...and |f| <= 1/2.
 428 //
 429 // f := f + a           ...exact because a is 0 or 2^(-64);
 430 //                      ...the msb of the sum is <= 1/2
 431 //                      ...lsb >= 2^(-64).
 432 //
 433 // N := convert to integer format( C_hi + N );
 434 // M := P_0 * x_lo;
 435 // N := N + M;
 436 //
 437 // If sgn_x == 1 (that is original x was negative)
 438 // N := 2^10 - N
 439 // ...this maintains N to be non-negative, but still
 440 // ...equivalent to the (negated N) mod 4.
 441 // End If
 442 //
 443 // If |f| >= 2^(-33)
 444 //
 445 // ...Case 1
 446 // CASE := 1
 447 // g := A_hi + B_lo;
 448 // s_hi := f + g;
 449 // s_lo := (f - s_hi) + g;
 450 //
 451 // Else
 452 //
 453 // ...Case 2
 454 // CASE := 2
 455 // A := fadd.fpsr3( A_hi, B_lo )
 456 // B := max( A_hi, B_lo )
 457 // b := min( A_hi, B_lo )
 458 //
 459 // a := (B - A) + b     ...exact. Note that a is either 0
 460 //                      ...or 2^(-128).
 461 //
 462 // f_hi := A + f;
 463 // f_lo := (f - f_hi) + A;
 464 // ...this is exact.
 465 // ...f-f_hi is exact because either |f| >= |A|, in which
 466 // ...case f-f_hi is clearly exact; or otherwise, 0<|f|<|A|
 467 // ...means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64).
 468 // ...If f = 2^(-64), f-f_hi involves cancellation and is
 469 // ...exact. If f = -2^(-64), then A + f is exact. Hence
 470 // ...f-f_hi is -A exactly, giving f_lo = 0.
 471 //
 472 // f_lo := f_lo + a;
 473 //
 474 // If |f| >= 2^(-50) then
 475 //    s_hi := f_hi;
 476 //    s_lo := f_lo;
 477 // Else
 478 //    f_lo := (f_lo + A_lo) + x*p_4
 479 //    s_hi := f_hi + f_lo
 480 //    s_lo := (f_hi - s_hi) + f_lo
 481 // End If
 482 //
 483 // End If
 484 //
 485 // Step 3. Get reduced argument
 486 // ----------------------------
 487 //
 488 // If sgn_x == 0 (that is original x is positive)
 489 //
 490 // D_hi := Pi_by_2_hi
 491 // D_lo := Pi_by_2_lo
 492 // ...load from table
 493 //
 494 // Else
 495 //
 496 // D_hi := neg_Pi_by_2_hi
 497 // D_lo := neg_Pi_by_2_lo
 498 // ...load from table
 499 // End If
 500 //
 501 // r_hi :=  s_hi*D_hi
 502 // r_lo :=  s_hi*D_hi - r_hi    ...fma
 503 // r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
 504 //
 505 // Return  CASE, N, r_hi, r_lo
 506 //
 507
 508 #include "libm_support.h"
 509
 510 FR_X       = f32   // x loaded from [GR_Address_of_Input]; the code then takes |x| and merges in sigma_B's exponent
     // The aliases below are named after the quantities of the algorithm
     // description in the file header (Section IV, Steps 0-3).
     // NOTE(review): except where a comment cites a visible instruction,
     // the roles are inferred from that naming -- confirm against the
     // routine body.
 511 FR_N       = f33
 512 FR_p_1     = f34   // p_1 .. p_4: see "p_j := fcvt.xf ( P_j ) * scale" in header Step 1
 513 FR_TWOM33  = f35   // presumably 2^(-33), the CASE 1 / CASE 2 threshold of Step 2 -- TODO confirm
 514 FR_TWOM50  = f36   // presumably 2^(-50), the threshold inside Case 2 -- TODO confirm
 515 FR_g       = f37
 516 FR_p_2     = f38
 517 FR_f       = f39
 518 FR_s_lo    = f40
 519 FR_p_3     = f41
 520 FR_f_abs   = f42
 521 FR_D_lo    = f43
 522 FR_p_4     = f44
 523 FR_D_hi    = f45
 524 FR_Tmp2_C  = f46
 525 FR_s_hi    = f47
 526 FR_sigma_A = f48   // 2^(-65): set with setf.exp from 0xFFBE
 527 FR_A       = f49
 528 FR_sigma_B = f50   // 2^(-1): set with setf.exp from 0xFFFE
 529 FR_B       = f51
 530 FR_sigma_C = f52   // 2^63 per the header; not set in the instructions visible here
 531 FR_b       = f53
 532 FR_ScaleP2 = f54
 533 FR_ScaleP3 = f55
 534 FR_ScaleP4 = f56
 535 FR_Tmp_A   = f57
 536 FR_Tmp_B   = f58
 537 FR_Tmp_C   = f59
 538 FR_A_hi    = f60
 539 FR_f_hi    = f61
 540 FR_r_hi    = f62
 541 FR_A_lo    = f63
 542 FR_B_hi    = f64
 543 FR_a       = f65
 544 FR_B_lo    = f66
 545 FR_f_lo    = f67
 546 FR_r_lo    = f68
 547 FR_C_hi    = f69
 548 FR_C_lo    = f70
 549
 550 GR_N       = r8    // return value N
 551 GR_Address_of_Input  = r32   // address of x
 552 GR_Address_of_Outputs = r33  // address where r and then c are placed
 553 GR_Exp_x   = r36   // getf.exp of x: sign bit and biased exponent
 554 GR_Temp    = r37   // bit 63 of GR_A, extracted only when LENGTH1 == 1
 555 GR_BIASL63 = r38   // constant 0x1003E, subtracted from GR_M to form GR_START
 556 GR_CASE    = r39   // CASE of the header description; not written in the instructions visible here
 557 GR_x_lo    = r40   // two least significant bits of x's significand
 558 GR_sgn_x   = r41   // bit 17 of GR_Exp_x (sign of x)
 559 GR_M       = r42   // low 17 bits of GR_Exp_x (biased exponent of x)
 560 GR_BASE    = r43   // address of Constants_Bits_of_2_by_pi, then advanced by 8*GR_SEGMENT
 561 GR_LENGTH1 = r44   // GR_START & 0x3F
 562 GR_LENGTH2 = r45   // 64 - GR_LENGTH1
 563 GR_ASUB    = r46   // 16-bit load used to form P_0 when LENGTH1 < 2
 564 GR_P_0     = r47   // P_0: two bits at positions L+2 and L+1
 565 GR_P_1     = r48   // P_1 .. P_4: 64-bit pieces of 2/pi starting at bit positions L, L-64, L-128, L-192
 566 GR_P_2     = r49
 567 GR_P_3     = r50
 568 GR_P_4     = r51
 569 GR_START   = r52   // GR_M - GR_BIASL63
 570 GR_SEGMENT = r53   // (GR_START >> 6) + 1: index of the first table word loaded
 571 GR_A       = r54   // GR_A .. GR_E: five consecutive 64-bit table words loaded from GR_BASE
 572 GR_B       = r55
 573 GR_C       = r56
 574 GR_D       = r57
 575 GR_E       = r58
 576 GR_TEMP1   = r59   // GR_A << GR_LENGTH1
 577 GR_TEMP2   = r60   // GR_B << GR_LENGTH1
 578 GR_TEMP3   = r61   // GR_C << GR_LENGTH1
 579 GR_TEMP4   = r62   // GR_D << GR_LENGTH1
 580 GR_TEMP5   = r63   // 0xFFFE, exponent field used to build sigma_B
 581 GR_TEMP6   = r64   // 0xFFBE, exponent field used to build sigma_A
 582
 583 .align 64
 584
 585 #ifdef _LIBC
 586 .rodata
 587 #else
 588 .data
 589 #endif
 590
 591 Constants_Bits_of_2_by_pi:   // 2/pi in binary fixed point, stored contiguously as 64-bit words
     // The routine indexes this table in 8-byte units (shladd GR_SEGMENT,3)
     // and loads five consecutive words A..E from the selected position.
 592 ASM_TYPE_DIRECTIVE(Constants_Bits_of_2_by_pi,@object)
 593 data8 0x0000000000000000,0xA2F9836E4E441529   // word 0 = bit positions 63..0 (all zero); the fraction starts at 0xA2F9...
 594 data8 0xFC2757D1F534DDC0,0xDB6295993C439041
 595 data8 0xFE5163ABDEBBC561,0xB7246E3A424DD2E0
 596 data8 0x06492EEA09D1921C,0xFE1DEB1CB129A73E
 597 data8 0xE88235F52EBB4484,0xE99C7026B45F7E41
 598 data8 0x3991D639835339F4,0x9C845F8BBDF9283B
 599 data8 0x1FF897FFDE05980F,0xEF2F118B5A0A6D1F
 600 data8 0x6D367ECF27CB09B7,0x4F463F669E5FEA2D
 601 data8 0x7527BAC7EBE5F17B,0x3D0739F78A5292EA
 602 data8 0x6BFB5FB11F8D5D08,0x56033046FC7B6BAB
 603 data8 0xF0CFBC209AF4361D,0xA9E391615EE61B08
 604 data8 0x6599855F14A06840,0x8DFFD8804D732731
 605 data8 0x06061556CA73A8C9,0x60E27BC08C6B47C4
 606 data8 0x19C367CDDCE8092A,0x8359C4768B961CA6
 607 data8 0xDDAF44D15719053E,0xA5FF07053F7E33E8
 608 data8 0x32C2DE4F98327DBB,0xC33D26EF6B1E5EF8
 609 data8 0x9F3A1F35CAF27F1D,0x87F121907C7C246A
 610 data8 0xFA6ED5772D30433B,0x15C614B59D19C3C2
 611 data8 0xC4AD414D2C5D000C,0x467D862D71E39AC6
 612 data8 0x9B0062337CD2B497,0xA7B4D55537F63ED7
 613 data8 0x1810A3FC764D2A9D,0x64ABD770F87C6357
 614 data8 0xB07AE715175649C0,0xD9D63B3884A7CB23
 615 data8 0x24778AD623545AB9,0x1F001B0AF1DFCE19
 616 data8 0xFF319F6A1E666157,0x9947FBACD87F7EB7
 617 data8 0x652289E83260BFE6,0xCDC4EF09366CD43F
 618 data8 0x5DD7DE16DE3B5892,0x9BDE2822D2E88628
 619 data8 0x4D58E232CAC616E3,0x08CB7DE050C017A7
 620 data8 0x1DF35BE01834132E,0x6212830148835B8E
 621 data8 0xF57FB0ADF2E91E43,0x4A48D36710D8DDAA
 622 data8 0x425FAECE616AA428,0x0AB499D3F2A6067F
 623 data8 0x775C83C2A3883C61,0x78738A5A8CAFBDD7
 624 data8 0x6F63A62DCBBFF4EF,0x818D67C12645CA55
 625 data8 0x36D9CAD2A8288D61,0xC277C9121426049B
 626 data8 0x4612C459C444C5C8,0x91B24DF31700AD43
 627 data8 0xD4E5492910D5FDFC,0xBE00CC941EEECE70
 628 data8 0xF53E1380F1ECC3E7,0xB328F8C79405933E
 629 data8 0x71C1B3092EF3450B,0x9C12887B20AB9FB5
 630 data8 0x2EC292472F327B6D,0x550C90A7721FE76B
 631 data8 0x96CB314A1679E279,0x4189DFF49794E884
 632 data8 0xE6E29731996BED88,0x365F5F0EFDBBB49A
 633 data8 0x486CA46742727132,0x5D8DB8159F09E5BC
 634 data8 0x25318D3974F71C05,0x30010C0D68084B58
 635 data8 0xEE2C90AA4702E774,0x24D6BDA67DF77248
 636 data8 0x6EEF169FA6948EF6,0x91B45153D1F20ACF
 637 data8 0x3398207E4BF56863,0xB25F3EDD035D407F
 638 data8 0x8985295255C06437,0x10D86D324832754C
 639 data8 0x5BD4714E6E5445C1,0x090B69F52AD56614
 640 data8 0x9D072750045DDB3B,0xB4C576EA17F9877D
 641 data8 0x6B49BA271D296996,0xACCCC65414AD6AE2
 642 data8 0x9089D98850722CBE,0xA4049407777030F3
 643 data8 0x27FC00A871EA49C2,0x663DE06483DD9797
 644 data8 0x3FA3FD94438C860D,0xDE41319D39928C70
 645 data8 0xDDE7B7173BDF082B,0x3715A0805C93805A
 646 data8 0x921110D8E80FAF80,0x6C4BFFDB0F903876
 647 data8 0x185915A562BBCB61,0xB989C7BD401004F2
 648 data8 0xD2277549F6B6EBBB,0x22DBAA140A2F2689
 649 data8 0x768364333B091A94,0x0EAA3A51C2A31DAE
 650 data8 0xEDAF12265C4DC26D,0x9C7A2D9756C0833F
 651 data8 0x03F6F0098C402B99,0x316D07B43915200C
 652 data8 0x5BC3D8C492F54BAD,0xC6A5CA4ECD37A736
 653 data8 0xA9E69492AB6842DD,0xDE6319EF8C76528B
 654 data8 0x6837DBFCABA1AE31,0x15DFA1AE00DAFB0C
 655 data8 0x664D64B705ED3065,0x29BF56573AFF47B9
 656 data8 0xF96AF3BE75DF9328,0x3080ABF68C6615CB
 657 data8 0x040622FA1DE4D9A4,0xB33D8F1B5709CD36
 658 data8 0xE9424EA4BE13B523,0x331AAAF0A8654FA5
 659 data8 0xC1D20F3F0BCD785B,0x76F923048B7B7217
 660 data8 0x8953A6C6E26E6F00,0xEBEF584A9BB7DAC4
 661 data8 0xBA66AACFCF761D02,0xD12DF1B1C1998C77
 662 data8 0xADC3DA4886A05DF7,0xF480C62FF0AC9AEC
 663 data8 0xDDBC5C3F6DDED01F,0xC790B6DB2A3A25A3
 664 data8 0x9AAF009353AD0457,0xB6B42D297E804BA7
 665 data8 0x07DA0EAA76A1597B,0x2A12162DB7DCFDE5
 666 data8 0xFAFEDB89FDBE896C,0x76E4FCA90670803E
 667 data8 0x156E85FF87FD073E,0x2833676186182AEA
 668 data8 0xBD4DAFE7B36E6D8F,0x3967955BBF3148D7
 669 data8 0x8416DF30432DC735,0x6125CE70C9B8CB30
 670 data8 0xFD6CBFA200A4E46C,0x05A0DD5A476F21D2
 671 data8 0x1262845CB9496170,0xE0566B0152993755
 672 data8 0x50B7D51EC4F1335F,0x6E13E4305DA92E85
 673 data8 0xC3B21D3632A1A4B7,0x08D4B1EA21F716E4
 674 data8 0x698F77FF2780030C,0x2D408DA0CD4F99A5
 675 data8 0x20D3A2B30A5D2F42,0xF9B4CBDA11D0BE7D
 676 data8 0xC1DB9BBD17AB81A2,0xCA5C6A0817552E55
 677 data8 0x0027F0147F8607E1,0x640B148D4196DEBE
 678 data8 0x872AFDDAB6256B34,0x897BFEF3059EBFB9
 679 data8 0x4F6A68A82A4A5AC4,0x4FBCF82D985AD795
 680 data8 0xC7F48D4D0DA63A20,0x5F57A4B13F149538
 681 data8 0x800120CC86DD71B6,0xDEC9F560BF11654D
 682 data8 0x6B0701ACB08CD0C0,0xB24855510EFB1EC3
 683 data8 0x72953B06A33540C0,0x7BDC06CC45E0FA29
 684 data8 0x4EC8CAD641F3E8DE,0x647CD8649B31BED9
 685 data8 0xC397A4D45877C5E3,0x6913DAF03C3ABA46
 686 data8 0x18465F7555F5BDD2,0xC6926E5D2EACED44
 687 data8 0x0E423E1C87C461E9,0xFD29F3D6E7CA7C22
 688 data8 0x35916FC5E0088DD7,0xFFE26A6EC6FDB0C1
 689 data8 0x0893745D7CB2AD6B,0x9D6ECD7B723E6A11
 690 data8 0xC6A9CFF7DF7329BA,0xC9B55100B70DB2E2
 691 data8 0x24BA74607DE58AD8,0x742C150D0C188194
 692 data8 0x667E162901767A9F,0xBEFDFDEF4556367E
 693 data8 0xD913D9ECB9BA8BFC,0x97C427A831C36EF1
 694 data8 0x36C59456A8D8B5A8,0xB40ECCCF2D891234
 695 data8 0x576F89562CE3CE99,0xB920D6AA5E6B9C2A
 696 data8 0x3ECC5F114A0BFDFB,0xF4E16D3B8E2C86E2
 697 data8 0x84D4E9A9B4FCD1EE,0xEFC9352E61392F44
 698 data8 0x2138C8D91B0AFC81,0x6A4AFBD81C2F84B4
 699 data8 0x538C994ECC2254DC,0x552AD6C6C096190B
 700 data8 0xB8701A649569605A,0x26EE523F0F117F11
 701 data8 0xB5F4F5CBFC2DBC34,0xEEBC34CC5DE8605E
 702 data8 0xDD9B8E67EF3392B8,0x17C99B5861BC57E1
 703 data8 0xC68351103ED84871,0xDDDD1C2DA118AF46
 704 data8 0x2C21D7F359987AD9,0xC0549EFA864FFC06
 705 data8 0x56AE79E536228922,0xAD38DC9367AAE855
 706 data8 0x3826829BE7CAA40D,0x51B133990ED7A948
 707 data8 0x0569F0B265A7887F,0x974C8836D1F9B392
 708 data8 0x214A827B21CF98DC,0x9F405547DC3A74E1
 709 data8 0x42EB67DF9DFE5FD4,0x5EA4677B7AACBAA2
 710 data8 0xF65523882B55BA41,0x086E59862A218347
 711 data8 0x39E6E389D49EE540,0xFB49E956FFCA0F1C
 712 data8 0x8A59C52BFA94C5C1,0xD3CFC50FAE5ADB86
 713 data8 0xC5476243853B8621,0x94792C8761107B4C
 714 data8 0x2A1A2C8012BF4390,0x2688893C78E4C4A8
 715 data8 0x7BDBE5C23AC4EAF4,0x268A67F7BF920D2B
 716 data8 0xA365B1933D0B7CBD,0xDC51A463DD27DDE1
 717 data8 0x6919949A9529A828,0xCE68B4ED09209F44
 718 data8 0xCA984E638270237C,0x7E32B90F8EF5A7E7
 719 data8 0x561408F1212A9DB5,0x4D7E6F5119A5ABF9
 720 data8 0xB5D6DF8261DD9602,0x36169F3AC4A1A283
 721 data8 0x6DED727A8D39A9B8,0x825C326B5B2746ED
 722 data8 0x34007700D255F4FC,0x4D59018071E0E13F
 723 data8 0x89B295F364A8F1AE,0xA74B38FC4CEAB2BB
 724 ASM_SIZE_DIRECTIVE(Constants_Bits_of_2_by_pi)
 725
 726 Constants_Bits_of_pi_by_2:   // two 16-byte entries, each written as four 32-bit words
     // NOTE(review): presumably the D_hi / D_lo (pi/2 high and low parts)
     // that Step 3 of the header says are "loaded from table"; the loads
     // are not in the instructions visible here -- confirm.
     // NOTE(review): the word order looks like {significand low 32 bits,
     // significand high 32 bits, sign/exponent, padding} of an 80-bit
     // extended value -- confirm against the load instruction used.
 727 ASM_TYPE_DIRECTIVE(Constants_Bits_of_pi_by_2,@object)
 728 data4 0x2168C234,0xC90FDAA2,0x00003FFF,0x00000000   // third word 0x3FFF
 729 data4 0x80DC1CD1,0xC4C6628B,0x00003FBF,0x00000000   // third word 0x3FBF = 0x3FFF - 64
 730 ASM_SIZE_DIRECTIVE(Constants_Bits_of_pi_by_2)
 731
 732 .section .text
 733 .proc __libm_pi_by_2_reduce#
 734 .global __libm_pi_by_2_reduce#
 735 .align 64
 736
 737 __libm_pi_by_2_reduce:
 738
 739 //    X is at the address in Address_of_Input
 740 //    Place the two-piece result at the address in Address_of_Outputs
 741 //    r followed by c
 742 //    N is returned
 743
 744 { .mmf
 745 alloc  r34 = ar.pfs,2,34,0,0
 746 (p0)  ldfe  FR_X = [GR_Address_of_Input]
 747 (p0)  fsetc.s3 0x00,0x7F ;;
 748 }
 749 { .mlx
 750         nop.m 999
 751 (p0)  movl GR_BIASL63 = 0x1003E
 752 }
 753 ;;
 754
 755
 756 //    L         -1-2-3-4
 757 //    0 0 0 0 0. 1 0 1 0
 758 //    M          0 1 2 .... 63, 64 65 ... 127, 128
 759 //     ---------------------------------------------
 760 //    Segment 0.        1     ,      2       ,    3
 761 //    START = M - 63                        M = 128 becomes 65
 762 //    LENGTH1  = START & 0x3F               65 become position 1
 763 //    SEGMENT  = shr(START,6) + 1      0 maps to 1,   64 maps to 2,
 764 //    LENGTH2  = 64 - LENGTH1
 765 //    Address_BASE = shladd(SEGMENT,3) + BASE
 766
 767
 768
 769 { .mmi
 770       nop.m 999
 771 (p0)  addl           GR_BASE   = @ltoff(Constants_Bits_of_2_by_pi#), gp
 772       nop.i 999
 773 }
 774 ;;
 775
 776 { .mmi
 777       ld8 GR_BASE = [GR_BASE]
 778       nop.m 999
 779       nop.i 999
 780 }
 781 ;;
 782
 783
 784 { .mlx
 785         nop.m 999
 786 (p0)  movl GR_TEMP5 = 0x000000000000FFFE
 787 }
 788 { .mmi
 789         nop.m 999 ;;
 790 (p0)  setf.exp FR_sigma_B = GR_TEMP5
 791         nop.i 999
 792 }
 793 { .mlx
 794         nop.m 999
 795 (p0)  movl GR_TEMP6 = 0x000000000000FFBE ;;
 796 }
 797 //    Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65).
 798 { .mfi
 799 (p0)  setf.exp FR_sigma_A = GR_TEMP6
 800         nop.f 999
 801         nop.i 999 ;;
 802 }
 803 //    Special Code for testing DE arguments
 804 //    (p0)  movl GR_BIASL63 = 0x0000000000013FFE
 805 //    (p0)  movl GR_x_lo = 0xFFFFFFFFFFFFFFFF
 806 //    (p0)  setf.exp FR_X = GR_BIASL63
 807 //    (p0)  setf.sig FR_ScaleP3 = GR_x_lo
 808 //    (p0)  fmerge.se FR_X = FR_X,FR_ScaleP3
 809 //    Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
 810 //    2/pi is stored contiguously as
 811 //    0x00000000 0x00000000.0xA2F....
 812 //    M = EXP - BIAS  ( M >= 63)
 813 //    Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m.
 814 //    Thus -16321 <= L <= -1.
 815 { .mmf
 816 (p0)  getf.exp GR_Exp_x = FR_X
 817 (p0)  getf.sig GR_x_lo = FR_X
 818 (p0)  fabs FR_X = FR_X ;;
 819 }
 820 { .mii
 821 (p0)  and  GR_x_lo = 0x03,GR_x_lo
 822 (p0)  extr.u GR_M = GR_Exp_x,0,17 ;;
 823 (p0)  sub  GR_START = GR_M,GR_BIASL63
 824 }
 825 { .mmi
 826         nop.m 999 ;;
 827 (p0)  and  GR_LENGTH1 = 0x3F,GR_START
 828 (p0)  shr.u  GR_SEGMENT = GR_START,6
 829 }
 830 { .mmi
 831         nop.m 999 ;;
 832 (p0)  add  GR_SEGMENT = 0x1,GR_SEGMENT
 833 (p0)  sub  GR_LENGTH2 = 0x40,GR_LENGTH1
 834 }
 835 //    P_0 is the two bits corresponding to bit positions L+2 and L+1
 836 //    P_1 is the 64-bit starting at bit position  L
 837 //    P_2 is the 64-bit starting at bit position  L-64
 838 //    P_3 is the 64-bit starting at bit position  L-128
 839 //    P_4 is the 64-bit starting at bit position  L-192
 840 //    P_1 is made up of Alo and Bhi
 841 //    P_1 = deposit Alo, position 0, length2  into P_1,position length1
 842 //          deposit Bhi, position length2, length1 into P_1, position 0
 843 //    P_2 is made up of Blo and Chi
 844 //    P_2 = deposit Blo, position 0, length2  into P_2, position length1
 845 //          deposit Chi, position length2, length1 into P_2, position 0
 846 //    P_3 is made up of Clo and Dhi
 847 //    P_3 = deposit Clo, position 0, length2  into P_3, position length1
 848 //          deposit Dhi, position length2, length1 into P_3, position 0
 849 //    P_4 is made up of Dlo and Ehi
 850 //    P_4 = deposit Dlo, position 0, length2  into P_4, position length1
 851 //          deposit Ehi, position length2, length1 into P_4, position 0
 852 { .mmi
 853 (p0)  cmp.le.unc p6,p7 = 0x2,GR_LENGTH1 ;;
 854 (p0)  shladd GR_BASE = GR_SEGMENT,3,GR_BASE
 855 (p7)  cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1 ;;
 856 }
 857 { .mmi
 858         nop.m 999
 859 //    ld_64 A at Base and increment Base by 8
 860 //    ld_64 B at Base and increment Base by 8
 861 //    ld_64 C at Base and increment Base by 8
 862 //    ld_64 D at Base and increment Base by 8
 863 //    ld_64 E at Base and increment Base by 8
 864 //                                          A/B/C/D
 865 //                                    ---------------------
 866 //    A, B, C, D, and E look like    | length1 | length2   |
 867 //                                    ---------------------
 868 //                                       hi        lo
 869 (p0)  ld8 GR_A = [GR_BASE],8
 870 (p0)  extr.u GR_sgn_x = GR_Exp_x,17,1 ;;
 871 }
 872 { .mmf
 873         nop.m 999
 874 (p0)  ld8 GR_B = [GR_BASE],8
 875 (p0)  fmerge.se FR_X = FR_sigma_B,FR_X ;;
 876 }
 877 { .mii
 878 (p0)  ld8 GR_C = [GR_BASE],8
 879 (p8)  extr.u GR_Temp = GR_A,63,1 ;;
 880 (p0)  shl GR_TEMP1 = GR_A,GR_LENGTH1
 881 }
 882 { .mii
 883 (p0)  ld8 GR_D = [GR_BASE],8
 884 //    If length1 >= 2,
 885 //       P_0 = deposit Ahi, position length2, 2 bit into P_0 at position 0.
 886 (p6)     shr.u GR_P_0 = GR_A,GR_LENGTH2 ;;
 887 (p0)  shl GR_TEMP2 = GR_B,GR_LENGTH1
 888 }
 889 { .mii
 890 (p0)  ld8 GR_E = [GR_BASE],-40
 891 (p0)  shr.u GR_P_1 = GR_B,GR_LENGTH2 ;;
 892 (p0)  shr.u GR_P_2 = GR_C,GR_LENGTH2
 893 }
 894 //    Else
 895 //       Load 16 bit of ASUB from (Base_Address_of_A - 2)
 896 //       P_0 = ASUB & 0x3
 897 //       If length1 == 0,
 898 //          P_0 complete
 899 //       Else
 900 //          Deposit element 63 from Ahi and place in element 0 of P_0.
 901 //       Endif
 902 //    Endif
 903 { .mii
 904 (p7)  ld2 GR_ASUB = [GR_BASE],8
 905 (p0)  shl GR_TEMP3 = GR_C,GR_LENGTH1 ;;
 906 (p0)  shl GR_TEMP4 = GR_D,GR_LENGTH1
 907 }
 908 { .mii
 909         nop.m 999
 910 (p0)  shr.u GR_P_3 = GR_D,GR_LENGTH2 ;;
 911 (p0)  shr.u GR_P_4 = GR_E,GR_LENGTH2
 912 }
 913 { .mii
 914 (p7)  and GR_P_0 = 0x03,GR_ASUB
 915 (p6)     and GR_P_0 = 0x03,GR_P_0 ;;
 916 (p0)  or GR_P_1 = GR_P_1,GR_TEMP1
 917 }
 918 { .mmi
 919 (p8)  and GR_P_0 = 0x1,GR_P_0 ;;
 920 (p0)  or GR_P_2 = GR_P_2,GR_TEMP2
 921 (p8)  shl GR_P_0 = GR_P_0,0x1 ;;
 922 }
 923 { .mii
 924         nop.m 999
 925 (p0)  or GR_P_3 = GR_P_3,GR_TEMP3
 926 (p8)  or GR_P_0 = GR_P_0,GR_Temp
 927 }
 928 { .mmi
 929 (p0)  setf.sig FR_p_1 = GR_P_1 ;;
 930 (p0)  setf.sig FR_p_2 = GR_P_2
 931 (p0)  or GR_P_4 = GR_P_4,GR_TEMP4 ;;
 932 }
 933 { .mmi
 934         nop.m 999 ;;
 935 (p0)  setf.sig FR_p_3 = GR_P_3
 936 (p0)  pmpy2.r GR_M = GR_P_0,GR_x_lo
 937 }
 938 { .mlx
 939 (p0)  setf.sig FR_p_4 = GR_P_4
 940 //    P_1, P_2, P_3, P_4 are integers. They should be
 941 //    2^(L-63)     * P_1;
 942 //    2^(L-63-64)  * P_2;
 943 //    2^(L-63-128) * P_3;
 944 //    2^(L-63-192) * P_4;
 945 //    Since each of them need to be multiplied to x, we would scale
 946 //    both x and the P_j's by some convenient factors: scale each
 947 //    of P_j's up by 2^(63-L), and scale x down by 2^(L-63).
 948 //    p_1 := fcvt.xf ( P_1 )
 949 //    p_2 := fcvt.xf ( P_2 ) * 2^(-64)
 950 //    p_3 := fcvt.xf ( P_3 ) * 2^(-128)
 951 //    p_4 := fcvt.xf ( P_4 ) * 2^(-192)
 952 //    x= Set x's exp to -1 because 2^m*1.x...x *2^(L-63)=2^(-1)*1.x...xxx
 953 //             ---------   ---------   ---------
 954 //             |  P_1  |   |  P_2  |   |  P_3  |
 955 //             ---------   ---------   ---------
 956 //                                           ---------
 957 //            X                              |   X   |
 958 //                                           ---------
 959 //      ----------------------------------------------------
 960 //                               ---------   ---------
 961 //                               |  A_hi |   |  A_lo |
 962 //                               ---------   ---------
 963 //                   ---------   ---------
 964 //                   |  B_hi |   |  B_lo |
 965 //                   ---------   ---------
 966 //       ---------   ---------
 967 //       |  C_hi |   |  C_lo |
 968 //       ---------   ---------
 969 //     ====================================================
 970 //    -----------   ---------   ---------   ---------
 971 //    |    S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
 972 //    -----------   ---------   ---------   ---------
 973 //    |            |___ binary point
 974 //    |___ possibly one more bit
 975 //
 976 //    Let FPSR3 be set to round towards zero with widest precision
 977 //    and exponent range. Unless an explicit FPSR is given,
 978 //    round-to-nearest with widest precision and exponent range is
 979 //    used.
 980 (p0)  movl GR_TEMP1 = 0x000000000000FFBF
 981 }
 982 { .mmi
 983         nop.m 999 ;;
 984 (p0)  setf.exp FR_ScaleP2 = GR_TEMP1
 985         nop.i 999
 986 }
 987 { .mlx
 988         nop.m 999
 989 (p0)  movl GR_TEMP4 = 0x000000000001003E
 990 }
 991 { .mmi
 992         nop.m 999 ;;
 993 (p0)  setf.exp FR_sigma_C = GR_TEMP4
 994         nop.i 999
 995 }
 996 { .mlx
 997         nop.m 999
 998 (p0)  movl GR_TEMP2 = 0x000000000000FF7F ;;
 999 }
1000 { .mmf
1001         nop.m 999
1002 (p0)  setf.exp FR_ScaleP3 = GR_TEMP2
1003 (p0)  fcvt.xuf.s1 FR_p_1 = FR_p_1 ;;
1004 }
1005 { .mfi
1006         nop.m 999
1007 (p0)  fcvt.xuf.s1 FR_p_2 = FR_p_2
1008         nop.i 999
1009 }
1010 { .mlx
1011         nop.m 999
1012 (p0)  movl GR_Temp = 0x000000000000FFDE ;;
1013 }
1014 { .mmf
1015         nop.m 999
1016 (p0)  setf.exp FR_TWOM33 = GR_Temp
1017 (p0)  fcvt.xuf.s1 FR_p_3 = FR_p_3 ;;
1018 }
1019 { .mfi
1020         nop.m 999
1021 (p0)  fcvt.xuf.s1 FR_p_4 = FR_p_4
1022         nop.i 999 ;;
1023 }
1024 { .mfi
1025         nop.m 999
1026 //    Tmp_C := fmpy.fpsr3( x, p_1 );
1027 //    Tmp_B := fmpy.fpsr3( x, p_2 );
1028 //    Tmp_A := fmpy.fpsr3( x, p_3 );
1029 //    If Tmp_C >= sigma_C then
1030 //      C_hi := Tmp_C;
1031 //      C_lo := x*p_1 - C_hi ...fma, exact
1032 //    Else
1033 //      C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C
1034 //      C_lo := x*p_1 - C_hi ...fma, exact
1035 //    End If
1036 //    If Tmp_B >= sigma_B then
1037 //      B_hi := Tmp_B;
1038 //      B_lo := x*p_2 - B_hi ...fma, exact
1039 //    Else
1040 //      B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B
1041 //      B_lo := x*p_2 - B_hi ...fma, exact
1042 //    End If
1043 //    If Tmp_A >= sigma_A then
1044 //      A_hi := Tmp_A;
1045 //      A_lo := x*p_3 - A_hi ...fma, exact
1046 //    Else
1047 //      A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A
1048 //      Exact, regardless ...of rounding direction
1049 //      A_lo := x*p_3 - A_hi ...fma, exact
1050 //    Endif
1051 (p0)  fmpy.s3 FR_Tmp_C = FR_X,FR_p_1
1052         nop.i 999 ;;
1053 }
1054 { .mfi
1055         nop.m 999
1056 (p0)  fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2
1057         nop.i 999
1058 }
1059 { .mlx
1060         nop.m 999
1061 (p0)  movl GR_Temp = 0x0000000000000400
1062 }
1063 { .mlx
1064         nop.m 999
1065 (p0)  movl GR_TEMP3 = 0x000000000000FF3F ;;
1066 }
1067 { .mmf
1068         nop.m 999
1069 (p0)  setf.exp FR_ScaleP4 = GR_TEMP3
1070 (p0)  fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3 ;;
1071 }
1072 { .mlx
1073         nop.m 999
1074 (p0)  movl GR_TEMP4 = 0x0000000000010045 ;;
1075 }
1076 { .mmf
1077         nop.m 999
1078 (p0)  setf.exp FR_Tmp2_C = GR_TEMP4
1079 (p0)  fmpy.s3 FR_Tmp_B = FR_X,FR_p_2 ;;
1080 }
1081 { .mfi
1082         nop.m 999
1083 (p0)  fcmp.ge.unc.s1 p12,  p9 = FR_Tmp_C,FR_sigma_C
1084         nop.i 999 ;;
1085 }
1086 { .mfi
1087         nop.m 999
1088 (p0)  fmpy.s3 FR_Tmp_A = FR_X,FR_p_3
1089         nop.i 999 ;;
1090 }
1091 { .mfi
1092         nop.m 999
1093 (p12) mov FR_C_hi = FR_Tmp_C
1094         nop.i 999 ;;
1095 }
1096 { .mfi
1097 (p0)  addl           GR_BASE   = @ltoff(Constants_Bits_of_pi_by_2#), gp
1098 (p9)  fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C
1099         nop.i 999
1100 }
1101 ;;
1102
1103
1104
1105 //   End If
1106 //   Step 3. Get reduced argument
1107 //   If sgn_x == 0 (that is original x is positive)
1108 //      D_hi := Pi_by_2_hi
1109 //      D_lo := Pi_by_2_lo
1110 //      Load from table
1111 //   Else
1112 //      D_hi := neg_Pi_by_2_hi
1113 //      D_lo := neg_Pi_by_2_lo
1114 //      Load from table
1115 //   End If
1116
1117
1118 { .mmi
1119       ld8 GR_BASE = [GR_BASE]
1120       nop.m 999
1121       nop.i 999
1122 }
1123 ;;
1124
1125
1126 { .mfi
1127 (p0) ldfe FR_D_hi = [GR_BASE],16
1128 (p0)  fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4
1129         nop.i 999 ;;
1130 }
1131 { .mfi
1132 (p0) ldfe FR_D_lo = [GR_BASE],0
1133 (p0)  fcmp.ge.unc.s1 p13, p10 = FR_Tmp_B,FR_sigma_B
1134         nop.i 999 ;;
1135 }
1136 { .mfi
1137         nop.m 999
1138 (p13) mov FR_B_hi = FR_Tmp_B
1139         nop.i 999
1140 }
1141 { .mfi
1142         nop.m 999
1143 (p12) fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi
1144         nop.i 999 ;;
1145 }
1146 { .mfi
1147         nop.m 999
1148 (p10) fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B
1149         nop.i 999
1150 }
1151 { .mfi
1152         nop.m 999
1153 (p9)  fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C
1154         nop.i 999 ;;
1155 }
1156 { .mfi
1157         nop.m 999
1158 (p0)  fcmp.ge.unc.s1 p14, p11 = FR_Tmp_A,FR_sigma_A
1159         nop.i 999 ;;
1160 }
1161 { .mfi
1162         nop.m 999
1163 (p14) mov FR_A_hi = FR_Tmp_A
1164         nop.i 999 ;;
1165 }
1166 { .mfi
1167         nop.m 999
1168 (p11) fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A
1169         nop.i 999 ;;
1170 }
1171 { .mfi
1172         nop.m 999
1173 (p9)  fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi
1174 (p0)  cmp.eq.unc p12,p9 = 0x1,GR_sgn_x
1175 }
1176 { .mfi
1177         nop.m 999
1178 (p13) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi
1179         nop.i 999 ;;
1180 }
1181 { .mfi
1182         nop.m 999
1183 (p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B
1184         nop.i 999
1185 }
1186 { .mfi
1187         nop.m 999
1188 //    Note that C_hi is of integer value. We need only the
1189 //    last few bits. Thus we can ensure C_hi is never a big
1190 //    integer, freeing us from overflow worry.
1191 //    Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
1192 //    Tmp_C is the upper portion of C_hi
1193 (p0)  fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C
1194         nop.i 999 ;;
1195 }
1196 { .mfi
1197         nop.m 999
1198 (p14) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi
1199         nop.i 999
1200 }
1201 { .mfi
1202         nop.m 999
1203 (p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A
1204         nop.i 999 ;;
1205 }
1206 { .mfi
1207         nop.m 999
1208 //    *******************
1209 //    Step 2. Get N and f
1210 //    *******************
1211 //    We have all the components to obtain
1212 //    S_0, S_1, S_2, S_3 and thus N and f. We start by adding
1213 //    C_lo and B_hi. This sum together with C_hi estimates
1214 //    N and f well.
1215 //    A := fadd.fpsr3( B_hi, C_lo )
1216 //    B := max( B_hi, C_lo )
1217 //    b := min( B_hi, C_lo )
1218 (p0)  fadd.s3 FR_A = FR_B_hi,FR_C_lo
1219         nop.i 999
1220 }
1221 { .mfi
1222         nop.m 999
1223 (p10) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi
1224         nop.i 999 ;;
1225 }
1226 { .mfi
1227         nop.m 999
1228 (p0)  fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C
1229         nop.i 999 ;;
1230 }
1231 { .mfi
1232         nop.m 999
1233 (p0)  fmax.s1 FR_B = FR_B_hi,FR_C_lo
1234         nop.i 999 ;;
1235 }
1236 { .mfi
1237         nop.m 999
1238 (p0)  fmin.s1 FR_b = FR_B_hi,FR_C_lo
1239         nop.i 999
1240 }
1241 { .mfi
1242         nop.m 999
1243 (p11) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi
1244         nop.i 999 ;;
1245 }
1246 { .mfi
1247         nop.m 999
1248 //    N := round_to_nearest_integer_value( A );
1249 (p0)  fcvt.fx.s1 FR_N = FR_A
1250         nop.i 999 ;;
1251 }
1252 { .mfi
1253         nop.m 999
1254 //    C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7
1255 (p0)  fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C
1256         nop.i 999 ;;
1257 }
1258 { .mfi
1259         nop.m 999
1260 //    a := (B - A) + b: Exact - note that a is either 0 or 2^(-64).
1261 (p0)  fsub.s1 FR_a = FR_B,FR_A
1262         nop.i 999 ;;
1263 }
1264 { .mfi
1265         nop.m 999
1266 //    f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2.
1267 (p0)  fnorm.s1 FR_N = FR_N
1268         nop.i 999
1269 }
1270 { .mfi
1271         nop.m 999
1272 (p0)  fadd.s1 FR_a = FR_a,FR_b
1273         nop.i 999 ;;
1274 }
1275 { .mfi
1276         nop.m 999
1277 (p0)  fsub.s1 FR_f = FR_A,FR_N
1278         nop.i 999
1279 }
1280 { .mfi
1281         nop.m 999
1282 //    N := convert to integer format( C_hi + N );
1283 //    M := P_0 * x_lo;
1284 //    N := N + M;
1285 (p0)  fadd.s1 FR_N = FR_N,FR_C_hi
1286         nop.i 999 ;;
1287 }
1288 { .mfi
1289         nop.m 999
1290 //    f = f + a Exact because a is 0 or 2^(-64);
1291 //    the msb of the sum is <= 1/2 and lsb >= 2^(-64).
1292 (p0)  fadd.s1 FR_f = FR_f,FR_a
1293         nop.i 999
1294 }
1295 { .mfi
1296         nop.m 999
1297 //
1298 //    Convert N to integer format (2**(-33) was created earlier in FR_TWOM33)
1299 //
1300 (p0)  fcvt.fx.s1 FR_N = FR_N
1301         nop.i 999 ;;
1302 }
1303 { .mfi
1304         nop.m 999
1305 (p0)  fabs FR_f_abs = FR_f
1306         nop.i 999 ;;
1307 }
1308 { .mfi
1309 (p0)  getf.sig GR_N = FR_N
1310         nop.f 999
1311         nop.i 999 ;;
1312 }
1313 { .mii
1314         nop.m 999
1315         nop.i 999 ;;
1316 (p0)  add GR_N = GR_N,GR_M ;;
1317 }
1318 //    If sgn_x == 1 (that is original x was negative)
1319 //       N := 2^10 - N
1320 //       this maintains N to be non-negative, but still
1321 //       equivalent to the (negated N) mod 4.
1322 //    End If
1323 { .mii
1324 (p12) sub GR_N = GR_Temp,GR_N
1325 (p0) cmp.eq.unc p12,p9 = 0x0,GR_sgn_x ;;
1326         nop.i 999
1327 }
1328 { .mfi
1329         nop.m 999
1330 (p0)  fcmp.ge.unc.s1 p13, p10 = FR_f_abs,FR_TWOM33
1331         nop.i 999 ;;
1332 }
1333 { .mfi
1334         nop.m 999
1335 (p9) fsub.s1 FR_D_hi = f0, FR_D_hi
1336         nop.i 999 ;;
1337 }
1338 { .mfi
1339         nop.m 999
1340 (p10)    fadd.s3 FR_A = FR_A_hi,FR_B_lo
1341         nop.i 999
1342 }
1343 { .mfi
1344         nop.m 999
1345 (p13)    fadd.s1 FR_g = FR_A_hi,FR_B_lo
1346         nop.i 999 ;;
1347 }
1348 { .mfi
1349         nop.m 999
1350 (p10)    fmax.s1 FR_B = FR_A_hi,FR_B_lo
1351         nop.i 999
1352 }
1353 { .mfi
1354         nop.m 999
1355 (p9) fsub.s1 FR_D_lo = f0, FR_D_lo
1356         nop.i 999 ;;
1357 }
1358 { .mfi
1359         nop.m 999
1360 (p10)    fmin.s1 FR_b = FR_A_hi,FR_B_lo
1361         nop.i 999 ;;
1362 }
1363 { .mfi
1364         nop.m 999
1365 (p0) fsetc.s3 0x7F,0x40
1366         nop.i 999
1367 }
1368 { .mlx
1369         nop.m 999
1370 (p10)    movl GR_Temp = 0x000000000000FFCD ;;
1371 }
1372 { .mmf
1373         nop.m 999
1374 (p10)    setf.exp FR_TWOM50 = GR_Temp
1375 (p10)    fadd.s1 FR_f_hi = FR_A,FR_f ;;
1376 }
1377 { .mfi
1378         nop.m 999
1379 //       a := (B - A) + b       Exact.
1380 //       Note that a is either 0 or 2^(-128).
1381 //       f_hi := A + f;
1382 //       f_lo := (f - f_hi) + A
1383 //       f_lo=f-f_hi is exact because either |f| >= |A|, in which
1384 //       case f-f_hi is clearly exact; or otherwise, 0<|f|<|A|
1385 //       means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64).
1386 //       If f = 2^(-64), f-f_hi involves cancellation and is
1387 //       exact. If f = -2^(-64), then A + f is exact. Hence
1388 //       f-f_hi is -A exactly, giving f_lo = 0.
1389 //       f_lo := f_lo + a;
1390 (p10)    fsub.s1 FR_a = FR_B,FR_A
1391         nop.i 999
1392 }
1393 { .mfi
1394         nop.m 999
1395 (p13)    fadd.s1 FR_s_hi = FR_f,FR_g
1396         nop.i 999 ;;
1397 }
1398 { .mlx
1399         nop.m 999
1400 //    If |f| >= 2^(-33)
1401 //       Case 1
1402 //       CASE := 1
1403 //       g := A_hi + B_lo;
1404 //       s_hi := f + g;
1405 //       s_lo := (f - s_hi) + g;
1406 (p13)    movl GR_CASE = 0x1 ;;
1407 }
1408 { .mlx
1409         nop.m 999
1410 //   Else
1411 //       Case 2
1412 //       CASE := 2
1413 //       A := fadd.fpsr3( A_hi, B_lo )
1414 //       B := max( A_hi, B_lo )
1415 //       b := min( A_hi, B_lo )
1416 (p10)    movl GR_CASE = 0x2
1417 }
1418 { .mfi
1419         nop.m 999
1420 (p10)    fsub.s1 FR_f_lo = FR_f,FR_f_hi
1421         nop.i 999 ;;
1422 }
1423 { .mfi
1424         nop.m 999
1425 (p10)    fadd.s1 FR_a = FR_a,FR_b
1426         nop.i 999
1427 }
1428 { .mfi
1429         nop.m 999
1430 (p13)    fsub.s1 FR_s_lo = FR_f,FR_s_hi
1431         nop.i 999 ;;
1432 }
1433 { .mfi
1434         nop.m 999
1435 (p13)    fadd.s1 FR_s_lo = FR_s_lo,FR_g
1436         nop.i 999 ;;
1437 }
1438 { .mfi
1439         nop.m 999
1440 (p10)    fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50
1441         nop.i 999 ;;
1442 }
1443 { .mfi
1444         nop.m 999
1445 //
1446 //       f_lo := (f - f_hi) + A   (2**(-50) was created earlier in FR_TWOM50)
1447 (p10)    fadd.s1 FR_f_lo = FR_f_lo,FR_A
1448         nop.i 999 ;;
1449 }
1450 { .mfi
1451         nop.m 999
1452 //       If |f| >= 2^(-50) then
1453 //          s_hi := f_hi;
1454 //          s_lo := f_lo;
1455 //       Else
1456 //          f_lo := (f_lo + A_lo) + x*p_4
1457 //          s_hi := f_hi + f_lo
1458 //          s_lo := (f_hi - s_hi) + f_lo
1459 //       End If
1460 (p14)  mov FR_s_hi = FR_f_hi
1461         nop.i 999 ;;
1462 }
1463 { .mfi
1464         nop.m 999
1465 (p10)    fadd.s1 FR_f_lo = FR_f_lo,FR_a
1466         nop.i 999 ;;
1467 }
1468 { .mfi
1469         nop.m 999
1470 (p14)  mov FR_s_lo = FR_f_lo
1471         nop.i 999
1472 }
1473 { .mfi
1474         nop.m 999
1475 (p11)  fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo
1476         nop.i 999 ;;
1477 }
1478 { .mfi
1479         nop.m 999
1480 (p11)  fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo
1481         nop.i 999 ;;
1482 }
1483 { .mfi
1484         nop.m 999
1485 (p11)  fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo
1486         nop.i 999 ;;
1487 }
1488 { .mfi
1489         nop.m 999
1490 //   r_hi :=  s_hi*D_hi
1491 //   r_lo :=  s_hi*D_hi - r_hi  with fma
1492 //   r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
1493 (p0) fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi
1494         nop.i 999
1495 }
1496 { .mfi
1497         nop.m 999
1498 (p11)  fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi
1499         nop.i 999 ;;
1500 }
1501 { .mfi
1502         nop.m 999
1503 (p0) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi
1504         nop.i 999
1505 }
1506 { .mfi
1507         nop.m 999
1508 (p11)  fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo
1509         nop.i 999 ;;
1510 }
1511 { .mmi
1512         nop.m 999 ;;
1513 //   Return  N, r_hi, r_lo
1514 //   We do not return CASE
1515 (p0) stfe [GR_Address_of_Outputs] = FR_r_hi,16
1516         nop.i 999 ;;
1517 }
1518 { .mfi
1519         nop.m 999
1520 (p0) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo
1521         nop.i 999 ;;
1522 }
1523 { .mfi
1524         nop.m 999
1525 (p0) fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo
1526         nop.i 999 ;;
1527 }
1528 { .mmi
1529         nop.m 999 ;;
1530 (p0) stfe [GR_Address_of_Outputs] = FR_r_lo,-16
1531         nop.i 999
1532 }
1533 { .mib
1534         nop.m 999
1535         nop.i 999
1536 (p0) br.ret.sptk   b0 ;;
1537 }
1538
1539 .endp __libm_pi_by_2_reduce
1540 ASM_SIZE_DIRECTIVE(__libm_pi_by_2_reduce)