3 // Copyright (c) 2000, 2001, Intel Corporation
4 // All rights reserved.
6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
11 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
12 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
13 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
14 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
15 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
16 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
17 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
18 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
19 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
20 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
21 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 // Intel Corporation is the author of this code, and requests that all
24 // problem reports or change requests be submitted to it directly at
25 // http://developer.intel.com/opensource.
27 // *********************************************************************
30 // 02/02/00 Initial Version
31 // 4/04/00 Unwind support added
32 // 12/28/00 Fixed false invalid flags
34 // *********************************************************************
36 // Function: tan(x) = tangent(x), for double precision x values
38 // *********************************************************************
40 // Accuracy: Very accurate for double-precision values
42 // *********************************************************************
46 // Floating-Point Registers: f8 (Input and Return Value)
50 // General Purpose Registers:
52 // r49-r50 (Used to pass arguments to pi_by_2 reduce routine)
54 // Predicate Registers: p6-p15
56 // *********************************************************************
58 // IEEE Special Conditions:
60 // Denormal fault raised on denormal inputs
61 // Overflow exceptions do not occur
62 // Underflow exceptions raised when appropriate for tan
63 // (No specialized error handling for this routine)
64 // Inexact raised when appropriate by algorithm
71 // *********************************************************************
73 // Mathematical Description
75 // We consider the computation of FPTAN of Arg. Now, given
77 // Arg = N pi/2 + alpha, |alpha| <= pi/4,
79 // basic mathematical relationship shows that
81 // tan( Arg ) = tan( alpha ) if N is even;
82 // = -cot( alpha ) otherwise.
84 // The value of alpha is obtained by argument reduction and
85 // represented by two working precision numbers r and c where
87 // alpha = r + c accurately.
89 // The reduction method is described in a previous write up.
90 // The argument reduction scheme identifies 4 cases. For Cases 2
91 // and 4, because |alpha| is small, tan(r+c) and -cot(r+c) can be
92 // computed very easily by 2 or 3 terms of the Taylor series
93 // expansion as follows:
98 // tan(r + c) = r + c + r^3/3 ...accurately
99 // -cot(r + c) = -1/(r+c) + r/3 ...accurately
104 // tan(r + c) = r + c + r^3/3 + 2r^5/15 ...accurately
105 // -cot(r + c) = -1/(r+c) + r/3 + r^3/45 ...accurately
108 // The only cases left are Cases 1 and 3 of the argument reduction
109 // procedure. These two cases will be merged since after the
110 // argument is reduced in either cases, we have the reduced argument
111 // represented as r + c and that the magnitude |r + c| is not small
112 // enough to allow the usage of a very short approximation.
114 // The greatest challenge of this task is that the second terms of
115 // the Taylor series for tan(r) and -cot(r)
117 // r + r^3/3 + 2 r^5/15 + ...
121 // -1/r + r/3 + r^3/45 + ...
123 // are not very small when |r| is close to pi/4 and the rounding
124 // errors will be a concern if simple polynomial accumulation is
125 // used. When |r| < 2^(-2), however, the second terms will be small
126 // enough (5 bits or so of right shift) that a normal Horner
127 // recurrence suffices. Hence there are two cases that we consider
128 // in the accurate computation of tan(r) and cot(r), |r| <= pi/4.
130 // Case small_r: |r| < 2^(-2)
131 // --------------------------
133 // Since Arg = N pi/4 + r + c accurately, we have
135 // tan(Arg) = tan(r+c) for N even,
136 // = -cot(r+c) otherwise.
138 // Here for this case, both tan(r) and -cot(r) can be approximated
139 // by simple polynomials:
141 // tan(r) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
142 // -cot(r) = -1/r + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
144 // accurately. Since |r| is relatively small, tan(r+c) and
145 // -cot(r+c) can be accurately approximated by replacing r with
146 // r+c only in the first two terms of the corresponding polynomials.
148 // Note that P1_1 (and Q1_1 for that matter) approximates 1/3 to
149 // almost 64 sig. bits, thus
151 // P1_1 (r+c)^3 = P1_1 r^3 + c * r^2 accurately.
155 // tan(r+c) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
158 // -cot(r+c) = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
162 // Case normal_r: 2^(-2) <= |r| <= pi/4
163 // ------------------------------------
165 // This case is more likely than the previous one if one considers
166 // r to be uniformly distributed in [-pi/4 pi/4].
168 // The required calculation is either
170 // tan(r + c) = tan(r) + correction, or
171 // -cot(r + c) = -cot(r) + correction.
175 // tan(r + c) = tan(r) + c tan'(r) + O(c^2)
176 // = tan(r) + c sec^2(r) + O(c^2)
177 // = tan(r) + c SEC_sq ...accurately
178 // as long as SEC_sq approximates sec^2(r)
179 // to, say, 5 bits or so.
183 // -cot(r + c) = -cot(r) - c cot'(r) + O(c^2)
184 // = -cot(r) + c csc^2(r) + O(c^2)
185 // = -cot(r) + c CSC_sq ...accurately
186 // as long as CSC_sq approximates csc^2(r)
187 // to, say, 5 bits or so.
189 // We therefore concentrate on accurately calculating tan(r) and
190 // cot(r) for a working-precision number r, |r| <= pi/4 to within
193 // We will employ a table-driven approach. Let
195 // r = sgn_r * 2^k * 1.b_1 b_2 ... b_5 ... b_63
196 // = sgn_r * ( B + x )
200 // B = 2^k * 1.b_1 b_2 ... b_5 1
205 // tan( B + x ) = ------------------------
209 // | tan(B) + tan(x) |
211 // = tan(B) + | ------------------------ - tan(B) |
212 // | 1 - tan(B)*tan(x) |
216 // = tan(B) + ------------------------
219 // (1/[sin(B)*cos(B)]) * tan(x)
220 // = tan(B) + --------------------------------
224 // Clearly, the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
225 // calculated beforehand and stored in a table. Since
227 // |x| <= 2^k * 2^(-6) <= 2^(-7) (because k = -1, -2)
229 // a very short polynomial will be sufficient to approximate tan(x)
230 // accurately. The details involved in computing the last expression
231 // will be given in the next section on algorithm description.
234 // Now, we turn to the case where cot( B + x ) is needed.
238 // cot( B + x ) = ------------------------
242 // | 1 - tan(B)*tan(x) |
244 // = cot(B) + | ----------------------- - cot(B) |
245 // | tan(B) + tan(x) |
248 // [tan(B) + cot(B)] * tan(x)
249 // = cot(B) - ----------------------------
252 // (1/[sin(B)*cos(B)]) * tan(x)
253 // = cot(B) - --------------------------------
257 // Note that the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) that
258 // are needed are the same set of values needed in the previous
261 // Finally, we can put all the ingredients together as follows:
263 // Arg = N * pi/2 + r + c ...accurately
265 // tan(Arg) = tan(r) + correction if N is even;
266 // = -cot(r) + correction otherwise.
268 // For Cases 2 and 4,
271 // tan(Arg) = tan(r + c) = r + c + r^3/3 N even
272 // = -cot(r + c) = -1/(r+c) + r/3 N odd
274 // tan(Arg) = tan(r + c) = r + c + r^3/3 + 2r^5/15 N even
275 // = -cot(r + c) = -1/(r+c) + r/3 + r^3/45 N odd
278 // For Cases 1 and 3,
280 // Case small_r: |r| < 2^(-2)
282 // tan(Arg) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
283 // + c*(1 + r^2) N even
285 // = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
288 // Case normal_r: 2^(-2) <= |r| <= pi/4
290 // tan(Arg) = tan(r) + c * sec^2(r) N even
291 // = -cot(r) + c * csc^2(r) otherwise
295 // tan(Arg) = tan(r) + c*sec^2(r)
296 // = tan( sgn_r * (B+x) ) + c * sec^2(|r|)
297 // = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(|r|) )
298 // = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(B) )
300 // since B approximates |r| to 2^(-6) in relative accuracy.
302 // / (1/[sin(B)*cos(B)]) * tan(x)
303 // tan(Arg) = sgn_r * | tan(B) + --------------------------------
311 // CORR = sgn_r*c*tan(B)*SC_inv(B); SC_inv(B) = 1/(sin(B)*cos(B)).
315 // tan(Arg) = -cot(r) + c*csc^2(r)
316 // = -cot( sgn_r * (B+x) ) + c * csc^2(|r|)
317 // = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(|r|) )
318 // = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(B) )
320 // since B approximates |r| to 2^(-6) in relative accuracy.
322 // / (1/[sin(B)*cos(B)]) * tan(x)
323 // tan(Arg) = sgn_r * | -cot(B) + --------------------------------
331 // CORR = sgn_r*c*cot(B)*SC_inv(B); SC_inv(B) = 1/(sin(B)*cos(B)).
334 // The actual algorithm prescribes how all the mathematical formulas
338 // 2. Algorithmic Description
339 // ==========================
341 // 2.1 Computation for Cases 2 and 4.
342 // ----------------------------------
344 // For Case 2, we use two-term polynomials.
349 // Result := c + r * rsq * P1_1
350 // Result := r + Result ...in user-defined rounding
353 // S_hi := -frcpa(r) ...8 bits
354 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits
355 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits
356 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits
357 // S_lo := S_hi*( (1 + S_hi*r) + S_hi*c )
358 // ...S_hi + S_lo is -1/(r+c) to extra precision
359 // S_lo := S_lo + Q1_1*r
361 // Result := S_hi + S_lo ...in user-defined rounding
363 // For Case 4, we use three-term polynomials
368 // Result := c + r * rsq * (P1_1 + rsq * P1_2)
369 // Result := r + Result ...in user-defined rounding
372 // S_hi := -frcpa(r) ...8 bits
373 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits
374 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits
375 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits
376 // S_lo := S_hi*( (1 + S_hi*r) + S_hi*c )
377 // ...S_hi + S_lo is -1/(r+c) to extra precision
379 // P := Q1_1 + rsq*Q1_2
380 // S_lo := S_lo + r*P
382 // Result := S_hi + S_lo ...in user-defined rounding
385 // Note that the coefficients P1_1, P1_2, Q1_1, and Q1_2 are
386 // the same as those used in the small_r case of Cases 1 and 3
390 // 2.2 Computation for Cases 1 and 3.
391 // ----------------------------------
392 // This is further divided into the case of small_r,
393 // where |r| < 2^(-2), and the case of normal_r, where |r| lies between
396 // Algorithm for the case of small_r
397 // ---------------------------------
401 // Poly1 := rsq*(P1_1 + rsq*(P1_2 + rsq*P1_3))
402 // r_to_the_8 := rsq * rsq
403 // r_to_the_8 := r_to_the_8 * r_to_the_8
404 // Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... rsq*P1_9))
405 // CORR := c * ( 1 + rsq )
406 // Poly := Poly1 + r_to_the_8*Poly2
407 // Result := r*Poly + CORR
408 // Result := r + Result ...in user-defined rounding
409 // ...note that Poly1 and r_to_the_8 can be computed in parallel
410 // ...with Poly2 (Poly1 is intentionally set to be much
411 // ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden)
414 // S_hi := -frcpa(r) ...8 bits
415 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits
416 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits
417 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits
418 // S_lo := S_hi*( (1 + S_hi*r) + S_hi*c )
419 // ...S_hi + S_lo is -1/(r+c) to extra precision
420 // S_lo := S_lo + Q1_1*c
422 // ...S_hi and S_lo are computed in parallel with
425 // P := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7))
427 // Result := r*P + S_lo
428 // Result := S_hi + Result ...in user-defined rounding
431 // Algorithm for the case of normal_r
432 // ----------------------------------
434 // Here, we first consider the computation of tan( r + c ). As
435 // presented in the previous section,
437 // tan( r + c ) = tan(r) + c * sec^2(r)
438 // = sgn_r * [ tan(B+x) + CORR ]
439 // CORR = sgn_r * c * tan(B) * 1/[sin(B)*cos(B)]
441 // because sec^2(r) = sec^(|r|), and B approximate |r| to 6.5 bits.
444 // / (1/[sin(B)*cos(B)]) * tan(x)
445 // sgn_r * | tan(B) + -------------------------------- +
452 // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
453 // calculated beforehand and stored in a table. Specifically,
454 // the table values are
456 // tan(B) as T_hi + T_lo;
457 // cot(B) as C_hi + C_lo;
458 // 1/[sin(B)*cos(B)] as SC_inv
460 // T_hi, C_hi are in double-precision memory format;
461 // T_lo, C_lo are in single-precision memory format;
462 // SC_inv is in extended-precision memory format.
464 // The value of tan(x) will be approximated by a short polynomial of
467 // tan(x) as x + x * P, where
468 // P = x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
470 // Because |x| <= 2^(-7), cot(B) - x approximates cot(B) - tan(x)
471 // to a relative accuracy better than 2^(-20). Thus, a good
472 // initial guess of 1/( cot(B) - tan(x) ) to initiate the iterative
475 // 1/(cot(B) - tan(x)) is approximately
477 // tan(B)/(1 - x*tan(B)) is approximately
478 // T_hi / ( 1 - T_hi * x ) is approximately
480 // T_hi * [ 1 + (Thi * x) + (T_hi * x)^2 ]
482 // The calculation of tan(r+c) therefore proceed as follows:
487 // V_hi := T_hi*(1 + Tx*(1 + Tx))
488 // P := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
489 // ...V_hi serves as an initial guess of 1/(cot(B) - tan(x))
490 // ...good to about 20 bits of accuracy
494 // ...D is a double precision denominator: cot(B) - tan(x)
496 // V_hi := V_hi + V_hi*(1 - V_hi*D)
497 // ....V_hi approximates 1/(cot(B)-tan(x)) to 40 bits
499 // V_lo := V_hi * ( [ (1 - V_hi*C_hi) + V_hi*tanx ]
500 // - V_hi*C_lo ) ...observe all order
501 // ...V_hi + V_lo approximates 1/(cot(B) - tan(x))
502 // ...to extra accuracy
504 // ... SC_inv(B) * (x + x*P)
505 // ... tan(B) + ------------------------- + CORR
506 // ... cot(B) - (x + x*P)
508 // ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
512 // CORR := sgn_r * c * SC_inv * T_hi
514 // ...put the ingredients together to compute
515 // ... SC_inv(B) * (x + x*P)
516 // ... tan(B) + ------------------------- + CORR
517 // ... cot(B) - (x + x*P)
519 // ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
521 // ... = T_hi + T_lo + CORR +
522 // ... Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
524 // CORR := CORR + T_lo
525 // tail := V_lo + P*(V_hi + V_lo)
526 // tail := Sx * tail + CORR
527 // tail := Sx * V_hi + tail
528 // T_hi := sgn_r * T_hi
530 // ...T_hi + sgn_r*tail now approximate
531 // ...sgn_r*(tan(B+x) + CORR) accurately
533 // Result := T_hi + sgn_r*tail ...in user-defined
534 // ...rounding control
535 // ...It is crucial that independent paths be fully
536 // ...exploited for performance's sake.
539 // Next, we consider the computation of -cot( r + c ). As
540 // presented in the previous section,
542 // -cot( r + c ) = -cot(r) + c * csc^2(r)
543 // = sgn_r * [ -cot(B+x) + CORR ]
544 // CORR = sgn_r * c * cot(B) * 1/[sin(B)*cos(B)]
546 // because csc^2(r) = csc^(|r|), and B approximate |r| to 6.5 bits.
549 // / (1/[sin(B)*cos(B)]) * tan(x)
550 // sgn_r * | -cot(B) + -------------------------------- +
557 // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
558 // calculated beforehand and stored in a table. Specifically,
559 // the table values are
561 // tan(B) as T_hi + T_lo;
562 // cot(B) as C_hi + C_lo;
563 // 1/[sin(B)*cos(B)] as SC_inv
565 // T_hi, C_hi are in double-precision memory format;
566 // T_lo, C_lo are in single-precision memory format;
567 // SC_inv is in extended-precision memory format.
569 // The value of tan(x) will be approximated by a short polynomial of
572 // tan(x) as x + x * P, where
573 // P = x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
575 // Because |x| <= 2^(-7), tan(B) + x approximates tan(B) + tan(x)
576 // to a relative accuracy better than 2^(-18). Thus, a good
577 // initial guess of 1/( tan(B) + tan(x) ) to initiate the iterative
580 // 1/(tan(B) + tan(x)) is approximately
582 // cot(B)/(1 + x*cot(B)) is approximately
583 // C_hi / ( 1 + C_hi * x ) is approximately
585 // C_hi * [ 1 - (C_hi * x) + (C_hi * x)^2 ]
587 // The calculation of -cot(r+c) therefore proceed as follows:
592 // V_hi := C_hi*(1 - Cx*(1 - Cx))
593 // P := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
594 // ...V_hi serves as an initial guess of 1/(tan(B) + tan(x))
595 // ...good to about 18 bits of accuracy
599 // ...D is a double precision denominator: tan(B) + tan(x)
601 // V_hi := V_hi + V_hi*(1 - V_hi*D)
602 // ....V_hi approximates 1/(tan(B)+tan(x)) to 40 bits
604 // V_lo := V_hi * ( [ (1 - V_hi*T_hi) - V_hi*tanx ]
605 // - V_hi*T_lo ) ...observe all order
606 // ...V_hi + V_lo approximates 1/(tan(B) + tan(x))
607 // ...to extra accuracy
609 // ... SC_inv(B) * (x + x*P)
610 // ... -cot(B) + ------------------------- + CORR
611 // ... tan(B) + (x + x*P)
613 // ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
617 // CORR := sgn_r * c * SC_inv * C_hi
619 // ...put the ingredients together to compute
620 // ... SC_inv(B) * (x + x*P)
621 // ... -cot(B) + ------------------------- + CORR
622 // ... tan(B) + (x + x*P)
624 // ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
626 // ... =-C_hi - C_lo + CORR +
627 // ... Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
629 // CORR := CORR - C_lo
630 // tail := V_lo + P*(V_hi + V_lo)
631 // tail := Sx * tail + CORR
632 // tail := Sx * V_hi + tail
633 // C_hi := -sgn_r * C_hi
635 // ...C_hi + sgn_r*tail now approximates
636 // ...sgn_r*(-cot(B+x) + CORR) accurately
638 // Result := C_hi + sgn_r*tail in user-defined rounding control
639 // ...It is crucial that independent paths be fully
640 // ...exploited for performance's sake.
642 // 3. Implementation Notes
643 // =======================
645 // Table entries T_hi, T_lo; C_hi, C_lo; SC_inv
647 // Recall that 2^(-2) <= |r| <= pi/4;
649 // r = sgn_r * 2^k * 1.b_1 b_2 ... b_63
653 // B = 2^k * 1.b_1 b_2 b_3 b_4 b_5 1
655 // Thus, for k = -2, possible values of B are
657 // B = 2^(-2) * ( 1 + index/32 + 1/64 ),
658 // index ranges from 0 to 31
660 // For k = -1, however, since |r| <= pi/4 = 0.78...
661 // possible values of B are
663 // B = 2^(-1) * ( 1 + index/32 + 1/64 )
664 // index ranges from 0 to 19.
668 #include "libm_support.h"
679 ASM_TYPE_DIRECTIVE(TAN_BASE_CONSTANTS,@object)
680 data4 0x4B800000, 0xCB800000, 0x38800000, 0xB8800000 // two**24, -two**24
681 // two**-14, -two**-14
682 data4 0x4E44152A, 0xA2F9836E, 0x00003FFE, 0x00000000 // two_by_pi
683 data4 0xCE81B9F1, 0xC84D32B0, 0x00004016, 0x00000000 // P_0
684 data4 0x2168C235, 0xC90FDAA2, 0x00003FFF, 0x00000000 // P_1
685 data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD, 0x00000000 // P_2
686 data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C, 0x00000000 // P_3
687 data4 0x5F000000, 0xDF000000, 0x00000000, 0x00000000 // two_to_63, -two_to_63
688 data4 0x6EC6B45A, 0xA397E504, 0x00003FE7, 0x00000000 // Inv_P_0
689 data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF, 0x00000000 // d_1
690 data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C, 0x00000000 // d_2
691 data4 0x2168C234, 0xC90FDAA2, 0x00003FFE, 0x00000000 // PI_BY_4
692 data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE, 0x00000000 // MPI_BY_4
693 data4 0x3E800000, 0xBE800000, 0x00000000, 0x00000000 // two**-2, -two**-2
694 data4 0x2F000000, 0xAF000000, 0x00000000, 0x00000000 // two**-33, -two**-33
695 data4 0xAAAAAABD, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P1_1
696 data4 0x88882E6A, 0x88888888, 0x00003FFC, 0x00000000 // P1_2
697 data4 0x0F0177B6, 0xDD0DD0DD, 0x00003FFA, 0x00000000 // P1_3
698 data4 0x646B8C6D, 0xB327A440, 0x00003FF9, 0x00000000 // P1_4
699 data4 0x1D5F7D20, 0x91371B25, 0x00003FF8, 0x00000000 // P1_5
700 data4 0x61C67914, 0xEB69A5F1, 0x00003FF6, 0x00000000 // P1_6
701 data4 0x019318D2, 0xBEDD37BE, 0x00003FF5, 0x00000000 // P1_7
702 data4 0x3C794015, 0x9979B146, 0x00003FF4, 0x00000000 // P1_8
703 data4 0x8C6EB58A, 0x8EBD21A3, 0x00003FF3, 0x00000000 // P1_9
704 data4 0xAAAAAAB4, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // Q1_1
705 data4 0x0B5FC93E, 0xB60B60B6, 0x00003FF9, 0x00000000 // Q1_2
706 data4 0x0C9BBFBF, 0x8AB355E0, 0x00003FF6, 0x00000000 // Q1_3
707 data4 0xCBEE3D4C, 0xDDEBBC89, 0x00003FF2, 0x00000000 // Q1_4
708 data4 0x5F80BBB6, 0xB3548A68, 0x00003FEF, 0x00000000 // Q1_5
709 data4 0x4CED5BF1, 0x91362560, 0x00003FEC, 0x00000000 // Q1_6
710 data4 0x8EE92A83, 0xF189D95A, 0x00003FE8, 0x00000000 // Q1_7
711 data4 0xAAAB362F, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P2_1
712 data4 0xE97A6097, 0x88888886, 0x00003FFC, 0x00000000 // P2_2
713 data4 0x25E716A1, 0xDD108EE0, 0x00003FFA, 0x00000000 // P2_3
715 // Entries T_hi double-precision memory format
716 // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
717 // Entries T_lo single-precision memory format
718 // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
720 data4 0x62400794, 0x3FD09BC3, 0x23A05C32, 0x00000000
721 data4 0xDFFBC074, 0x3FD124A9, 0x240078B2, 0x00000000
722 data4 0x5BD4920F, 0x3FD1AE23, 0x23826B8E, 0x00000000
723 data4 0x15E2701D, 0x3FD23835, 0x22D31154, 0x00000000
724 data4 0x63739C2D, 0x3FD2C2E4, 0x2265C9E2, 0x00000000
725 data4 0xAFEEA48B, 0x3FD34E36, 0x245C05EB, 0x00000000
726 data4 0x7DBB35D1, 0x3FD3DA31, 0x24749F2D, 0x00000000
727 data4 0x67321619, 0x3FD466DA, 0x2462CECE, 0x00000000
728 data4 0x1F94A4D5, 0x3FD4F437, 0x246D0DF1, 0x00000000
729 data4 0x740C3E6D, 0x3FD5824D, 0x240A85B5, 0x00000000
730 data4 0x4CB1E73D, 0x3FD61123, 0x23F96E33, 0x00000000
731 data4 0xAD9EA64B, 0x3FD6A0BE, 0x247C5393, 0x00000000
732 data4 0xB804FD01, 0x3FD73125, 0x241F3B29, 0x00000000
733 data4 0xAB53EE83, 0x3FD7C25E, 0x2479989B, 0x00000000
734 data4 0xE6640EED, 0x3FD8546F, 0x23B343BC, 0x00000000
735 data4 0xE8AF1892, 0x3FD8E75F, 0x241454D1, 0x00000000
736 data4 0x53928BDA, 0x3FD97B35, 0x238613D9, 0x00000000
737 data4 0xEB9DE4DE, 0x3FDA0FF6, 0x22859FA7, 0x00000000
738 data4 0x99ECF92D, 0x3FDAA5AB, 0x237A6D06, 0x00000000
739 data4 0x6D8F1796, 0x3FDB3C5A, 0x23952F6C, 0x00000000
740 data4 0x9CFB8BE4, 0x3FDBD40A, 0x2280FC95, 0x00000000
741 data4 0x87943100, 0x3FDC6CC3, 0x245D2EC0, 0x00000000
742 data4 0xB736C500, 0x3FDD068C, 0x23C4AD7D, 0x00000000
743 data4 0xE1DDBC31, 0x3FDDA16D, 0x23D076E6, 0x00000000
744 data4 0xEB515A93, 0x3FDE3D6E, 0x244809A6, 0x00000000
745 data4 0xE6E9E5F1, 0x3FDEDA97, 0x220856C8, 0x00000000
746 data4 0x1963CE69, 0x3FDF78F1, 0x244BE993, 0x00000000
747 data4 0x7D635BCE, 0x3FE00C41, 0x23D21799, 0x00000000
748 data4 0x1C302CD3, 0x3FE05CAB, 0x248A1B1D, 0x00000000
749 data4 0xDB6A1FA0, 0x3FE0ADB9, 0x23D53E33, 0x00000000
750 data4 0x4A20BA81, 0x3FE0FF72, 0x24DB9ED5, 0x00000000
751 data4 0x153FA6F5, 0x3FE151D9, 0x24E9E451, 0x00000000
753 // Entries T_hi double-precision memory format
754 // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
755 // Entries T_lo single-precision memory format
756 // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
758 data4 0xBA1BE39E, 0x3FE1CEC4, 0x24B60F9E, 0x00000000
759 data4 0x5ABD9B2D, 0x3FE277E4, 0x248C2474, 0x00000000
760 data4 0x0272B110, 0x3FE32418, 0x247B8311, 0x00000000
761 data4 0x890E2DF0, 0x3FE3D38B, 0x24C55751, 0x00000000
762 data4 0x46236871, 0x3FE4866D, 0x24E5BC34, 0x00000000
763 data4 0x45E044B0, 0x3FE53CEE, 0x24001BA4, 0x00000000
764 data4 0x82EC06E4, 0x3FE5F742, 0x24B973DC, 0x00000000
765 data4 0x25DF43F9, 0x3FE6B5A1, 0x24895440, 0x00000000
766 data4 0xCAFD348C, 0x3FE77844, 0x240021CA, 0x00000000
767 data4 0xCEED6B92, 0x3FE83F6B, 0x24C45372, 0x00000000
768 data4 0xA34F3665, 0x3FE90B58, 0x240DAD33, 0x00000000
769 data4 0x2C1E56B4, 0x3FE9DC52, 0x24F846CE, 0x00000000
770 data4 0x27041578, 0x3FEAB2A4, 0x2323FB6E, 0x00000000
771 data4 0x9DD8C373, 0x3FEB8E9F, 0x24B3090B, 0x00000000
772 data4 0x65C9AA7B, 0x3FEC709B, 0x2449F611, 0x00000000
773 data4 0xACCF8435, 0x3FED58F4, 0x23616A7E, 0x00000000
774 data4 0x97635082, 0x3FEE480F, 0x24C2FEAE, 0x00000000
775 data4 0xF0ACC544, 0x3FEF3E57, 0x242CE964, 0x00000000
776 data4 0xF7E06E4B, 0x3FF01E20, 0x2480D3EE, 0x00000000
777 data4 0x8A798A69, 0x3FF0A125, 0x24DB8967, 0x00000000
779 // Entries C_hi double-precision memory format
780 // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
781 // Entries C_lo single-precision memory format
782 // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
784 data4 0xE63EFBD0, 0x400ED3E2, 0x259D94D4, 0x00000000
785 data4 0xC515DAB5, 0x400DDDB4, 0x245F0537, 0x00000000
786 data4 0xBE19A79F, 0x400CF57A, 0x25D4EA9F, 0x00000000
787 data4 0xD15298ED, 0x400C1A06, 0x24AE40A0, 0x00000000
788 data4 0x164B2708, 0x400B4A4C, 0x25A5AAB6, 0x00000000
789 data4 0x5285B068, 0x400A855A, 0x25524F18, 0x00000000
790 data4 0x3FFA549F, 0x4009CA5A, 0x24C999C0, 0x00000000
791 data4 0x646AF623, 0x4009188A, 0x254FD801, 0x00000000
792 data4 0x6084D0E7, 0x40086F3C, 0x2560F5FD, 0x00000000
793 data4 0xA29A76EE, 0x4007CDD2, 0x255B9D19, 0x00000000
794 data4 0x6C8ECA95, 0x400733BE, 0x25CB021B, 0x00000000
795 data4 0x1F8DDC52, 0x4006A07E, 0x24AB4722, 0x00000000
796 data4 0xC298AD58, 0x4006139B, 0x252764E2, 0x00000000
797 data4 0xBAD7164B, 0x40058CAB, 0x24DAF5DB, 0x00000000
798 data4 0xAE31A5D3, 0x40050B4B, 0x25EA20F4, 0x00000000
799 data4 0x89F85A8A, 0x40048F21, 0x2583A3E8, 0x00000000
800 data4 0xA862380D, 0x400417DA, 0x25DCC4CC, 0x00000000
801 data4 0x1088FCFE, 0x4003A52B, 0x2430A492, 0x00000000
802 data4 0xCD3527D5, 0x400336CC, 0x255F77CF, 0x00000000
803 data4 0x5760766D, 0x4002CC7F, 0x25DA0BDA, 0x00000000
804 data4 0x11CE02E3, 0x40026607, 0x256FF4A2, 0x00000000
805 data4 0xD37BBE04, 0x4002032C, 0x25208AED, 0x00000000
806 data4 0x7F050775, 0x4001A3BD, 0x24B72DD6, 0x00000000
807 data4 0xA554848A, 0x40014789, 0x24AB4DAA, 0x00000000
808 data4 0x323E81B7, 0x4000EE65, 0x2584C440, 0x00000000
809 data4 0x21CF1293, 0x40009827, 0x25C9428D, 0x00000000
810 data4 0x3D415EEB, 0x400044A9, 0x25DC8482, 0x00000000
811 data4 0xBD72C577, 0x3FFFE78F, 0x257F5070, 0x00000000
812 data4 0x75EFD28E, 0x3FFF4AC3, 0x23EBBF7A, 0x00000000
813 data4 0x60B52DDE, 0x3FFEB2AF, 0x22EECA07, 0x00000000
814 data4 0x35204180, 0x3FFE1F19, 0x24191079, 0x00000000
815 data4 0x54F7E60A, 0x3FFD8FCA, 0x248D3058, 0x00000000
817 // Entries C_hi double-precision memory format
818 // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
819 // Entries C_lo single-precision memory format
820 // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
822 data4 0x79F6FADE, 0x3FFCC06A, 0x239C7886, 0x00000000
823 data4 0x891662A6, 0x3FFBB91F, 0x250BD191, 0x00000000
824 data4 0x529F155D, 0x3FFABFB6, 0x256CC3E6, 0x00000000
825 data4 0x2E964AE9, 0x3FF9D300, 0x250843E3, 0x00000000
826 data4 0x89DCB383, 0x3FF8F1EF, 0x2277C87E, 0x00000000
827 data4 0x7C87DBD6, 0x3FF81B93, 0x256DA6CF, 0x00000000
828 data4 0x1042EDE4, 0x3FF74F14, 0x2573D28A, 0x00000000
829 data4 0x1784B360, 0x3FF68BAF, 0x242E489A, 0x00000000
830 data4 0x7C923C4C, 0x3FF5D0B5, 0x2532D940, 0x00000000
831 data4 0xF418EF20, 0x3FF51D88, 0x253C7DD6, 0x00000000
832 data4 0x02F88DAE, 0x3FF4719A, 0x23DB59BF, 0x00000000
833 data4 0x49DA0788, 0x3FF3CC66, 0x252B4756, 0x00000000
834 data4 0x0B980DB8, 0x3FF32D77, 0x23FE585F, 0x00000000
835 data4 0xE56C987A, 0x3FF2945F, 0x25378A63, 0x00000000
836 data4 0xB16523F6, 0x3FF200BD, 0x247BB2E0, 0x00000000
837 data4 0x8CE27778, 0x3FF17235, 0x24446538, 0x00000000
838 data4 0xFDEFE692, 0x3FF0E873, 0x2514638F, 0x00000000
839 data4 0x33154062, 0x3FF0632C, 0x24A7FC27, 0x00000000
840 data4 0xB3EF115F, 0x3FEFC42E, 0x248FD0FE, 0x00000000
841 data4 0x135D26F6, 0x3FEEC9E8, 0x2385C719, 0x00000000
843 // Entries SC_inv in Swapped IEEE format (extended)
844 // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
846 data4 0x1BF30C9E, 0x839D6D4A, 0x00004001, 0x00000000
847 data4 0x554B0EB0, 0x80092804, 0x00004001, 0x00000000
848 data4 0xA1CF0DE9, 0xF959F94C, 0x00004000, 0x00000000
849 data4 0x77378677, 0xF3086BA0, 0x00004000, 0x00000000
850 data4 0xCCD4723C, 0xED154515, 0x00004000, 0x00000000
851 data4 0x1C27CF25, 0xE7790944, 0x00004000, 0x00000000
852 data4 0x8DDACB88, 0xE22D037D, 0x00004000, 0x00000000
853 data4 0x89C73522, 0xDD2B2D8A, 0x00004000, 0x00000000
854 data4 0xBB2C1171, 0xD86E1A23, 0x00004000, 0x00000000
855 data4 0xDFF5E0F9, 0xD3F0E288, 0x00004000, 0x00000000
856 data4 0x283BEBD5, 0xCFAF16B1, 0x00004000, 0x00000000
857 data4 0x0D88DD53, 0xCBA4AFAA, 0x00004000, 0x00000000
858 data4 0xCA67C43D, 0xC7CE03CC, 0x00004000, 0x00000000
859 data4 0x0CA0DDB0, 0xC427BC82, 0x00004000, 0x00000000
860 data4 0xF13D8CAB, 0xC0AECD57, 0x00004000, 0x00000000
861 data4 0x71ECE6B1, 0xBD606C38, 0x00004000, 0x00000000
862 data4 0xA44C4929, 0xBA3A0A96, 0x00004000, 0x00000000
863 data4 0xE5CCCEC1, 0xB7394F6F, 0x00004000, 0x00000000
864 data4 0x9637D8BC, 0xB45C1203, 0x00004000, 0x00000000
865 data4 0x92CB051B, 0xB1A05528, 0x00004000, 0x00000000
866 data4 0x6BA2FFD0, 0xAF04432B, 0x00004000, 0x00000000
867 data4 0x7221235F, 0xAC862A23, 0x00004000, 0x00000000
868 data4 0x5F00A9D1, 0xAA2478AF, 0x00004000, 0x00000000
869 data4 0x81E082BF, 0xA7DDBB0C, 0x00004000, 0x00000000
870 data4 0x45684FEE, 0xA5B0987D, 0x00004000, 0x00000000
871 data4 0x627A8F53, 0xA39BD0F5, 0x00004000, 0x00000000
872 data4 0x6EC5C8B0, 0xA19E3B03, 0x00004000, 0x00000000
873 data4 0x91CD7C66, 0x9FB6C1F0, 0x00004000, 0x00000000
874 data4 0x1FA3DF8A, 0x9DE46410, 0x00004000, 0x00000000
875 data4 0xA8F6B888, 0x9C263139, 0x00004000, 0x00000000
876 data4 0xC27B0450, 0x9A7B4968, 0x00004000, 0x00000000
877 data4 0x5EE614EE, 0x98E2DB7E, 0x00004000, 0x00000000
879 // Entries SC_inv in Swapped IEEE format (extended)
880 // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
882 data4 0x13B2B5BA, 0x969F335C, 0x00004000, 0x00000000
883 data4 0xD4C0F548, 0x93D446D9, 0x00004000, 0x00000000
884 data4 0x61B798AF, 0x9147094F, 0x00004000, 0x00000000
885 data4 0x758787AC, 0x8EF317CC, 0x00004000, 0x00000000
886 data4 0xB99EEFDB, 0x8CD498B3, 0x00004000, 0x00000000
887 data4 0xDFF8BC37, 0x8AE82A7D, 0x00004000, 0x00000000
888 data4 0xE3C55D42, 0x892AD546, 0x00004000, 0x00000000
889 data4 0xD15573C1, 0x8799FEA9, 0x00004000, 0x00000000
890 data4 0x435A4B4C, 0x86335F88, 0x00004000, 0x00000000
891 data4 0x3E93A87B, 0x84F4FB6E, 0x00004000, 0x00000000
892 data4 0x80A382FB, 0x83DD1952, 0x00004000, 0x00000000
893 data4 0xA4CB8C9E, 0x82EA3D7F, 0x00004000, 0x00000000
894 data4 0x6861D0A8, 0x821B247C, 0x00004000, 0x00000000
895 data4 0x63E8D244, 0x816EBED1, 0x00004000, 0x00000000
896 data4 0x27E4CFC6, 0x80E42D91, 0x00004000, 0x00000000
897 data4 0x28E64AFD, 0x807ABF8D, 0x00004000, 0x00000000
898 data4 0x863B4FD8, 0x8031EF26, 0x00004000, 0x00000000
899 data4 0xAE8C11FD, 0x800960AD, 0x00004000, 0x00000000
900 data4 0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000
901 data4 0xA07791FA, 0x80186650, 0x00004000, 0x00000000
965 NEGTWO_TO_NEG14 = f76
966 NEGTWO_TO_NEG33 = f77
1022 GR_Parameter_X = r49
1023 GR_Parameter_r = r50
1035 alloc r32 = ar.pfs, 0,17,2,0
1036 (p0) fclass.m.unc p6,p0 = Arg, 0x1E7
1043 (p0) fclass.nm.unc p7,p0 = Arg, 0x1FF
1049 (p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
1056 ld8 table_ptr1 = [table_ptr1]
1057 setf.sig fp_tmp = gr_tmp // Make a constant so fmpy produces inexact
1063 // Check for NatVals, Infs , NaNs, and Zeros
1064 // Check for everything - if false, then must be pseudo-zero
1066 // Local table pointer
1070 (p0) add table_ptr2 = 96, table_ptr1
1071 (p6) br.cond.spnt __libm_TAN_SPECIAL
1072 (p7) br.cond.spnt __libm_TAN_SPECIAL ;;
1076 // Branch out to deal with unsupporteds and special values.
1080 (p0) ldfs TWO_TO_24 = [table_ptr1],4
1081 (p0) ldfs TWO_TO_63 = [table_ptr2],4
1083 // Load -2**24, load -2**63.
1085 (p0) fcmp.eq.s0 p0, p6 = Arg, f1 ;;
1089 (p0) ldfs NEGTWO_TO_63 = [table_ptr2],12
1090 (p0) fnorm.s1 Arg = Arg
1094 // Load 2**24, Load 2**63.
1098 (p0) ldfs NEGTWO_TO_24 = [table_ptr1],12 ;;
1100 // Do fcmp to generate Denormal exception
1101 // - can't do FNORM (will generate Underflow when U is unmasked!)
1102 // Normalize input argument.
1104 (p0) ldfe two_by_PI = [table_ptr1],16
1109 (p0) ldfe Inv_P_0 = [table_ptr2],16 ;;
1110 (p0) ldfe d_1 = [table_ptr2],16
1114 // Decide about the paths to take:
1115 // PR_1 and PR_3 set if -2**24 < Arg < 2**24 - CASE 1 OR 2
1116 // OTHERWISE - CASE 3 OR 4
1117 // Load inverse of P_0 .
1118 // Set PR_6 if Arg <= -2**63
1119 // Are there any Infs, NaNs, or zeros?
1123 (p0) ldfe P_0 = [table_ptr1],16 ;;
1124 (p0) ldfe d_2 = [table_ptr2],16
1128 // Set PR_8 if Arg <= -2**24
1129 // Set PR_6 if Arg >= 2**63
1133 (p0) ldfe P_1 = [table_ptr1],16 ;;
1134 (p0) ldfe PI_BY_4 = [table_ptr2],16
1138 // Set PR_8 if Arg >= 2**24
1142 (p0) ldfe P_2 = [table_ptr1],16 ;;
1143 (p0) ldfe MPI_BY_4 = [table_ptr2],16
1147 // Load P_2 and PI_BY_4
1151 (p0) ldfe P_3 = [table_ptr1],16
1158 (p0) fcmp.le.unc.s1 p6,p7 = Arg,NEGTWO_TO_63
1164 (p0) fcmp.le.unc.s1 p8,p9 = Arg,NEGTWO_TO_24
1170 (p7) fcmp.ge.s1 p6,p0 = Arg,TWO_TO_63
1176 (p9) fcmp.ge.s1 p8,p0 = Arg,TWO_TO_24
1184 // Load P_3 and -PI_BY_4
1186 (p6) br.cond.spnt TAN_ARG_TOO_LARGE ;;
1195 // Branch out if we have a special argument.
1196 // Branch out if the magnitude of the input argument is too large
1197 // - do this branch before the next.
1199 (p8) br.cond.spnt TAN_LARGER_ARG ;;
1202 // Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
1206 (p0) ldfs TWO_TO_NEG2 = [table_ptr2],4
1207 // ARGUMENT REDUCTION CODE - CASE 1 and 2
1210 (p0) fmpy.s1 N = Arg,two_by_PI
1215 (p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],12
1219 (p0) fcmp.lt.unc.s1 p8,p9= Arg,PI_BY_4
1226 // if Arg < pi/4, set PR_8.
1228 (p8) fcmp.gt.s1 p8,p9= Arg,MPI_BY_4
1232 // Case 1: Is |r| < 2**(-2).
1233 // Arg is the same as r in this case.
1239 (p8) mov N_fix_gr = r0
1241 // if Arg > -pi/4, reset PR_8.
1242 // Select the case when |Arg| < pi/4 - set PR[8] = true.
1243 // Else Select the case when |Arg| >= pi/4 - set PR[9] = true.
1245 (p0) fcvt.fx.s1 N_fix = N
1252 // Grab the integer part of N .
1266 (p8) fcmp.lt.unc.s1 p10, p11 = Arg, TWO_TO_NEG2
1272 (p10) fcmp.gt.s1 p10,p0 = Arg, NEGTWO_TO_NEG2
1279 // Case 2: Place integer part of N in GP register.
1281 (p9) fcvt.xf N = N_fix
1286 (p9) getf.sig N_fix_gr = N_fix
1289 // Case 2: Convert integer N_fix back to normalized floating-point value.
1291 (p10) br.cond.spnt TAN_SMALL_R ;;
1297 (p8) br.cond.sptk TAN_NORMAL_R ;;
1300 // Case 1: PR_3 is only affected when PR_1 is set.
1304 (p9) ldfs TWO_TO_NEG33 = [table_ptr2], 4 ;;
1306 // Case 2: Load 2**(-33).
1308 (p9) ldfs NEGTWO_TO_NEG33 = [table_ptr2], 4
1315 // Case 2: Load -2**(-33).
1317 (p9) fnma.s1 s_val = N, P_1, Arg
1323 (p9) fmpy.s1 w = N, P_2
1330 // Case 2: w = N * P_2
1331 // Case 2: s_val = -N * P_1 + Arg
1333 (p0) fcmp.lt.unc.s1 p9,p8 = s_val, TWO_TO_NEG33
1340 // Decide between case_1 and case_2 reduce:
1342 (p9) fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
1349 // Case 1_reduce: s <= -2**(-33) or s >= 2**(-33)
1350 // Case 2_reduce: -2**(-33) < s < 2**(-33)
1352 (p8) fsub.s1 r = s_val, w
1358 (p9) fmpy.s1 w = N, P_3
1364 (p9) fma.s1 U_1 = N, P_2, w
1371 // Case 1_reduce: Is |r| < 2**(-2), if so set PR_10
1374 (p8) fsub.s1 c = s_val, r
1381 // Case 1_reduce: r = s + w (change sign)
1382 // Case 2_reduce: w = N * P_3 (change sign)
1384 (p8) fcmp.lt.unc.s1 p10, p11 = r, TWO_TO_NEG2
1390 (p10) fcmp.gt.s1 p10, p11 = r, NEGTWO_TO_NEG2
1396 (p9) fsub.s1 r = s_val, U_1
1403 // Case 1_reduce: c is complete here.
1404 // c = c + w (w has not been negated.)
1405 // Case 2_reduce: r is complete here - continue to calculate c .
1408 (p9) fms.s1 U_2 = N, P_2, U_1
1415 // Case 1_reduce: c = s - r
1416 // Case 2_reduce: U_1 = N * P_2 + w
1418 (p8) fsub.s1 c = c, w
1424 (p9) fsub.s1 s_val = s_val, r
1432 // U_2 = N * P_2 - U_1
1433 // Not needed until later.
1435 (p9) fadd.s1 U_2 = U_2, w
1441 (p10) br.cond.spnt TAN_SMALL_R ;;
1447 (p11) br.cond.sptk TAN_NORMAL_R ;;
1455 // c is complete here
1456 // Argument reduction ends here.
1458 (p9) extr.u i_1 = N_fix_gr, 0, 1 ;;
1459 (p9) cmp.eq.unc p11, p12 = 0x0000,i_1 ;;
1465 // Is i_1 even or odd?
1466 // if i_1 == 0, set p11, else set p12.
1468 (p11) fmpy.s1 rsq = r, Z
1474 (p12) frcpa.s1 S_hi,p0 = f1, r
1479 // Case 1: Branch to SMALL_R or NORMAL_R.
1480 // Case 1 is done now.
1484 (p9) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
1485 (p9) fsub.s1 c = s_val, U_1
1491 (p9) ld8 table_ptr1 = [table_ptr1]
1498 (p9) add table_ptr1 = 224, table_ptr1 ;;
1499 (p9) ldfe P1_1 = [table_ptr1],144
1503 // Get [i_1] - lsb of N_fix_gr .
1504 // Load P1_1 and point to Q1_1 .
1508 (p9) ldfe Q1_1 = [table_ptr1] , 0
1510 // N even: rsq = r * Z
1511 // N odd: S_hi = frcpa(r)
1513 (p12) fmerge.ns S_hi = S_hi, S_hi
1523 (p9) fsub.s1 c = c, U_2
1529 (p12) fma.s1 poly1 = S_hi, r, f1
1536 // N odd: Change sign of S_hi
1538 (p11) fmpy.s1 rsq = rsq, P1_1
1544 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
1551 // N even: rsq = rsq * P1_1
1552 // N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary
1554 (p11) fma.s1 Result = r, rsq, c
1561 // N even: Result = c + r * rsq
1562 // N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary
1564 (p12) fma.s1 poly1 = S_hi, r, f1
1571 // N even: Result = Result + r
1572 // N odd: poly1 = 1.0 + S_hi * r 32 bits partial
1574 (p11) fadd.s0 Result = r, Result
1580 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
1587 // N even: Result1 = Result + r
1588 // N odd: S_hi = S_hi * poly1 + S_hi 32 bits
1590 (p12) fma.s1 poly1 = S_hi, r, f1
1597 // N odd: poly1 = S_hi * r + 1.0 64 bits partial
1599 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
1606 // N odd: poly1 = S_hi * poly + 1.0 64 bits
1608 (p12) fma.s1 poly1 = S_hi, r, f1
1615 // N odd: poly1 = S_hi * r + 1.0
1617 (p12) fma.s1 poly1 = S_hi, c, poly1
1624 // N odd: poly1 = S_hi * c + poly1
1626 (p12) fmpy.s1 S_lo = S_hi, poly1
1633 // N odd: S_lo = S_hi * poly1
1635 (p12) fma.s1 S_lo = Q1_1, r, S_lo
1642 // N odd: Result = S_hi + S_lo
1644 (p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
1651 // N odd: S_lo = S_lo + Q1_1 * r
1653 (p12) fadd.s0 Result = S_hi, S_lo
1654 (p0) br.ret.sptk b0 ;;
1661 (p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
1663 (p0) fmpy.s1 N_0 = Arg, Inv_P_0
1668 // ARGUMENT REDUCTION CODE - CASE 3 and 4
1671 // Adjust table_ptr1 to beginning of table.
1672 // N_0 = Arg * Inv_P_0
1677 (p0) ld8 table_ptr1 = [table_ptr1]
1685 (p0) add table_ptr1 = 8, table_ptr1 ;;
1689 (p0) ldfs TWO_TO_NEG14 = [table_ptr1], 4
1697 (p0) ldfs NEGTWO_TO_NEG14 = [table_ptr1], 180 ;;
1699 // N_0_fix = integer part of N_0 .
1700 // Adjust table_ptr1 to beginning of table.
1702 (p0) ldfs TWO_TO_NEG2 = [table_ptr1], 4
1706 // Make N_0 the integer part.
1710 (p0) ldfs NEGTWO_TO_NEG2 = [table_ptr1]
1714 (p0) fcvt.fx.s1 N_0_fix = N_0
1720 (p0) fcvt.xf N_0 = N_0_fix
1726 (p0) fnma.s1 ArgPrime = N_0, P_0, Arg
1732 (p0) fmpy.s1 w = N_0, d_1
1739 // ArgPrime = -N_0 * P_0 + Arg
1742 (p0) fmpy.s1 N = ArgPrime, two_by_PI
1749 // N = ArgPrime * 2/pi
1751 (p0) fcvt.fx.s1 N_fix = N
1758 // N_fix is the integer part.
1760 (p0) fcvt.xf N = N_fix
1765 (p0) getf.sig N_fix_gr = N_fix
1773 // N is the integer part of the reduced-reduced argument.
1774 // Put the integer in a GP register.
1776 (p0) fnma.s1 s_val = N, P_1, ArgPrime
1782 (p0) fnma.s1 w = N, P_2, w
1789 // s_val = -N*P_1 + ArgPrime
1792 (p0) fcmp.lt.unc.s1 p11, p10 = s_val, TWO_TO_NEG14
1798 (p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
1805 // Case 3: r = s_val + w (Z complete)
1806 // Case 4: U_hi = N_0 * d_1
1808 (p10) fmpy.s1 V_hi = N, P_2
1814 (p11) fmpy.s1 U_hi = N_0, d_1
1821 // Case 3: r = s_val + w (Z complete)
1822 // Case 4: U_hi = N_0 * d_1
1824 (p11) fmpy.s1 V_hi = N, P_2
1830 (p11) fmpy.s1 U_hi = N_0, d_1
1837 // Decide between case 3 and 4:
1838 // Case 3: s <= -2**(-14) or s >= 2**(-14)
1839 // Case 4: -2**(-14) < s < 2**(-14)
1841 (p10) fadd.s1 r = s_val, w
1847 (p11) fmpy.s1 w = N, P_3
1854 // Case 4: We need abs of both U_hi and V_hi - dont
1855 // worry about switched sign of V_hi .
1857 (p11) fsub.s1 A = U_hi, V_hi
1864 // Case 4: A = U_hi + V_hi
1865 // Note: Worry about switched sign of V_hi, so subtract instead of add.
1867 (p11) fnma.s1 V_lo = N, P_2, V_hi
1873 (p11) fms.s1 U_lo = N_0, d_1, U_hi
1879 (p11) fabs V_hiabs = V_hi
1886 // Case 4: V_hi = N * P_2
1888 // Note the product does not include the (-) as in the writeup
1889 // so (-) missing for V_hi and w .
1890 (p10) fadd.s1 r = s_val, w
1897 // Case 3: c = s_val - r
1898 // Case 4: U_lo = N_0 * d_1 - U_hi
1900 (p11) fabs U_hiabs = U_hi
1906 (p11) fmpy.s1 w = N, P_3
1913 // Case 4: Set P_12 if U_hiabs >= V_hiabs
1915 (p11) fadd.s1 C_hi = s_val, A
1922 // Case 4: C_hi = s_val + A
1924 (p11) fadd.s1 t = U_lo, V_lo
1931 // Case 3: Is |r| < 2**(-2), if so set PR_7
1933 // Case 3: If PR_7 is set, prepare to branch to Small_R.
1934 // Case 3: If PR_8 is set, prepare to branch to Normal_R.
1936 (p10) fsub.s1 c = s_val, r
1943 // Case 3: c = (s - r) + w (c complete)
1945 (p11) fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs
1951 (p11) fms.s1 w = N_0, d_2, w
1958 // Case 4: V_hi = N * P_2
1960 // Note the product does not include the (-) as in the writeup
1961 // so (-) missing for V_hi and w .
1963 (p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2
1969 (p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2
1976 // Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
1977 // Note: the (-) is still missing for V_hi .
1978 // Case 4: w = w + N_0 * d_2
1979 // Note: the (-) is now incorporated in w .
1981 (p10) fadd.s1 c = c, w
1983 // Case 4: t = U_lo + V_lo
1984 // Note: remember V_lo should be (-), subtract instead of add. NO
1986 (p14) br.cond.spnt TAN_SMALL_R ;;
1992 (p15) br.cond.spnt TAN_NORMAL_R ;;
1998 // Case 3: Vector off when |r| < 2**(-2). Recall that PR_3 will be true.
1999 // The remaining stuff is for Case 4.
2001 (p12) fsub.s1 a = U_hi, A
2002 (p11) extr.u i_1 = N_fix_gr, 0, 1 ;;
2008 // Case 4: C_lo = s_val - C_hi
2010 (p11) fadd.s1 t = t, w
2016 (p13) fadd.s1 a = V_hi, A
2021 // Case 4: a = U_hi - A
2022 // a = V_hi - A (do an add to account for missing (-) on V_hi
2026 (p11) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
2027 (p11) fsub.s1 C_lo = s_val, C_hi
2033 (p11) ld8 table_ptr1 = [table_ptr1]
2040 // Case 4: a = (U_hi - A) + V_hi
2041 // a = (V_hi - A) + U_hi
2042 // In each case account for negative missing form V_hi .
2045 // Case 4: C_lo = (s_val - C_hi) + A
2049 (p11) add table_ptr1 = 224, table_ptr1 ;;
2050 (p11) ldfe P1_1 = [table_ptr1], 16
2055 (p11) ldfe P1_2 = [table_ptr1], 128
2057 // Case 4: w = U_lo + V_lo + w
2059 (p12) fsub.s1 a = a, V_hi
2063 // Case 4: r = C_hi + C_lo
2067 (p11) ldfe Q1_1 = [table_ptr1], 16
2068 (p11) fadd.s1 C_lo = C_lo, A
2072 // Case 4: c = C_hi - r
2073 // Get [i_1] - lsb of N_fix_gr.
2077 (p11) ldfe Q1_2 = [table_ptr1], 16
2084 (p13) fsub.s1 a = U_hi, a
2090 (p11) fadd.s1 t = t, a
2097 // Case 4: t = t + a
2099 (p11) fadd.s1 C_lo = C_lo, t
2106 // Case 4: C_lo = C_lo + t
2108 (p11) fadd.s1 r = C_hi, C_lo
2114 (p11) fsub.s1 c = C_hi, r
2121 // Case 4: c = c + C_lo finished.
2122 // Is i_1 even or odd?
2123 // if i_1 == 0, set PR_4, else set PR_5.
2125 // r and c have been computed.
2126 // We known whether this is the sine or cosine routine.
2127 // Make sure ftz mode is set - should be automatic when using wre
2128 (p0) fmpy.s1 rsq = r, r
2134 (p11) fadd.s1 c = c , C_lo
2135 (p11) cmp.eq.unc p11, p12 = 0x0000, i_1 ;;
2140 (p12) frcpa.s1 S_hi, p0 = f1, r
2147 // N odd: Change sign of S_hi
2149 (p11) fma.s1 Result = rsq, P1_2, P1_1
2155 (p12) fma.s1 P = rsq, Q1_2, Q1_1
2162 // N odd: Result = S_hi + S_lo (User supplied rounding mode for C1)
2164 (p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
2171 // N even: rsq = r * r
2172 // N odd: S_hi = frcpa(r)
2174 (p12) fmerge.ns S_hi = S_hi, S_hi
2181 // N even: rsq = rsq * P1_2 + P1_1
2182 // N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary
2184 (p11) fmpy.s1 Result = rsq, Result
2190 (p12) fma.s1 poly1 = S_hi, r,f1
2197 // N even: Result = Result * rsq
2198 // N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary
2200 (p11) fma.s1 Result = r, Result, c
2206 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2213 // N odd: S_hi = S_hi * poly1 + S_hi 32 bits
2215 (p11) fadd.s0 Result= r, Result
2221 (p12) fma.s1 poly1 = S_hi, r, f1
2228 // N even: Result = Result * r + c
2229 // N odd: poly1 = 1.0 + S_hi * r 32 bits partial
2231 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2237 (p12) fma.s1 poly1 = S_hi, r, f1
2244 // N even: Result1 = Result + r (Rounding mode S0)
2245 // N odd: poly1 = S_hi * r + 1.0 64 bits partial
2247 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2254 // N odd: poly1 = S_hi * poly + S_hi 64 bits
2256 (p12) fma.s1 poly1 = S_hi, r, f1
2263 // N odd: poly1 = S_hi * r + 1.0
2265 (p12) fma.s1 poly1 = S_hi, c, poly1
2272 // N odd: poly1 = S_hi * c + poly1
2274 (p12) fmpy.s1 S_lo = S_hi, poly1
2281 // N odd: S_lo = S_hi * poly1
2283 (p12) fma.s1 S_lo = P, r, S_lo
2290 // N odd: S_lo = S_lo + r * P
2292 (p12) fadd.s0 Result = S_hi, S_lo
2293 (p0) br.ret.sptk b0 ;;
2301 (p0) extr.u i_1 = N_fix_gr, 0, 1 ;;
2302 (p0) cmp.eq.unc p11, p12 = 0x0000, i_1
2307 (p0) fmpy.s1 rsq = r, r
2313 (p12) frcpa.s1 S_hi, p0 = f1, r
2318 (p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
2325 (p0) ld8 table_ptr1 = [table_ptr1]
2331 // *****************************************************************
2332 // *****************************************************************
2333 // *****************************************************************
2336 (p0) add table_ptr1 = 224, table_ptr1 ;;
2337 (p0) ldfe P1_1 = [table_ptr1], 16
2340 // r and c have been computed.
2341 // We known whether this is the sine or cosine routine.
2342 // Make sure ftz mode is set - should be automatic when using wre
2346 (p0) ldfe P1_2 = [table_ptr1], 16
2347 (p11) fmpy.s1 r_to_the_8 = rsq, rsq
2351 // Set table_ptr1 to beginning of constant table.
2352 // Get [i_1] - lsb of N_fix_gr.
2356 (p0) ldfe P1_3 = [table_ptr1], 96
2358 // N even: rsq = r * r
2359 // N odd: S_hi = frcpa(r)
2361 (p12) fmerge.ns S_hi = S_hi, S_hi
2365 // Is i_1 even or odd?
2366 // if i_1 == 0, set PR_11.
2367 // if i_1 != 0, set PR_12.
2371 (p11) ldfe P1_9 = [table_ptr1], -16
2373 // N even: Poly2 = P1_7 + Poly2 * rsq
2374 // N odd: poly2 = Q1_5 + poly2 * rsq
2376 (p11) fadd.s1 CORR = rsq, f1
2381 (p11) ldfe P1_8 = [table_ptr1], -16 ;;
2383 // N even: Poly1 = P1_2 + P1_3 * rsq
2384 // N odd: poly1 = 1.0 + S_hi * r
2385 // 16 bits partial account for necessary (-1)
2387 (p11) ldfe P1_7 = [table_ptr1], -16
2391 // N even: Poly1 = P1_1 + Poly1 * rsq
2392 // N odd: S_hi = S_hi + S_hi * poly1) 16 bits account for necessary
2396 (p11) ldfe P1_6 = [table_ptr1], -16
2398 // N even: Poly2 = P1_5 + Poly2 * rsq
2399 // N odd: poly2 = Q1_3 + poly2 * rsq
2401 (p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8
2405 // N even: Poly1 = Poly1 * rsq
2406 // N odd: poly1 = 1.0 + S_hi * r 32 bits partial
2410 (p11) ldfe P1_5 = [table_ptr1], -16
2411 (p12) fma.s1 poly1 = S_hi, r, f1
2415 // N even: CORR = CORR * c
2416 // N odd: S_hi = S_hi * poly1 + S_hi 32 bits
2420 // N even: Poly2 = P1_6 + Poly2 * rsq
2421 // N odd: poly2 = Q1_4 + poly2 * rsq
2424 (p0) addl table_ptr2 = @ltoff(TAN_BASE_CONSTANTS), gp
2425 (p11) ldfe P1_4 = [table_ptr1], -16
2426 (p11) fmpy.s1 CORR = CORR, c
2432 (p0) ld8 table_ptr2 = [table_ptr2]
2440 (p0) add table_ptr2 = 464, table_ptr2
2447 (p11) fma.s1 Poly1 = P1_3, rsq, P1_2
2452 (p0) ldfe Q1_7 = [table_ptr2], -16
2453 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2458 (p0) ldfe Q1_6 = [table_ptr2], -16
2459 (p11) fma.s1 Poly2 = P1_9, rsq, P1_8
2464 (p0) ldfe Q1_5 = [table_ptr2], -16 ;;
2465 (p12) ldfe Q1_4 = [table_ptr2], -16
2470 (p12) ldfe Q1_3 = [table_ptr2], -16
2472 // N even: Poly2 = P1_8 + P1_9 * rsq
2473 // N odd: poly2 = Q1_6 + Q1_7 * rsq
2475 (p11) fma.s1 Poly1 = Poly1, rsq, P1_1
2480 (p12) ldfe Q1_2 = [table_ptr2], -16
2481 (p12) fma.s1 poly1 = S_hi, r, f1
2486 (p12) ldfe Q1_1 = [table_ptr2], -16
2487 (p11) fma.s1 Poly2 = Poly2, rsq, P1_7
2494 // N even: CORR = rsq + 1
2495 // N even: r_to_the_8 = rsq * rsq
2497 (p11) fmpy.s1 Poly1 = Poly1, rsq
2503 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2509 (p12) fma.s1 poly2 = Q1_7, rsq, Q1_6
2515 (p11) fma.s1 Poly2 = Poly2, rsq, P1_6
2521 (p12) fma.s1 poly1 = S_hi, r, f1
2527 (p12) fma.s1 poly2 = poly2, rsq, Q1_5
2533 (p11) fma.s1 Poly2= Poly2, rsq, P1_5
2539 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2545 (p12) fma.s1 poly2 = poly2, rsq, Q1_4
2552 // N even: r_to_the_8 = r_to_the_8 * r_to_the_8
2553 // N odd: poly1 = S_hi * r + 1.0 64 bits partial
2555 (p11) fma.s1 Poly2 = Poly2, rsq, P1_4
2562 // N even: Result = CORR + Poly * r
2563 // N odd: P = Q1_1 + poly2 * rsq
2565 (p12) fma.s1 poly1 = S_hi, r, f1
2571 (p12) fma.s1 poly2 = poly2, rsq, Q1_3
2578 // N even: Poly2 = P1_4 + Poly2 * rsq
2579 // N odd: poly2 = Q1_2 + poly2 * rsq
2581 (p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1
2587 (p12) fma.s1 poly1 = S_hi, c, poly1
2593 (p12) fma.s1 poly2 = poly2, rsq, Q1_2
2600 // N even: Poly = Poly1 + Poly2 * r_to_the_8
2601 // N odd: S_hi = S_hi * poly1 + S_hi 64 bits
2603 (p11) fma.s1 Result = Poly, r, CORR
2610 // N even: Result = r + Result (User supplied rounding mode)
2611 // N odd: poly1 = S_hi * c + poly1
2613 (p12) fmpy.s1 S_lo = S_hi, poly1
2619 (p12) fma.s1 P = poly2, rsq, Q1_1
2626 // N odd: poly1 = S_hi * r + 1.0
2628 (p11) fadd.s0 Result = Result, r
2635 // N odd: S_lo = S_hi * poly1
2637 (p12) fma.s1 S_lo = Q1_1, c, S_lo
2644 // N odd: Result = Result + S_hi (user supplied rounding mode)
2646 (p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
2653 // N odd: S_lo = Q1_1 * c + S_lo
2655 (p12) fma.s1 Result = P, r, S_lo
2662 // N odd: Result = S_lo + r * P
2664 (p12) fadd.s0 Result = Result, S_hi
2665 (p0) br.ret.sptk b0 ;;
2672 (p0) getf.sig sig_r = r
2673 // *******************************************************************
2674 // *******************************************************************
2675 // *******************************************************************
2677 // r and c have been computed.
2678 // Make sure ftz mode is set - should be automatic when using wre
2681 // Get [i_1] - lsb of N_fix_gr alone.
2683 (p0) fmerge.s Pos_r = f1, r
2684 (p0) extr.u i_1 = N_fix_gr, 0, 1 ;;
2689 (p0) fmerge.s sgn_r = r, f1
2690 (p0) cmp.eq.unc p11, p12 = 0x0000, i_1 ;;
2696 (p0) extr.u lookup = sig_r, 58, 5
2701 (p0) movl Create_B = 0x8200000000000000 ;;
2705 (p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
2707 (p0) dep Create_B = lookup, Create_B, 58, 5
2712 // Get [i_1] - lsb of N_fix_gr alone.
2718 ld8 table_ptr1 = [table_ptr1]
2727 (p0) setf.sig B = Create_B
2729 // Set table_ptr1 and table_ptr2 to base address of
2732 (p0) add table_ptr1 = 480, table_ptr1 ;;
2738 // Is i_1 or i_0 == 0 ?
2739 // Create the constant 1 00000 1000000000000000000000...
2741 (p0) ldfe P2_1 = [table_ptr1], 16
2747 (p0) getf.exp exp_r = Pos_r
2752 // Get r's significand
2756 (p0) ldfe P2_2 = [table_ptr1], 16 ;;
2758 // Get the 5 bits or r for the lookup. 1.xxxxx ....
2760 // Grab lsb of exp of B
2762 (p0) ldfe P2_3 = [table_ptr1], 16
2768 (p0) andcm table_offset = 0x0001, exp_r ;;
2769 (p0) shl table_offset = table_offset, 9 ;;
2775 // Deposit 0 00000 1000000000000000000000... on
2776 // 1 xxxxx yyyyyyyyyyyyyyyyyyyyyy...,
2777 // getting rid of the ys.
2778 // Is B = 2** -2 or B= 2** -1? If 2**-1, then
2779 // we want an offset of 512 for table addressing.
2781 (p0) shladd table_offset = lookup, 4, table_offset ;;
2783 // B = ........ 1xxxxx 1000000000000000000...
2785 (p0) add table_ptr1 = table_ptr1, table_offset ;;
2791 // B = ........ 1xxxxx 1000000000000000000...
2792 // Convert B so it has the same exponent as Pos_r
2794 (p0) ldfd T_hi = [table_ptr1], 8
2805 (p0) addl table_ptr2 = @ltoff(TAN_BASE_CONSTANTS), gp
2806 (p0) ldfs T_lo = [table_ptr1]
2807 (p0) fmerge.se B = Pos_r, B
2812 ld8 table_ptr2 = [table_ptr2]
2819 (p0) add table_ptr2 = 1360, table_ptr2
2821 (p0) add table_ptr2 = table_ptr2, table_offset ;;
2825 (p0) ldfd C_hi = [table_ptr2], 8
2826 (p0) fsub.s1 x = Pos_r, B
2831 (p0) ldfs C_lo = [table_ptr2],255
2835 // N even: Tx = T_hi * x
2837 // Load C_lo - increment pointer to get SC_inv
2838 // - cant get all the way, do an add later.
2840 (p0) add table_ptr2 = 569, table_ptr2 ;;
2843 // N even: Tx1 = Tx + 1
2844 // N odd: Cx1 = 1 - Cx
2848 (p0) ldfe SC_inv = [table_ptr2], 0
2855 (p0) fmpy.s1 xsq = x, x
2861 (p11) fmpy.s1 Tx = T_hi, x
2867 (p12) fmpy.s1 Cx = C_hi, x
2874 // N odd: Cx = C_hi * x
2876 (p0) fma.s1 P = P2_3, xsq, P2_2
2883 // N even and odd: P = P2_3 + P2_2 * xsq
2885 (p11) fadd.s1 Tx1 = Tx, f1
2892 // N even: D = C_hi - tanx
2893 // N odd: D = T_hi + tanx
2895 (p11) fmpy.s1 CORR = SC_inv, T_hi
2901 (p0) fmpy.s1 Sx = SC_inv, x
2907 (p12) fmpy.s1 CORR = SC_inv, C_hi
2913 (p12) fsub.s1 V_hi = f1, Cx
2919 (p0) fma.s1 P = P, xsq, P2_1
2926 // N even and odd: P = P2_1 + P * xsq
2928 (p11) fma.s1 V_hi = Tx, Tx1, f1
2935 // N even: Result = sgn_r * tail + T_hi (user rounding mode for C1)
2936 // N odd: Result = sgn_r * tail + C_hi (user rounding mode for C1)
2938 (p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
2944 (p0) fmpy.s1 CORR = CORR, c
2950 (p12) fnma.s1 V_hi = Cx,V_hi,f1
2957 // N even: V_hi = Tx * Tx1 + 1
2958 // N odd: Cx1 = 1 - Cx * Cx1
2960 (p0) fmpy.s1 P = P, xsq
2967 // N even and odd: P = P * xsq
2969 (p11) fmpy.s1 V_hi = V_hi, T_hi
2976 // N even and odd: tail = P * tail + V_lo
2978 (p11) fmpy.s1 T_hi = sgn_r, T_hi
2984 (p0) fmpy.s1 CORR = CORR, sgn_r
2990 (p12) fmpy.s1 V_hi = V_hi,C_hi
2997 // N even: V_hi = T_hi * V_hi
2998 // N odd: V_hi = C_hi * V_hi
3000 (p0) fma.s1 tanx = P, x, x
3006 (p12) fnmpy.s1 C_hi = sgn_r, C_hi
3013 // N even: V_lo = 1 - V_hi + C_hi
3014 // N odd: V_lo = 1 - V_hi + T_hi
3016 (p11) fadd.s1 CORR = CORR, T_lo
3022 (p12) fsub.s1 CORR = CORR, C_lo
3029 // N even and odd: tanx = x + x * P
3030 // N even and odd: Sx = SC_inv * x
3032 (p11) fsub.s1 D = C_hi, tanx
3038 (p12) fadd.s1 D = T_hi, tanx
3045 // N odd: CORR = SC_inv * C_hi
3046 // N even: CORR = SC_inv * T_hi
3048 (p0) fnma.s1 D = V_hi, D, f1
3055 // N even and odd: D = 1 - V_hi * D
3056 // N even and odd: CORR = CORR * c
3058 (p0) fma.s1 V_hi = V_hi, D, V_hi
3065 // N even and odd: V_hi = V_hi + V_hi * D
3066 // N even and odd: CORR = sgn_r * CORR
3068 (p11) fnma.s1 V_lo = V_hi, C_hi, f1
3074 (p12) fnma.s1 V_lo = V_hi, T_hi, f1
3081 // N even: CORR = COOR + T_lo
3082 // N odd: CORR = CORR - C_lo
3084 (p11) fma.s1 V_lo = tanx, V_hi, V_lo
3090 (p12) fnma.s1 V_lo = tanx, V_hi, V_lo
3097 // N even: V_lo = V_lo + V_hi * tanx
3098 // N odd: V_lo = V_lo - V_hi * tanx
3100 (p11) fnma.s1 V_lo = C_lo, V_hi, V_lo
3106 (p12) fnma.s1 V_lo = T_lo, V_hi, V_lo
3113 // N even: V_lo = V_lo - V_hi * C_lo
3114 // N odd: V_lo = V_lo - V_hi * T_lo
3116 (p0) fmpy.s1 V_lo = V_hi, V_lo
3123 // N even and odd: V_lo = V_lo * V_hi
3125 (p0) fadd.s1 tail = V_hi, V_lo
3132 // N even and odd: tail = V_hi + V_lo
3134 (p0) fma.s1 tail = tail, P, V_lo
3141 // N even: T_hi = sgn_r * T_hi
3142 // N odd : C_hi = -sgn_r * C_hi
3144 (p0) fma.s1 tail = tail, Sx, CORR
3151 // N even and odd: tail = Sx * tail + CORR
3153 (p0) fma.s1 tail = V_hi, Sx, tail
3160 // N even an odd: tail = Sx * V_hi + tail
3162 (p11) fma.s0 Result = sgn_r, tail, T_hi
3168 (p12) fma.s0 Result = sgn_r, tail, C_hi
3169 (p0) br.ret.sptk b0 ;;
3173 ASM_SIZE_DIRECTIVE(__libm_tan)
3177 // *******************************************************************
3178 // *******************************************************************
3179 // *******************************************************************
3181 // Special Code to handle very large argument case.
3182 // Call int pi_by_2_reduce(&x,&r)
3183 // for |arguments| >= 2**63
3184 // (Arg or x) is in f8
3185 // Address to save r and c as double
3187 // (1) (2) (3) (call) (4)
3188 // sp -> + psp -> + psp -> + sp -> +
3190 // | r50 ->| <- r50 f0 ->| r50 -> | -> c
3192 // sp-32 -> | <- r50 f0 ->| f0 ->| <- r50 r49 -> | -> r
3194 // | r49 ->| <- r49 Arg ->| <- r49 | -> x
3196 // sp -64 ->| sp -64 ->| sp -64 ->| |
3198 // save pfs save b0 restore gp
3199 // save gp restore b0
3204 .proc __libm_callout
3210 add GR_Parameter_r =-32,sp // Parameter: r address
3212 .save ar.pfs,GR_SAVE_PFS
3213 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
3217 add sp=-64,sp // Create new stack
3219 mov GR_SAVE_GP=gp // Save gp
3224 stfe [GR_Parameter_r ] = f0,16 // Clear Parameter r on stack
3225 add GR_Parameter_X = 16,sp // Parameter x address
3226 .save b0, GR_SAVE_B0
3227 mov GR_SAVE_B0=b0 // Save b0
3233 stfe [GR_Parameter_r ] = f0,-16 // Clear Parameter c on stack
3238 stfe [GR_Parameter_X] = Arg // Store Parameter x on stack
3240 (p0) br.call.sptk b0=__libm_pi_by_2_reduce#
3247 mov gp = GR_SAVE_GP // Restore gp
3248 (p0) mov N_fix_gr = r8
3254 (p0) ldfe Arg =[GR_Parameter_X],16
3255 (p0) ldfs TWO_TO_NEG2 = [table_ptr2],4
3262 (p0) ldfe r =[GR_Parameter_r ],16
3263 (p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],4
3268 (p0) ldfe c =[GR_Parameter_r ]
3278 (p0) fcmp.lt.unc.s1 p6, p0 = r, TWO_TO_NEG2
3279 mov b0 = GR_SAVE_B0 // Restore return address
3285 (p6) fcmp.gt.unc.s1 p6, p0 = r, NEGTWO_TO_NEG2
3286 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
3292 add sp = 64,sp // Restore stack pointer
3293 (p6) br.cond.spnt TAN_SMALL_R
3294 (p0) br.cond.sptk TAN_NORMAL_R
3297 .endp __libm_callout
3298 ASM_SIZE_DIRECTIVE(__libm_callout)
3301 .proc __libm_TAN_SPECIAL
3305 // Code for NaNs, Unsupporteds, Infs, or +/- zero ?
3306 // Invalid raised for Infs and SNaNs.
3311 (p0) fmpy.s0 Arg = Arg, f0
3314 .endp __libm_TAN_SPECIAL
3315 ASM_SIZE_DIRECTIVE(__libm_TAN_SPECIAL)
3318 .type __libm_pi_by_2_reduce#,@function
3319 .global __libm_pi_by_2_reduce#