3 // Copyright (c) 2000, 2001, Intel Corporation
4 // All rights reserved.
6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
11 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
12 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
13 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
14 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
15 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
16 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
17 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
18 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
19 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
20 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
21 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 // Intel Corporation is the author of this code, and requests that all
24 // problem reports or change requests be submitted to it directly at
25 // http://developer.intel.com/opensource.
28 // *********************************************************************
31 // 2/02/00 (hand-optimized)
32 // 4/04/00 Unwind support added
33 // 8/15/00 Bundle added after call to __libm_error_support to properly
34 // set [the previously overwritten] GR_Parameter_RESULT.
36 // *********************************************************************
38 // Function: atanl(x) = inverse tangent(x), for double extended x values
39 // Function: atan2l(y,x) = atan(y/x), for double extended x values
41 // *********************************************************************
45 // Floating-Point Registers: f8 (Input and Return Value)
49 // General Purpose Registers:
51 // r49,r50,r51,r52 (Arguments to error support for 0,0 case)
53 // Predicate Registers: p6-p15
55 // *********************************************************************
57 // IEEE Special Conditions:
59 // Denormal fault raised on denormal inputs
60 // Underflow exceptions may occur
61 // Special error handling for the y=0 and x=0 case
62 // Inexact raised when appropriate by algorithm
66 // atanl(+/-0) = +/- 0
67 // atanl(+/-Inf) = +/-pi/2
69 // atan2l(Any NaN for x or y) = QNaN
70 // atan2l(+/-0,x) = +/-0 for x > 0
71 // atan2l(+/-0,x) = +/-pi for x < 0
72 // atan2l(+/-0,+0) = +/-0
73 // atan2l(+/-0,-0) = +/-pi
74 // atan2l(y,+/-0) = pi/2 y > 0
75 // atan2l(y,+/-0) = -pi/2 y < 0
76 // atan2l(+/-y, Inf) = +/-0 for finite y > 0
77 // atan2l(+/-Inf, x) = +/-pi/2 for finite x
78 // atan2l(+/-y, -Inf) = +/-pi for finite y > 0
79 // atan2l(+/-Inf, Inf) = +/-pi/4
80 // atan2l(+/-Inf, -Inf) = +/-3pi/4
82 // *********************************************************************
84 // Mathematical Description
85 // ---------------------------
87 // The function ATANL( Arg_Y, Arg_X ) returns the "argument"
88 // or the "phase" of the complex number
92 // or equivalently, the angle in radians from the positive
93 // x-axis to the line joining the origin and the point
102 // \ angle between is ATANL(Arg_Y,Arg_X)
108 // ------------------> X-axis
112 // Moreover, this angle is reported in the range [-pi,pi] thus
114 // -pi <= ATANL( Arg_Y, Arg_X ) <= pi.
116 // From the geometry, it is easy to define ATANL when one of
117 // Arg_X or Arg_Y is +-0 or +-inf:
121 // X \ | +0 | -0 | +inf | -inf | finite non-zero
123 // ______________________________________________________
125 // +-0 | Invalid/ | pi/2 | -pi/2 | sign(Y)*pi/2
127 // --------------------------------------------------------
129 // +inf | +0 | -0 | pi/4 | -pi/4 | sign(Y)*0
130 // --------------------------------------------------------
132 // -inf | +pi | -pi | 3pi/4 | -3pi/4 | sign(Y)*pi
133 // --------------------------------------------------------
134 // finite | X>0? | pi/2 | -pi/2 | normal case
135 // non-zero| sign(Y)*0: | | |
136 // | sign(Y)*pi | | |
139 // One must take note that ATANL is NOT the arctangent of the
140 // value Arg_Y/Arg_X; but rather ATANL and arctan are related
141 // in a slightly more complicated way as follows:
143 // Let U := max(|Arg_X|, |Arg_Y|); V := min(|Arg_X|, |Arg_Y|);
144 // sign_X be the sign bit of Arg_X, i.e., sign_X is 0 or 1;
145 // s_X be the sign of Arg_X, i.e., s_X = (-1)^sign_X;
147 // sign_Y be the sign bit of Arg_Y, i.e., sign_Y is 0 or 1;
148 // s_Y be the sign of Arg_Y, i.e., s_Y = (-1)^sign_Y;
150 // swap be 0 if |Arg_X| >= |Arg_Y| and 1 otherwise.
152 // Then, ATANL(Arg_Y, Arg_X) =
154 // / arctan(V/U) \ sign_X = 0 & swap = 0
155 // | pi/2 - arctan(V/U) | sign_X = 0 & swap = 1
157 // | pi - arctan(V/U) | sign_X = 1 & swap = 0
158 // \ pi/2 + arctan(V/U) / sign_X = 1 & swap = 1
161 // This relationship also suggest that the algorithm's major
162 // task is to calculate arctan(V/U) for 0 < V <= U; and the
163 // final Result is given by
165 // s_Y * { (P_hi + P_lo) + sigma * arctan(V/U) }
169 // (P_hi,P_lo) represents M(sign_X,swap)*(pi/2) accurately
171 // M(sign_X,swap) = 0 for sign_X = 0 and swap = 0
173 // 2 for sign_X = 1 and swap = 0
177 // sigma = { (sign_X XOR swap) : -1.0 : 1.0 }
179 // = (-1) ^ ( sign_X XOR swap )
181 // Both (P_hi,P_lo) and sigma can be stored in a table and fetched
182 // using (sign_X,swap) as an index. (P_hi, P_lo) can be stored as a
183 // double-precision, and single-precision pair; and sigma can
184 // obviously be just a single-precision number.
186 // In the algorithm we propose, arctan(V/U) is calculated to high accuracy
187 // as A_hi + A_lo. Consequently, the Result ATANL( Arg_Y, Arg_X ) is
190 // s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
192 // We now discuss the calculation of arctan(V/U) for 0 < V <= U.
194 // For (V/U) < 2^(-3), we use a simple polynomial of the form
196 // z + z^3*(P_1 + z^2*(P_2 + z^2*(P_3 + ... + P_8)))
200 // For the sake of accuracy, the first term "z" must approximate V/U to
201 // extra precision. For z^3 and higher power, a working precision
202 // approximation to V/U suffices. Thus, we obtain:
204 // z_hi + z_lo = V/U to extra precision and
205 // z = V/U to working precision
207 // The value arctan(V/U) is delivered as two pieces (A_hi, A_lo)
209 // (A_hi,A_lo) = (z_hi, z^3*(P_1 + ... + P_8) + z_lo).
212 // For 2^(-3) <= (V/U) <= 1, we use a table-driven approach.
215 // (V/U) = 2^k * 1.b_1 b_2 .... b_63 b_64 b_65 ....
219 // z_hi = 2^k * 1.b_1 b_2 b_3 b_4 1
225 // arctan(V/U) = arctan(z_hi) + acrtan| -------------- |
226 // | 1 + (V/U)*z_hi |
232 // = arctan(z_hi) + acrtan| -------------- |
236 // = arctan(z_hi) + acrtan( V' / U' )
241 // V' = V - U*z_hi; U' = U + V*z_hi.
245 // w_hi + w_lo = V'/U' to extra precision and
246 // w = V'/U' to working precision
248 // then we can approximate arctan(V'/U') by
250 // arctan(V'/U') = w_hi + w_lo
251 // + w^3*(Q_1 + w^2*(Q_2 + w^2*(Q_3 + w^2*Q_4)))
253 // = w_hi + w_lo + poly
255 // Finally, arctan(z_hi) is calculated beforehand and stored in a table
256 // as Tbl_hi, Tbl_lo. Thus,
258 // (A_hi, A_lo) = (Tbl_hi, w_hi+(poly+(w_lo+Tbl_lo)))
260 // This completes the mathematical description.
266 // Step 0. Check for unsupported format.
269 // ( expo(Arg_X) not zero AND msb(Arg_X) = 0 ) OR
270 // ( expo(Arg_Y) not zero AND msb(Arg_Y) = 0 )
272 // then one of the arguments is unsupported. Generate an
273 // invalid and return qNaN.
275 // Step 1. Initialize
277 // Normalize Arg_X and Arg_Y and set the following
279 // sign_X := sign_bit(Arg_X)
280 // s_Y := (sign_bit(Arg_Y)==0? 1.0 : -1.0)
281 // swap := (|Arg_X| >= |Arg_Y|? 0 : 1 )
282 // U := max( |Arg_X|, |Arg_Y| )
283 // V := min( |Arg_X|, |Arg_Y| )
285 // execute: frcap E, pred, V, U
286 // If pred is 0, go to Step 5 for special cases handling.
288 // Step 2. Decide on branch.
291 // If Q < 2^(-3) go to Step 4 for simple polynomial case.
293 // Step 3. Table-driven algorithm.
295 // Q is represented as
297 // 2^(-k) * 1.b_1 b_2 b_3 ... b_63; k = 0,-1,-2,-3
299 // and that if k = 0, b_1 = b_2 = b_3 = b_4 = 0.
303 // z_hi := 2^(-k) * 1.b_1 b_2 b_3 b_4 1
305 // (note that there are 49 possible values of z_hi).
307 // ...We now calculate V' and U'. While V' is representable
308 // ...as a 64-bit number because of cancellation, U' is
309 // ...not in general a 64-bit number. Obtaining U' accurately
310 // ...requires two working precision numbers
312 // U_prime_hi := U + V * z_hi ...WP approx. to U'
313 // U_prime_lo := ( U - U_prime_hi ) + V*z_hi ...observe order
314 // V_prime := V - U * z_hi ...this is exact
316 // C_hi := frcpa (1.0, U_prime_hi) ...C_hi approx 1/U'_hi
319 // C_hi := C_hi + C_hi*(1.0 - C_hi*U_prime_hi)
321 // ...at this point C_hi is (1/U_prime_hi) to roughly 64 bits
323 // w_hi := V_prime * C_hi ...w_hi is V_prime/U_prime to
324 // ...roughly working precision
326 // ...note that we want w_hi + w_lo to approximate
327 // ...V_prime/(U_prime_hi + U_prime_lo) to extra precision
328 // ...but for now, w_hi is good enough for the polynomial
332 // poly := w_hi*wsq*(Q_1 + wsq*(Q_2 + wsq*(Q_3 + wsq*Q_4)))
335 // (Tbl_hi, Tbl_lo) = atan(z_hi) indexed by (k,b_1,b_2,b_3,b_4)
336 // ...Tbl_hi is a double-precision number
337 // ...Tbl_lo is a single-precision number
339 // (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo)
340 // ...as discussed previous. Again; the implementation can
341 // ...chose to fetch P_hi and P_lo from a table indexed by
342 // ...(sign_X, swap).
343 // ...P_hi is a double-precision number;
344 // ...P_lo is a single-precision number.
346 // ...calculate w_lo so that w_hi + w_lo is V'/U' accurately
347 // w_lo := ((V_prime - w_hi*U_prime_hi) -
348 // w_hi*U_prime_lo) * C_hi ...observe order
351 // ...Ready to deliver arctan(V'/U') as A_hi, A_lo
353 // A_lo := w_hi + (poly + (Tbl_lo + w_lo)) ...observe order
355 // ...Deliver final Result
356 // ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
358 // sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 )
359 // ...sigma can be obtained by a table lookup using
360 // ...(sign_X,swap) as index and stored as single precision
361 // ...sigma should be calculated earlier
366 // Res_hi := P_hi + sigma*A_hi ...this is exact because
367 // ...both P_hi and Tbl_hi
368 // ...are double-precision
369 // ...and |Tbl_hi| > 2^(-4)
370 // ...P_hi is either 0 or
373 // Res_lo := sigma*A_lo + P_lo
375 // Return Res_hi + s_Y*Res_lo in user-defined rounding control
377 // Step 4. Simple polynomial case.
379 // ...E and Q are inherited from Step 2.
381 // A_hi := Q ...Q is inherited from Step 2 Q approx V/U
384 // E := E + E2(1.0 - E*U1
385 // ...at this point E approximates 1/U to roughly working precision
387 // z := V * E ...z approximates V/U to roughly working precision
389 // z8 := zsq * zsq; z8 := z8 * z8
391 // poly1 := P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
392 // poly2 := zsq*(P_1 + zsq*(P_2 + zsq*P_3))
394 // poly := poly1 + z8*poly2
396 // z_lo := (V - A_hi*U)*E
398 // A_lo := z*poly + z_lo
399 // ...A_hi, A_lo approximate arctan(V/U) accurately
401 // (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo)
402 // ...one can store the M(sign_X,swap) as single precision
405 // ...Deliver final Result
406 // ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
408 // sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 )
409 // ...sigma can be obtained by a table lookup using
410 // ...(sign_X,swap) as index and stored as single precision
411 // ...sigma should be calculated earlier
416 // Res_hi := P_hi + sigma*A_hi ...need to compute
417 // ...P_hi + sigma*A_hi
420 // tmp := (P_hi - Res_hi) + sigma*A_hi
422 // Res_lo := s_Y*(sigma*A_lo + P_lo) + tmp
424 // Return Res_hi + Res_lo in user-defined rounding control
426 // Step 5. Special Cases
428 // If pred is 0 where pred is obtained in
429 // frcap E, pred, V, U
431 // we are in one of those special cases of 0,+-inf or NaN
433 // If one of U and V is NaN, return U+V (which will generate
434 // invalid in case one is a signaling NaN). Otherwise,
435 // return the Result as described in the table
440 // X \ | +0 | -0 | +inf | -inf | finite non-zero
442 // ______________________________________________________
444 // +-0 | Invalid/ | pi/2 | -pi/2 | sign(Y)*pi/2
446 // --------------------------------------------------------
448 // +inf | +0 | -0 | pi/4 | -pi/4 | sign(Y)*0
449 // --------------------------------------------------------
451 // -inf | +pi | -pi | 3pi/4 | -3pi/4 | sign(Y)*pi
452 // --------------------------------------------------------
453 // finite | X>0? | pi/2 | -pi/2 |
454 // non-zero| sign(Y)*0: | | | N/A
455 // | sign(Y)*pi | | |
459 #include "libm_support.h"
538 GR_Parameter_RESULT = r51
539 GR_Parameter_TAG = r52
550 ASM_TYPE_DIRECTIVE(Constants_atan,@object)
551 data4 0x54442D18, 0x3FF921FB, 0x248D3132, 0x3E000000
552 // double pi/2, single lo_pi/2, two**(-3)
553 data4 0xAAAAAAA3, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // P_1
554 data4 0xCCCC54B2, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // P_2
555 data4 0x47E4D0C2, 0x92492492, 0x0000BFFC, 0x00000000 // P_3
556 data4 0x58870889, 0xE38E38E0, 0x00003FFB, 0x00000000 // P_4
557 data4 0x290149F8, 0xBA2E895B, 0x0000BFFB, 0x00000000 // P_5
558 data4 0x250F733D, 0x9D88E6D4, 0x00003FFB, 0x00000000 // P_6
559 data4 0xFB8745A0, 0x884E51FF, 0x0000BFFB, 0x00000000 // P_7
560 data4 0x394396BD, 0xE1C7412B, 0x00003FFA, 0x00000000 // P_8
561 data4 0xAAAAA52F, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // Q_1
562 data4 0xC75B60D3, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // Q_2
563 data4 0x011F1940, 0x924923AD, 0x0000BFFC, 0x00000000 // Q_3
564 data4 0x2A5F89BD, 0xE36F716D, 0x00003FFB, 0x00000000 // Q_4
566 // Entries Tbl_hi (double precision)
567 // B = 1+Index/16+1/32 Index = 0
568 // Entries Tbl_lo (single precision)
569 // B = 1+Index/16+1/32 Index = 0
571 data4 0xA935BD8E, 0x3FE9A000, 0x23ACA08F, 0x00000000
573 // Entries Tbl_hi (double precision) Index = 0,1,...,15
574 // B = 2^(-1)*(1+Index/16+1/32)
575 // Entries Tbl_lo (single precision)
576 // Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32)
578 data4 0x7F175A34, 0x3FDE77EB, 0x238729EE, 0x00000000
579 data4 0x73C1A40B, 0x3FE0039C, 0x249334DB, 0x00000000
580 data4 0x5B5B43DA, 0x3FE0C614, 0x22CBA7D1, 0x00000000
581 data4 0x88BE7C13, 0x3FE1835A, 0x246310E7, 0x00000000
582 data4 0xE2CC9E6A, 0x3FE23B71, 0x236210E5, 0x00000000
583 data4 0x8406CBCA, 0x3FE2EE62, 0x2462EAF5, 0x00000000
584 data4 0x1CD41719, 0x3FE39C39, 0x24B73EF3, 0x00000000
585 data4 0x5B795B55, 0x3FE44506, 0x24C11260, 0x00000000
586 data4 0x5BB6EC04, 0x3FE4E8DE, 0x242519EE, 0x00000000
587 data4 0x1F732FBA, 0x3FE587D8, 0x24D4346C, 0x00000000
588 data4 0x115D7B8D, 0x3FE6220D, 0x24ED487B, 0x00000000
589 data4 0x920B3D98, 0x3FE6B798, 0x2495FF1E, 0x00000000
590 data4 0x8FBA8E0F, 0x3FE74897, 0x223D9531, 0x00000000
591 data4 0x289FA093, 0x3FE7D528, 0x242B0411, 0x00000000
592 data4 0x576CC2C5, 0x3FE85D69, 0x2335B374, 0x00000000
593 data4 0xA99CC05D, 0x3FE8E17A, 0x24C27CFB, 0x00000000
595 // Entries Tbl_hi (double precision) Index = 0,1,...,15
596 // B = 2^(-2)*(1+Index/16+1/32)
597 // Entries Tbl_lo (single precision)
598 // Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32)
600 data4 0x510665B5, 0x3FD025FA, 0x24263482, 0x00000000
601 data4 0x362431C9, 0x3FD1151A, 0x242C8DC9, 0x00000000
602 data4 0x67E47C95, 0x3FD20255, 0x245CF9BA, 0x00000000
603 data4 0x7A823CFE, 0x3FD2ED98, 0x235C892C, 0x00000000
604 data4 0x29271134, 0x3FD3D6D1, 0x2389BE52, 0x00000000
605 data4 0x586890E6, 0x3FD4BDEE, 0x24436471, 0x00000000
606 data4 0x175E0F4E, 0x3FD5A2E0, 0x2389DBD4, 0x00000000
607 data4 0x9F5FA6FD, 0x3FD68597, 0x2476D43F, 0x00000000
608 data4 0x52817501, 0x3FD76607, 0x24711774, 0x00000000
609 data4 0xB8DF95D7, 0x3FD84422, 0x23EBB501, 0x00000000
610 data4 0x7CD0C662, 0x3FD91FDE, 0x23883A0C, 0x00000000
611 data4 0x66168001, 0x3FD9F930, 0x240DF63F, 0x00000000
612 data4 0x5422058B, 0x3FDAD00F, 0x23FE261A, 0x00000000
613 data4 0x378624A5, 0x3FDBA473, 0x23A8CD0E, 0x00000000
614 data4 0x0AAD71F8, 0x3FDC7655, 0x2422D1D0, 0x00000000
615 data4 0xC9EC862B, 0x3FDD45AE, 0x2344A109, 0x00000000
617 // Entries Tbl_hi (double precision) Index = 0,1,...,15
618 // B = 2^(-3)*(1+Index/16+1/32)
619 // Entries Tbl_lo (single precision)
620 // Index = 0,1,...,15 B = 2^(-3)*(1+Index/16+1/32)
622 data4 0x84212B3D, 0x3FC068D5, 0x239874B6, 0x00000000
623 data4 0x41060850, 0x3FC16465, 0x2335E774, 0x00000000
624 data4 0x171A535C, 0x3FC25F6E, 0x233E36BE, 0x00000000
625 data4 0xEDEB99A3, 0x3FC359E8, 0x239680A3, 0x00000000
626 data4 0xC6092A9E, 0x3FC453CE, 0x230FB29E, 0x00000000
627 data4 0xBA11570A, 0x3FC54D18, 0x230C1418, 0x00000000
628 data4 0xFFB3AA73, 0x3FC645BF, 0x23F0564A, 0x00000000
629 data4 0xE8A7D201, 0x3FC73DBD, 0x23D4A5E1, 0x00000000
630 data4 0xE398EBC7, 0x3FC8350B, 0x23D4ADDA, 0x00000000
631 data4 0x7D050271, 0x3FC92BA3, 0x23BCB085, 0x00000000
632 data4 0x601081A5, 0x3FCA217E, 0x23BC841D, 0x00000000
633 data4 0x574D780B, 0x3FCB1696, 0x23CF4A8E, 0x00000000
634 data4 0x4D768466, 0x3FCC0AE5, 0x23BECC90, 0x00000000
635 data4 0x4E1D5395, 0x3FCCFE65, 0x2323DCD2, 0x00000000
636 data4 0x864C9D9D, 0x3FCDF110, 0x23F53F3A, 0x00000000
637 data4 0x451D980C, 0x3FCEE2E1, 0x23CCB11F, 0x00000000
639 data4 0x54442D18, 0x400921FB, 0x33145C07, 0x3CA1A626 // PI two doubles
640 data4 0x54442D18, 0x3FF921FB, 0x33145C07, 0x3C91A626 // PI_by_2 two dbles
641 data4 0x54442D18, 0x3FE921FB, 0x33145C07, 0x3C81A626 // PI_by_4 two dbles
642 data4 0x7F3321D2, 0x4002D97C, 0x4C9E8A0A, 0x3C9A7939 // 3PI_by_4 two dbles
643 ASM_SIZE_DIRECTIVE(Constants_atan)
654 (p0) mov ArgX_orig = f1
655 (p0) br.cond.sptk atan2l ;;
658 ASM_SIZE_DIRECTIVE(atanl)
666 .proc __ieee754_atan2l#
667 .global __ieee754_atan2l#
678 alloc r32 = ar.pfs, 0, 17 , 4, 0
679 (p0) mov ArgY = ArgY_orig
683 (p0) mov ArgX = ArgX_orig
688 (p0) fclass.m.unc p7,p0 = ArgY_orig, 0x103
695 // Save original input args and load table ptr.
697 (p0) fclass.m.unc p6,p0 = ArgX_orig, 0x103
701 (p0) addl table_ptr1 = @ltoff(Constants_atan#), gp
702 (p0) fclass.m.unc p0,p9 = ArgY_orig, 0x1FF
706 ld8 table_ptr1 = [table_ptr1]
707 (p0) fclass.m.unc p0,p8 = ArgX_orig, 0x1FF
712 (p0) fclass.m.unc p13,p0 = ArgY_orig, 0x0C3
716 (p0) fclass.m.unc p12,p0 = ArgX_orig, 0x0C3
722 // Check for NatVals.
723 // Check for everything - if false, then must be pseudo-zero
724 // or pseudo-nan (IA unsupporteds).
729 (p6) br.cond.spnt L(ATANL_NATVAL) ;;
735 (p7) br.cond.spnt L(ATANL_NATVAL) ;;
738 (p0) ldfd P_hi = [table_ptr1],8
740 (p8) br.cond.spnt L(ATANL_UNSUPPORTED) ;;
743 (p0) add table_ptr2 = 96, table_ptr1
744 (p9) br.cond.spnt L(ATANL_UNSUPPORTED)
746 // Load double precision high-order part of pi
748 (p12) br.cond.spnt L(ATANL_NAN) ;;
752 (p0) fnorm.s1 ArgX = ArgX
753 (p13) br.cond.spnt L(ATANL_NAN) ;;
756 // Normalize the input argument.
757 // Branch out if NaN inputs
760 (p0) ldfs P_lo = [table_ptr1], 4
762 (p0) fnorm.s1 ArgY = ArgY ;;
766 (p0) ldfs TWO_TO_NEG3 = [table_ptr1], 180
768 // U = max(ArgX_abs,ArgY_abs)
769 // V = min(ArgX_abs,ArgY_abs)
778 // Get exp and sign of ArgX
779 // Get exp and sign of ArgY
780 // Load 2**(-3) and increment ptr to Q_4.
782 (p0) fmerge.s ArgX_abs = f1, ArgX
786 // load single precision low-order part of pi = P_lo
789 (p0) getf.exp sign_X = ArgX
790 (p0) fmerge.s ArgY_abs = f1, ArgY
794 (p0) getf.exp sign_Y = ArgY
796 (p0) shr sign_X = sign_X, 17 ;;
800 (p0) shr sign_Y = sign_Y, 17 ;;
801 (p0) cmp.eq.unc p8, p9 = 0x00000, sign_Y ;;
806 // Is ArgX_abs >= ArgY_abs
809 (p0) fmax.s1 U = ArgX_abs, ArgY_abs
817 // sign_X is sign bit of ArgX
818 // sign_Y is sign bit of ArgY
820 (p0) fcmp.ge.s1 p6, p7 = ArgX_abs, ArgY_abs
825 (p0) fmin.s1 V = ArgX_abs, ArgY_abs
830 (p8) fadd.s1 s_Y = f0, f1
831 (p6) cmp.eq.unc p10, p11 = 0x00000, sign_X
834 (p6) add swap = r0, r0
836 (p7) add swap = 1, r0
845 (p10) fsub.s1 M = M, f1
850 (p9) fsub.s1 s_Y = f0, f1
855 (p0) frcpa.s1 E, p6 = V, U
863 (p6) br.cond.sptk L(ATANL_STEP2)
864 (p0) br.cond.spnt L(ATANL_SPECIAL_HANDLING) ;;
869 (p0) fmpy.s1 Q = E, V
874 (p0) fcmp.eq.s0 p0, p9 = f1, ArgY_orig
882 (p0) fcmp.eq.s0 p0, p8 = f1, ArgX_orig
887 (p11) fadd.s1 M = M, f1
892 // *************************************************
893 // ********************* STEP2 *********************
894 // *************************************************
895 (p0) movl special = 0x8400000000000000
900 // lookup = b_1 b_2 b_3 B_4
902 (p0) movl special1 = 0x0000000000000100 ;;
907 // Do fnorms to raise any denormal operand
910 (p0) fmpy.s1 P_hi = M, P_hi
915 (p0) fmpy.s1 P_lo = M, P_lo
923 (p0) fcmp.lt.unc.s1 p6, p7 = Q, TWO_TO_NEG3
927 (p0) getf.sig significand_Q = Q
928 (p0) getf.exp exponent_Q = Q
933 (p0) andcm k = 0x0003, exponent_Q
934 (p0) extr.u lookup = significand_Q, 59, 4 ;;
938 (p0) dep special = lookup, special, 59, 4
940 // Generate 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
942 (p6) br.cond.spnt L(ATANL_POLY) ;;
945 (p0) cmp.eq.unc p8, p9 = 0x0000, k
946 (p0) fmpy.s1 P_hi = s_Y, P_hi
948 // We waited a few extra cycles so P_lo and P_hi could be calculated.
949 // Load the constant 256 for loading up table entries.
951 // *************************************************
952 // ******************** STEP3 **********************
953 // *************************************************
954 (p0) add table_ptr2 = 16, table_ptr1
957 // Let z_hi have exponent and sign of original Q
958 // Load the Tbl_hi(0) else, increment pointer.
961 (p0) ldfe Q_4 = [table_ptr1], -16
962 (p0) xor swap = sign_X, swap ;;
963 (p9) sub k = k, r0, 1
966 (p0) setf.sig z_hi = special
967 (p0) ldfe Q_3 = [table_ptr1], -16
968 (p9) add table_ptr2 = 16, table_ptr2 ;;
971 // U_hold = U - U_prime_hi
972 // k = k * 256 - Result can be 0, 256, or 512.
975 (p0) ldfe Q_2 = [table_ptr1], -16
976 (p8) ldfd Tbl_hi = [table_ptr2], 8
980 // U_prime_lo = U_hold + V * z_hi
981 // lookup -> lookup * 16 + k
984 (p0) ldfe Q_1 = [table_ptr1], -16 ;;
985 (p8) ldfs Tbl_lo = [table_ptr2], 8
987 // U_prime_hi = U + V * z_hi
988 // Load the Tbl_lo(0)
990 (p9) pmpy2.r k = k, special1 ;;
1010 (p9) shladd lookup = lookup, 0x0004, k ;;
1013 (p9) add table_ptr2 = table_ptr2, lookup ;;
1015 // V_prime = V - U * z_hi
1017 (p9) ldfd Tbl_hi = [table_ptr2], 8
1023 // C_hi = frcpa(1,U_prime_hi)
1025 (p9) ldfs Tbl_lo = [table_ptr2], 8
1027 // z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
1028 // Point to beginning of Tbl_hi entries - k = 0.
1030 (p0) fmerge.se z_hi = Q, z_hi ;;
1034 (p0) fma.s1 U_prime_hi = V, z_hi, U
1039 (p0) fnma.s1 V_prime = U, z_hi, V
1044 (p0) mov A_hi = Tbl_hi
1049 (p0) fsub.s1 U_hold = U, U_prime_hi
1054 (p0) frcpa.s1 C_hi, p6 = f1, U_prime_hi
1058 (p0) cmp.eq.unc p7, p6 = 0x00000, swap
1059 (p0) fmpy.s1 A_hi = s_Y, A_hi
1065 // poly = wsq * poly
1067 (p7) fadd.s1 sigma = f0, f1
1072 (p0) fma.s1 U_prime_lo = z_hi, V, U_hold
1077 (p6) fsub.s1 sigma = f0, f1
1082 (p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1088 // A_lo = A_lo + w_hi
1089 // A_hi = s_Y * A_hi
1091 (p0) fma.s1 Res_hi = sigma, A_hi, P_hi
1097 // C_hi_hold = 1 - C_hi * U_prime_hi (1)
1099 (p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi
1105 // C_hi = C_hi + C_hi * C_hi_hold (1)
1107 (p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1113 // C_hi_hold = 1 - C_hi * U_prime_hi (2)
1115 (p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi
1121 // C_hi = C_hi + C_hi * C_hi_hold (2)
1123 (p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1129 // C_hi_hold = 1 - C_hi * U_prime_hi (3)
1131 (p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi
1137 // C_hi = C_hi + C_hi * C_hi_hold (3)
1139 (p0) fmpy.s1 w_hi = V_prime, C_hi
1145 // w_hi = V_prime * C_hi
1147 (p0) fmpy.s1 wsq = w_hi, w_hi
1152 (p0) fnma.s1 w_lo = w_hi, U_prime_hi, V_prime
1158 // wsq = w_hi * w_hi
1159 // w_lo = = V_prime - w_hi * U_prime_hi
1161 (p0) fma.s1 poly = wsq, Q_4, Q_3
1166 (p0) fnma.s1 w_lo = w_hi, U_prime_lo, w_lo
1172 // poly = Q_3 + wsq * Q_4
1173 // w_lo = = w_lo - w_hi * U_prime_lo
1175 (p0) fma.s1 poly = wsq, poly, Q_2
1180 (p0) fmpy.s1 w_lo = C_hi, w_lo
1186 // poly = Q_2 + wsq * poly
1187 // w_lo = = w_lo * C_hi
1189 (p0) fma.s1 poly = wsq, poly, Q_1
1194 (p0) fadd.s1 A_lo = Tbl_lo, w_lo
1200 // Result = Res_hi + Res_lo * s_Y (User Supplied Rounding Mode)
1202 (p0) fmpy.s0 Q_1 = Q_1, Q_1
1208 // poly = Q_1 + wsq * poly
1209 // A_lo = Tbl_lo + w_lo
1210 // swap = xor(swap,sign_X)
1212 (p0) fmpy.s1 poly = wsq, poly
1219 // poly = wsq * poly
1222 (p0) fmpy.s1 poly = w_hi, poly
1228 // if (PR_1) sigma = -1.0
1229 // if (PR_2) sigma = 1.0
1231 (p0) fadd.s1 A_lo = A_lo, poly
1237 // P_hi = s_Y * P_hi
1238 // A_lo = A_lo + poly
1240 (p0) fadd.s1 A_lo = A_lo, w_hi
1245 (p0) fma.s1 Res_lo = sigma, A_lo, P_lo
1251 // Res_hi = P_hi + sigma * A_hi
1252 // Res_lo = P_lo + sigma * A_lo
1254 (p0) fma.s0 Result = Res_lo, s_Y, Res_hi
1261 // poly1 = P_5 + zsq * poly1
1262 // poly2 = zsq * poly2
1266 (p0) xor swap = sign_X, swap
1268 (p0) fnma.s1 E_hold = E, U, f1 ;;
1274 // poly1 = P_4 + zsq * poly1
1275 // swap = xor(swap,sign_X)
1279 // poly1 = poly1 <== Done with poly1
1280 // poly1 = P_4 + zsq * poly1
1281 // swap = xor(swap,sign_X)
1283 (p0) cmp.eq.unc p7, p6 = 0x00000, swap
1287 (p0) fmpy.s1 P_hi = s_Y, P_hi
1292 (p6) fsub.s1 sigma = f0, f1
1297 (p7) fadd.s1 sigma = f0, f1
1301 // ***********************************************
1302 // ******************** STEP4 ********************
1303 // ***********************************************
1307 (p0) addl table_ptr1 = @ltoff(Constants_atan#), gp
1313 ld8 table_ptr1 = [table_ptr1]
1322 (p0) fma.s1 E = E, E_hold, E
1325 // Iterate 3 times E = E + E*(1.0 - E*U)
1326 // Also load P_8, P_7, P_6, P_5, P_4
1327 // E_hold = 1.0 - E * U (1)
1330 (p0) add table_ptr1 = 128, table_ptr1 ;;
1335 // E = E + E_hold*E (1)
1338 (p0) ldfe P_8 = [table_ptr1], -16
1340 // poly = z8*poly1 + poly2 (Typo in writeup)
1343 (p0) fnma.s1 z_lo = A_temp, U, V ;;
1348 // E_hold = 1.0 - E * U (2)
1350 (p0) ldfe P_7 = [table_ptr1], -16
1356 // E = E + E_hold*E (2)
1358 (p0) ldfe P_6 = [table_ptr1], -16
1364 // E_hold = 1.0 - E * U (3)
1366 (p0) ldfe P_5 = [table_ptr1], -16
1372 // E = E + E_hold*E (3)
1375 // At this point E approximates 1/U to roughly working precision
1376 // z = V*E approximates V/U
1378 (p0) ldfe P_4 = [table_ptr1], -16
1379 (p0) fnma.s1 E_hold = E, U, f1 ;;
1386 (p0) ldfe P_3 = [table_ptr1], -16
1394 (p0) ldfe P_2 = [table_ptr1], -16
1402 (p0) ldfe P_1 = [table_ptr1], -16
1407 (p0) movl int_temp = 0x24005
1411 (p0) fma.s1 E = E, E_hold, E
1416 (p0) fnma.s1 E_hold = E, U, f1
1421 (p0) fma.s1 E = E, E_hold, E
1426 (p0) fmpy.s1 Z = V, E
1432 // z_lo = V - A_temp * U
1433 // if (PR_2) sigma = 1.0
1435 (p0) fmpy.s1 z_lo = z_lo, E
1440 (p0) fmpy.s1 zsq = Z, Z
1447 // if (PR_1) sigma = -1.0
1449 (p0) fadd.s1 A_hi = A_temp, z_lo
1458 // Now what we want to do is
1459 // poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
1460 // poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3))
1462 (p0) fma.s1 poly1 = zsq, P_8, P_7
1467 (p0) fma.s1 poly2 = zsq, P_3, P_2
1472 (p0) fmpy.s1 z8 = zsq, zsq
1477 (p0) fsub.s1 A_temp = A_temp, A_hi
1483 // A_lo = Z * poly + z_lo
1485 (p0) fmerge.s tmp = A_hi, A_hi
1491 // poly1 = P_7 + zsq * P_8
1492 // poly2 = P_2 + zsq * P_3
1494 (p0) fma.s1 poly1 = zsq, poly1, P_6
1499 (p0) fma.s1 poly2 = zsq, poly2, P_1
1504 (p0) fmpy.s1 z8 = z8, z8
1509 (p0) fadd.s1 z_lo = A_temp, z_lo
1515 // poly1 = P_6 + zsq * poly1
1516 // poly2 = P_2 + zsq * poly2
1518 (p0) fma.s1 poly1 = zsq, poly1, P_5
1523 (p0) fmpy.s1 poly2 = poly2, zsq
1529 // Result = Res_hi + Res_lo (User Supplied Rounding Mode)
1531 (p0) fmpy.s1 P_5 = P_5, P_5
1536 (p0) fma.s1 poly1 = zsq, poly1, P_4
1541 (p0) fma.s1 poly = z8, poly1, poly2
1547 // Fixup added to force inexact later -
1548 // A_hi = A_temp + z_lo
1549 // z_lo = (A_temp - A_hi) + z_lo
1551 (p0) fma.s1 A_lo = Z, poly, z_lo
1556 (p0) fadd.s1 A_hi = tmp, A_lo
1561 (p0) fsub.s1 tmp = tmp, A_hi
1566 (p0) fmpy.s1 A_hi = s_Y, A_hi
1571 (p0) fadd.s1 A_lo = tmp, A_lo
1575 (p0) setf.exp tmp = int_temp
1577 // P_hi = s_Y * P_hi
1578 // A_hi = s_Y * A_hi
1580 (p0) fma.s1 Res_hi = sigma, A_hi, P_hi
1585 (p0) fclass.m.unc p6,p0 = A_lo, 0x007
1596 // Res_hi = P_hi + sigma * A_hi
1598 (p0) fsub.s1 tmp = P_hi, Res_hi
1604 // tmp = P_hi - Res_hi
1606 (p0) fma.s1 tmp = A_hi, sigma, tmp
1611 (p0) fma.s1 sigma = A_lo, sigma, P_lo
1617 // tmp = sigma * A_hi + tmp
1618 // sigma = A_lo * sigma + P_lo
1620 (p0) fma.s1 Res_lo = s_Y, sigma, tmp
1626 // Res_lo = s_Y * sigma + tmp
1628 (p0) fadd.s0 Result = Res_lo, Res_hi
1632 L(ATANL_UNSUPPORTED):
1636 (p0) fmpy.s0 Result = ArgX,ArgY
1637 (p0) br.ret.sptk b0 ;;
1639 L(ATANL_SPECIAL_HANDLING):
1642 (p0) fcmp.eq.s0 p0, p6 = f1, ArgY_orig
1647 (p0) fcmp.eq.s0 p0, p5 = f1, ArgX_orig
1652 (p0) fclass.m.unc p6, p7 = ArgY, 0x007
1657 (p0) movl special = 992
1664 (p0) addl table_ptr1 = @ltoff(Constants_atan#), gp
1670 ld8 table_ptr1 = [table_ptr1]
1678 (p0) add table_ptr1 = table_ptr1, special
1680 (p7) br.cond.spnt L(ATANL_ArgY_Not_ZERO) ;;
1683 (p0) ldfd Result = [table_ptr1], 8
1685 (p6) fclass.m.unc p14, p0 = ArgX, 0x035 ;;
1689 (p0) ldfd Result_lo = [table_ptr1], -8
1690 (p6) fclass.m.unc p15, p0 = ArgX, 0x036 ;;
1694 (p14) fmerge.s Result = ArgY, f0
1699 (p6) fclass.m.unc p13, p0 = ArgX, 0x007
1704 (p14) fmerge.s Result_lo = ArgY, f0
1708 (p13) mov GR_Parameter_TAG = 36
1715 // Return sign_Y * 0 when ArgX > +0
1717 (p15) fmerge.s Result = ArgY, Result
1722 (p15) fmerge.s Result_lo = ArgY, Result_lo
1728 // Return sign_Y * 0 when ArgX < -0
1730 (p0) fadd.s0 Result = Result, Result_lo
1731 (p13) br.cond.spnt __libm_error_region ;;
1737 // Call error support funciton for atan(0,0)
1739 (p0) br.ret.sptk b0 ;;
1741 L(ATANL_ArgY_Not_ZERO):
1744 (p0) fclass.m.unc p9, p10 = ArgY, 0x023
1750 (p10) br.cond.spnt L(ATANL_ArgY_Not_INF) ;;
1754 (p9) fclass.m.unc p6, p0 = ArgX, 0x017
1759 (p9) fclass.m.unc p7, p0 = ArgX, 0x021
1764 (p9) fclass.m.unc p8, p0 = ArgX, 0x022
1768 (p6) add table_ptr1 = 16, table_ptr1 ;;
1769 (p0) ldfd Result = [table_ptr1], 8
1773 (p0) ldfd Result_lo = [table_ptr1], -8
1779 (p6) fmerge.s Result = ArgY, Result
1784 (p6) fmerge.s Result_lo = ArgY, Result_lo
1789 (p6) fadd.s0 Result = Result, Result_lo
1790 (p6) br.ret.sptk b0 ;;
1793 // Load PI/2 and adjust its sign.
1794 // Return +PI/2 when ArgY = +Inf and ArgX = +/-0 or normal
1795 // Return -PI/2 when ArgY = -Inf and ArgX = +/-0 or normal
1798 (p7) add table_ptr1 = 32, table_ptr1 ;;
1799 (p7) ldfd Result = [table_ptr1], 8
1803 (p7) ldfd Result_lo = [table_ptr1], -8
1809 (p7) fmerge.s Result = ArgY, Result
1814 (p7) fmerge.s Result_lo = ArgY, Result_lo
1819 (p7) fadd.s0 Result = Result, Result_lo
1820 (p7) br.ret.sptk b0 ;;
1823 // Load PI/4 and adjust its sign.
1824 // Return +PI/4 when ArgY = +Inf and ArgX = +Inf
1825 // Return -PI/4 when ArgY = -Inf and ArgX = +Inf
1828 (p8) add table_ptr1 = 48, table_ptr1 ;;
1829 (p8) ldfd Result = [table_ptr1], 8
1833 (p8) ldfd Result_lo = [table_ptr1], -8
1839 (p8) fmerge.s Result = ArgY, Result
1844 (p8) fmerge.s Result_lo = ArgY, Result_lo
1849 (p8) fadd.s0 Result = Result, Result_lo
1850 (p8) br.ret.sptk b0 ;;
1852 L(ATANL_ArgY_Not_INF):
1856 // Load PI/4 and adjust its sign.
1857 // Return +3PI/4 when ArgY = +Inf and ArgX = -Inf
1858 // Return -3PI/4 when ArgY = -Inf and ArgX = -Inf
1860 (p0) fclass.m.unc p6, p0 = ArgX, 0x007
1865 (p0) fclass.m.unc p7, p0 = ArgX, 0x021
1870 (p0) fclass.m.unc p8, p0 = ArgX, 0x022
1874 (p6) add table_ptr1 = 16, table_ptr1 ;;
1875 (p6) ldfd Result = [table_ptr1], 8
1879 (p6) ldfd Result_lo = [table_ptr1], -8
1885 (p6) fmerge.s Result = ArgY, Result
1890 (p6) fmerge.s Result_lo = ArgY, Result_lo
1895 (p6) fadd.s0 Result = Result, Result_lo
1896 (p6) br.ret.spnt b0 ;;
1901 // return = sign_Y * PI/2 when ArgX = 0
1903 (p7) fmerge.s Result = ArgY, f0
1908 (p7) fnorm.s0 Result = Result
1909 (p7) br.ret.spnt b0 ;;
1912 // return = sign_Y * 0 when ArgX = Inf
1915 (p8) ldfd Result = [table_ptr1], 8 ;;
1916 (p8) ldfd Result_lo = [table_ptr1], -8
1921 (p8) fmerge.s Result = ArgY, Result
1926 (p8) fmerge.s Result_lo = ArgY, Result_lo
1931 (p8) fadd.s0 Result = Result, Result_lo
1932 (p8) br.ret.sptk b0 ;;
1935 // return = sign_Y * PI when ArgX = -Inf
1938 ASM_SIZE_DIRECTIVE(atan2l)
1939 ASM_SIZE_DIRECTIVE(__atan2l)
1940 ASM_SIZE_DIRECTIVE(__ieee754_atan2l)
1942 .proc __libm_error_region
1943 __libm_error_region:
1946 add GR_Parameter_Y=-32,sp // Parameter 2 value
1948 .save ar.pfs,GR_SAVE_PFS
1949 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
1953 add sp=-64,sp // Create new stack
1955 mov GR_SAVE_GP=gp // Save gp
1958 stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
1959 add GR_Parameter_X = 16,sp // Parameter 1 address
1960 .save b0, GR_SAVE_B0
1961 mov GR_SAVE_B0=b0 // Save b0
1965 stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
1966 add GR_Parameter_RESULT = 0,GR_Parameter_Y
1967 nop.b 0 // Parameter 3 address
1970 stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
1971 add GR_Parameter_Y = -16,GR_Parameter_Y
1972 br.call.sptk b0=__libm_error_support# // Call error handling function
1977 add GR_Parameter_RESULT = 48,sp
1980 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
1982 add sp = 64,sp // Restore stack pointer
1983 mov b0 = GR_SAVE_B0 // Restore return address
1986 mov gp = GR_SAVE_GP // Restore gp
1987 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
1988 br.ret.sptk b0 // Return
1991 .endp __libm_error_region
1992 ASM_SIZE_DIRECTIVE(__libm_error_region)
1993 .type __libm_error_support#,@function
1994 .global __libm_error_support#