3 // Copyright (C) 2000, 2001, Intel Corporation
4 // All rights reserved.
6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 // Intel Corporation is the author of this code, and requests that all
37 // problem reports or change requests be submitted to it directly at
38 // http://developer.intel.com/opensource.
41 // *********************************************************************
44 // 2/02/00 (hand-optimized)
45 // 4/04/00 Unwind support added
46 // 8/15/00 Bundle added after call to __libm_error_support to properly
47 // set [the previously overwritten] GR_Parameter_RESULT.
49 // *********************************************************************
51 // Function: atanl(x) = inverse tangent(x), for double extended x values
52 // Function: atan2l(y,x) = atan(y/x), for double extended x values
54 // *********************************************************************
58 // Floating-Point Registers: f8 (Input and Return Value)
62 // General Purpose Registers:
64 // r49,r50,r51,r52 (Arguments to error support for 0,0 case)
66 // Predicate Registers: p6-p15
68 // *********************************************************************
70 // IEEE Special Conditions:
72 // Denormal fault raised on denormal inputs
73 // Underflow exceptions may occur
74 // Special error handling for the y=0 and x=0 case
75 // Inexact raised when appropriate by algorithm
79 // atanl(+/-0) = +/- 0
80 // atanl(+/-Inf) = +/-pi/2
82 // atan2l(Any NaN for x or y) = QNaN
83 // atan2l(+/-0,x) = +/-0 for x > 0
84 // atan2l(+/-0,x) = +/-pi for x < 0
85 // atan2l(+/-0,+0) = +/-0
86 // atan2l(+/-0,-0) = +/-pi
87 // atan2l(y,+/-0) = pi/2 y > 0
88 // atan2l(y,+/-0) = -pi/2 y < 0
89 // atan2l(+/-y, Inf) = +/-0 for finite y > 0
90 // atan2l(+/-Inf, x) = +/-pi/2 for finite x
91 // atan2l(+/-y, -Inf) = +/-pi for finite y > 0
92 // atan2l(+/-Inf, Inf) = +/-pi/4
93 // atan2l(+/-Inf, -Inf) = +/-3pi/4
95 // *********************************************************************
97 // Mathematical Description
98 // ---------------------------
100 // The function ATANL( Arg_Y, Arg_X ) returns the "argument"
101 // or the "phase" of the complex number
105 // or equivalently, the angle in radians from the positive
106 // x-axis to the line joining the origin and the point
115 // \ angle between is ATANL(Arg_Y,Arg_X)
121 // ------------------> X-axis
125 // Moreover, this angle is reported in the range [-pi,pi] thus
127 // -pi <= ATANL( Arg_Y, Arg_X ) <= pi.
129 // From the geometry, it is easy to define ATANL when one of
130 // Arg_X or Arg_Y is +-0 or +-inf:
134 // X \ | +0 | -0 | +inf | -inf | finite non-zero
136 // ______________________________________________________
138 // +-0 | Invalid/ | pi/2 | -pi/2 | sign(Y)*pi/2
140 // --------------------------------------------------------
142 // +inf | +0 | -0 | pi/4 | -pi/4 | sign(Y)*0
143 // --------------------------------------------------------
145 // -inf | +pi | -pi | 3pi/4 | -3pi/4 | sign(Y)*pi
146 // --------------------------------------------------------
147 // finite | X>0? | pi/2 | -pi/2 | normal case
148 // non-zero| sign(Y)*0: | | |
149 // | sign(Y)*pi | | |
152 // One must take note that ATANL is NOT the arctangent of the
153 // value Arg_Y/Arg_X; but rather ATANL and arctan are related
154 // in a slightly more complicated way as follows:
156 // Let U := max(|Arg_X|, |Arg_Y|); V := min(|Arg_X|, |Arg_Y|);
157 // sign_X be the sign bit of Arg_X, i.e., sign_X is 0 or 1;
158 // s_X be the sign of Arg_X, i.e., s_X = (-1)^sign_X;
160 // sign_Y be the sign bit of Arg_Y, i.e., sign_Y is 0 or 1;
161 // s_Y be the sign of Arg_Y, i.e., s_Y = (-1)^sign_Y;
163 // swap be 0 if |Arg_X| >= |Arg_Y| and 1 otherwise.
165 // Then, ATANL(Arg_Y, Arg_X) =
167 // / arctan(V/U) \ sign_X = 0 & swap = 0
168 // | pi/2 - arctan(V/U) | sign_X = 0 & swap = 1
170 // | pi - arctan(V/U) | sign_X = 1 & swap = 0
171 // \ pi/2 + arctan(V/U) / sign_X = 1 & swap = 1
174 // This relationship also suggest that the algorithm's major
175 // task is to calculate arctan(V/U) for 0 < V <= U; and the
176 // final Result is given by
178 // s_Y * { (P_hi + P_lo) + sigma * arctan(V/U) }
182 // (P_hi,P_lo) represents M(sign_X,swap)*(pi/2) accurately
184 // M(sign_X,swap) = 0 for sign_X = 0 and swap = 0
186 // 2 for sign_X = 1 and swap = 0
190 // sigma = { (sign_X XOR swap) : -1.0 : 1.0 }
192 // = (-1) ^ ( sign_X XOR swap )
194 // Both (P_hi,P_lo) and sigma can be stored in a table and fetched
195 // using (sign_X,swap) as an index. (P_hi, P_lo) can be stored as a
196 // double-precision, and single-precision pair; and sigma can
197 // obviously be just a single-precision number.
199 // In the algorithm we propose, arctan(V/U) is calculated to high accuracy
200 // as A_hi + A_lo. Consequently, the Result ATANL( Arg_Y, Arg_X ) is
203 // s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
205 // We now discuss the calculation of arctan(V/U) for 0 < V <= U.
207 // For (V/U) < 2^(-3), we use a simple polynomial of the form
209 // z + z^3*(P_1 + z^2*(P_2 + z^2*(P_3 + ... + P_8)))
213 // For the sake of accuracy, the first term "z" must approximate V/U to
214 // extra precision. For z^3 and higher power, a working precision
215 // approximation to V/U suffices. Thus, we obtain:
217 // z_hi + z_lo = V/U to extra precision and
218 // z = V/U to working precision
220 // The value arctan(V/U) is delivered as two pieces (A_hi, A_lo)
222 // (A_hi,A_lo) = (z_hi, z^3*(P_1 + ... + P_8) + z_lo).
225 // For 2^(-3) <= (V/U) <= 1, we use a table-driven approach.
228 // (V/U) = 2^k * 1.b_1 b_2 .... b_63 b_64 b_65 ....
232 // z_hi = 2^k * 1.b_1 b_2 b_3 b_4 1
238 // arctan(V/U) = arctan(z_hi) + acrtan| -------------- |
239 // | 1 + (V/U)*z_hi |
245 // = arctan(z_hi) + acrtan| -------------- |
249 // = arctan(z_hi) + acrtan( V' / U' )
254 // V' = V - U*z_hi; U' = U + V*z_hi.
258 // w_hi + w_lo = V'/U' to extra precision and
259 // w = V'/U' to working precision
261 // then we can approximate arctan(V'/U') by
263 // arctan(V'/U') = w_hi + w_lo
264 // + w^3*(Q_1 + w^2*(Q_2 + w^2*(Q_3 + w^2*Q_4)))
266 // = w_hi + w_lo + poly
268 // Finally, arctan(z_hi) is calculated beforehand and stored in a table
269 // as Tbl_hi, Tbl_lo. Thus,
271 // (A_hi, A_lo) = (Tbl_hi, w_hi+(poly+(w_lo+Tbl_lo)))
273 // This completes the mathematical description.
279 // Step 0. Check for unsupported format.
282 // ( expo(Arg_X) not zero AND msb(Arg_X) = 0 ) OR
283 // ( expo(Arg_Y) not zero AND msb(Arg_Y) = 0 )
285 // then one of the arguments is unsupported. Generate an
286 // invalid and return qNaN.
288 // Step 1. Initialize
290 // Normalize Arg_X and Arg_Y and set the following
292 // sign_X := sign_bit(Arg_X)
293 // s_Y := (sign_bit(Arg_Y)==0? 1.0 : -1.0)
294 // swap := (|Arg_X| >= |Arg_Y|? 0 : 1 )
295 // U := max( |Arg_X|, |Arg_Y| )
296 // V := min( |Arg_X|, |Arg_Y| )
298 // execute: frcap E, pred, V, U
299 // If pred is 0, go to Step 5 for special cases handling.
301 // Step 2. Decide on branch.
304 // If Q < 2^(-3) go to Step 4 for simple polynomial case.
306 // Step 3. Table-driven algorithm.
308 // Q is represented as
310 // 2^(-k) * 1.b_1 b_2 b_3 ... b_63; k = 0,-1,-2,-3
312 // and that if k = 0, b_1 = b_2 = b_3 = b_4 = 0.
316 // z_hi := 2^(-k) * 1.b_1 b_2 b_3 b_4 1
318 // (note that there are 49 possible values of z_hi).
320 // ...We now calculate V' and U'. While V' is representable
321 // ...as a 64-bit number because of cancellation, U' is
322 // ...not in general a 64-bit number. Obtaining U' accurately
323 // ...requires two working precision numbers
325 // U_prime_hi := U + V * z_hi ...WP approx. to U'
326 // U_prime_lo := ( U - U_prime_hi ) + V*z_hi ...observe order
327 // V_prime := V - U * z_hi ...this is exact
329 // C_hi := frcpa (1.0, U_prime_hi) ...C_hi approx 1/U'_hi
332 // C_hi := C_hi + C_hi*(1.0 - C_hi*U_prime_hi)
334 // ...at this point C_hi is (1/U_prime_hi) to roughly 64 bits
336 // w_hi := V_prime * C_hi ...w_hi is V_prime/U_prime to
337 // ...roughly working precision
339 // ...note that we want w_hi + w_lo to approximate
340 // ...V_prime/(U_prime_hi + U_prime_lo) to extra precision
341 // ...but for now, w_hi is good enough for the polynomial
345 // poly := w_hi*wsq*(Q_1 + wsq*(Q_2 + wsq*(Q_3 + wsq*Q_4)))
348 // (Tbl_hi, Tbl_lo) = atan(z_hi) indexed by (k,b_1,b_2,b_3,b_4)
349 // ...Tbl_hi is a double-precision number
350 // ...Tbl_lo is a single-precision number
352 // (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo)
353 // ...as discussed previous. Again; the implementation can
354 // ...chose to fetch P_hi and P_lo from a table indexed by
355 // ...(sign_X, swap).
356 // ...P_hi is a double-precision number;
357 // ...P_lo is a single-precision number.
359 // ...calculate w_lo so that w_hi + w_lo is V'/U' accurately
360 // w_lo := ((V_prime - w_hi*U_prime_hi) -
361 // w_hi*U_prime_lo) * C_hi ...observe order
364 // ...Ready to deliver arctan(V'/U') as A_hi, A_lo
366 // A_lo := w_hi + (poly + (Tbl_lo + w_lo)) ...observe order
368 // ...Deliver final Result
369 // ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
371 // sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 )
372 // ...sigma can be obtained by a table lookup using
373 // ...(sign_X,swap) as index and stored as single precision
374 // ...sigma should be calculated earlier
379 // Res_hi := P_hi + sigma*A_hi ...this is exact because
380 // ...both P_hi and Tbl_hi
381 // ...are double-precision
382 // ...and |Tbl_hi| > 2^(-4)
383 // ...P_hi is either 0 or
386 // Res_lo := sigma*A_lo + P_lo
388 // Return Res_hi + s_Y*Res_lo in user-defined rounding control
390 // Step 4. Simple polynomial case.
392 // ...E and Q are inherited from Step 2.
394 // A_hi := Q ...Q is inherited from Step 2 Q approx V/U
397 // E := E + E2(1.0 - E*U1
398 // ...at this point E approximates 1/U to roughly working precision
400 // z := V * E ...z approximates V/U to roughly working precision
402 // z8 := zsq * zsq; z8 := z8 * z8
404 // poly1 := P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
405 // poly2 := zsq*(P_1 + zsq*(P_2 + zsq*P_3))
407 // poly := poly1 + z8*poly2
409 // z_lo := (V - A_hi*U)*E
411 // A_lo := z*poly + z_lo
412 // ...A_hi, A_lo approximate arctan(V/U) accurately
414 // (P_hi, P_lo) := M(sign_X,swap)*(Pi_by_2_hi, Pi_by_2_lo)
415 // ...one can store the M(sign_X,swap) as single precision
418 // ...Deliver final Result
419 // ...s_Y*P_hi + s_Y*sigma*A_hi + s_Y*(sigma*A_lo + P_lo)
421 // sigma := ( (sign_X XOR swap) ? -1.0 : 1.0 )
422 // ...sigma can be obtained by a table lookup using
423 // ...(sign_X,swap) as index and stored as single precision
424 // ...sigma should be calculated earlier
429 // Res_hi := P_hi + sigma*A_hi ...need to compute
430 // ...P_hi + sigma*A_hi
433 // tmp := (P_hi - Res_hi) + sigma*A_hi
435 // Res_lo := s_Y*(sigma*A_lo + P_lo) + tmp
437 // Return Res_hi + Res_lo in user-defined rounding control
439 // Step 5. Special Cases
441 // If pred is 0 where pred is obtained in
442 // frcap E, pred, V, U
444 // we are in one of those special cases of 0,+-inf or NaN
446 // If one of U and V is NaN, return U+V (which will generate
447 // invalid in case one is a signaling NaN). Otherwise,
448 // return the Result as described in the table
453 // X \ | +0 | -0 | +inf | -inf | finite non-zero
455 // ______________________________________________________
457 // +-0 | Invalid/ | pi/2 | -pi/2 | sign(Y)*pi/2
459 // --------------------------------------------------------
461 // +inf | +0 | -0 | pi/4 | -pi/4 | sign(Y)*0
462 // --------------------------------------------------------
464 // -inf | +pi | -pi | 3pi/4 | -3pi/4 | sign(Y)*pi
465 // --------------------------------------------------------
466 // finite | X>0? | pi/2 | -pi/2 |
467 // non-zero| sign(Y)*0: | | | N/A
468 // | sign(Y)*pi | | |
472 #include "libm_support.h"
551 GR_Parameter_RESULT = r51
552 GR_Parameter_TAG = r52
563 ASM_TYPE_DIRECTIVE(Constants_atan,@object)
564 data4 0x54442D18, 0x3FF921FB, 0x248D3132, 0x3E000000
565 // double pi/2, single lo_pi/2, two**(-3)
566 data4 0xAAAAAAA3, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // P_1
567 data4 0xCCCC54B2, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // P_2
568 data4 0x47E4D0C2, 0x92492492, 0x0000BFFC, 0x00000000 // P_3
569 data4 0x58870889, 0xE38E38E0, 0x00003FFB, 0x00000000 // P_4
570 data4 0x290149F8, 0xBA2E895B, 0x0000BFFB, 0x00000000 // P_5
571 data4 0x250F733D, 0x9D88E6D4, 0x00003FFB, 0x00000000 // P_6
572 data4 0xFB8745A0, 0x884E51FF, 0x0000BFFB, 0x00000000 // P_7
573 data4 0x394396BD, 0xE1C7412B, 0x00003FFA, 0x00000000 // P_8
574 data4 0xAAAAA52F, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // Q_1
575 data4 0xC75B60D3, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // Q_2
576 data4 0x011F1940, 0x924923AD, 0x0000BFFC, 0x00000000 // Q_3
577 data4 0x2A5F89BD, 0xE36F716D, 0x00003FFB, 0x00000000 // Q_4
579 // Entries Tbl_hi (double precision)
580 // B = 1+Index/16+1/32 Index = 0
581 // Entries Tbl_lo (single precision)
582 // B = 1+Index/16+1/32 Index = 0
584 data4 0xA935BD8E, 0x3FE9A000, 0x23ACA08F, 0x00000000
586 // Entries Tbl_hi (double precision) Index = 0,1,...,15
587 // B = 2^(-1)*(1+Index/16+1/32)
588 // Entries Tbl_lo (single precision)
589 // Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32)
591 data4 0x7F175A34, 0x3FDE77EB, 0x238729EE, 0x00000000
592 data4 0x73C1A40B, 0x3FE0039C, 0x249334DB, 0x00000000
593 data4 0x5B5B43DA, 0x3FE0C614, 0x22CBA7D1, 0x00000000
594 data4 0x88BE7C13, 0x3FE1835A, 0x246310E7, 0x00000000
595 data4 0xE2CC9E6A, 0x3FE23B71, 0x236210E5, 0x00000000
596 data4 0x8406CBCA, 0x3FE2EE62, 0x2462EAF5, 0x00000000
597 data4 0x1CD41719, 0x3FE39C39, 0x24B73EF3, 0x00000000
598 data4 0x5B795B55, 0x3FE44506, 0x24C11260, 0x00000000
599 data4 0x5BB6EC04, 0x3FE4E8DE, 0x242519EE, 0x00000000
600 data4 0x1F732FBA, 0x3FE587D8, 0x24D4346C, 0x00000000
601 data4 0x115D7B8D, 0x3FE6220D, 0x24ED487B, 0x00000000
602 data4 0x920B3D98, 0x3FE6B798, 0x2495FF1E, 0x00000000
603 data4 0x8FBA8E0F, 0x3FE74897, 0x223D9531, 0x00000000
604 data4 0x289FA093, 0x3FE7D528, 0x242B0411, 0x00000000
605 data4 0x576CC2C5, 0x3FE85D69, 0x2335B374, 0x00000000
606 data4 0xA99CC05D, 0x3FE8E17A, 0x24C27CFB, 0x00000000
608 // Entries Tbl_hi (double precision) Index = 0,1,...,15
609 // B = 2^(-2)*(1+Index/16+1/32)
610 // Entries Tbl_lo (single precision)
611 // Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32)
613 data4 0x510665B5, 0x3FD025FA, 0x24263482, 0x00000000
614 data4 0x362431C9, 0x3FD1151A, 0x242C8DC9, 0x00000000
615 data4 0x67E47C95, 0x3FD20255, 0x245CF9BA, 0x00000000
616 data4 0x7A823CFE, 0x3FD2ED98, 0x235C892C, 0x00000000
617 data4 0x29271134, 0x3FD3D6D1, 0x2389BE52, 0x00000000
618 data4 0x586890E6, 0x3FD4BDEE, 0x24436471, 0x00000000
619 data4 0x175E0F4E, 0x3FD5A2E0, 0x2389DBD4, 0x00000000
620 data4 0x9F5FA6FD, 0x3FD68597, 0x2476D43F, 0x00000000
621 data4 0x52817501, 0x3FD76607, 0x24711774, 0x00000000
622 data4 0xB8DF95D7, 0x3FD84422, 0x23EBB501, 0x00000000
623 data4 0x7CD0C662, 0x3FD91FDE, 0x23883A0C, 0x00000000
624 data4 0x66168001, 0x3FD9F930, 0x240DF63F, 0x00000000
625 data4 0x5422058B, 0x3FDAD00F, 0x23FE261A, 0x00000000
626 data4 0x378624A5, 0x3FDBA473, 0x23A8CD0E, 0x00000000
627 data4 0x0AAD71F8, 0x3FDC7655, 0x2422D1D0, 0x00000000
628 data4 0xC9EC862B, 0x3FDD45AE, 0x2344A109, 0x00000000
630 // Entries Tbl_hi (double precision) Index = 0,1,...,15
631 // B = 2^(-3)*(1+Index/16+1/32)
632 // Entries Tbl_lo (single precision)
633 // Index = 0,1,...,15 B = 2^(-3)*(1+Index/16+1/32)
635 data4 0x84212B3D, 0x3FC068D5, 0x239874B6, 0x00000000
636 data4 0x41060850, 0x3FC16465, 0x2335E774, 0x00000000
637 data4 0x171A535C, 0x3FC25F6E, 0x233E36BE, 0x00000000
638 data4 0xEDEB99A3, 0x3FC359E8, 0x239680A3, 0x00000000
639 data4 0xC6092A9E, 0x3FC453CE, 0x230FB29E, 0x00000000
640 data4 0xBA11570A, 0x3FC54D18, 0x230C1418, 0x00000000
641 data4 0xFFB3AA73, 0x3FC645BF, 0x23F0564A, 0x00000000
642 data4 0xE8A7D201, 0x3FC73DBD, 0x23D4A5E1, 0x00000000
643 data4 0xE398EBC7, 0x3FC8350B, 0x23D4ADDA, 0x00000000
644 data4 0x7D050271, 0x3FC92BA3, 0x23BCB085, 0x00000000
645 data4 0x601081A5, 0x3FCA217E, 0x23BC841D, 0x00000000
646 data4 0x574D780B, 0x3FCB1696, 0x23CF4A8E, 0x00000000
647 data4 0x4D768466, 0x3FCC0AE5, 0x23BECC90, 0x00000000
648 data4 0x4E1D5395, 0x3FCCFE65, 0x2323DCD2, 0x00000000
649 data4 0x864C9D9D, 0x3FCDF110, 0x23F53F3A, 0x00000000
650 data4 0x451D980C, 0x3FCEE2E1, 0x23CCB11F, 0x00000000
652 data4 0x54442D18, 0x400921FB, 0x33145C07, 0x3CA1A626 // PI two doubles
653 data4 0x54442D18, 0x3FF921FB, 0x33145C07, 0x3C91A626 // PI_by_2 two dbles
654 data4 0x54442D18, 0x3FE921FB, 0x33145C07, 0x3C81A626 // PI_by_4 two dbles
655 data4 0x7F3321D2, 0x4002D97C, 0x4C9E8A0A, 0x3C9A7939 // 3PI_by_4 two dbles
656 ASM_SIZE_DIRECTIVE(Constants_atan)
667 (p0) mov ArgX_orig = f1
668 (p0) br.cond.sptk atan2l ;;
671 ASM_SIZE_DIRECTIVE(atanl)
679 .proc __ieee754_atan2l#
680 .global __ieee754_atan2l#
691 alloc r32 = ar.pfs, 0, 17 , 4, 0
692 (p0) mov ArgY = ArgY_orig
696 (p0) mov ArgX = ArgX_orig
701 (p0) fclass.m.unc p7,p0 = ArgY_orig, 0x103
708 // Save original input args and load table ptr.
710 (p0) fclass.m.unc p6,p0 = ArgX_orig, 0x103
714 (p0) addl table_ptr1 = @ltoff(Constants_atan#), gp
715 (p0) fclass.m.unc p0,p9 = ArgY_orig, 0x1FF
719 ld8 table_ptr1 = [table_ptr1]
720 (p0) fclass.m.unc p0,p8 = ArgX_orig, 0x1FF
725 (p0) fclass.m.unc p13,p0 = ArgY_orig, 0x0C3
729 (p0) fclass.m.unc p12,p0 = ArgX_orig, 0x0C3
735 // Check for NatVals.
736 // Check for everything - if false, then must be pseudo-zero
737 // or pseudo-nan (IA unsupporteds).
742 (p6) br.cond.spnt L(ATANL_NATVAL) ;;
748 (p7) br.cond.spnt L(ATANL_NATVAL) ;;
751 (p0) ldfd P_hi = [table_ptr1],8
753 (p8) br.cond.spnt L(ATANL_UNSUPPORTED) ;;
756 (p0) add table_ptr2 = 96, table_ptr1
757 (p9) br.cond.spnt L(ATANL_UNSUPPORTED)
759 // Load double precision high-order part of pi
761 (p12) br.cond.spnt L(ATANL_NAN) ;;
765 (p0) fnorm.s1 ArgX = ArgX
766 (p13) br.cond.spnt L(ATANL_NAN) ;;
769 // Normalize the input argument.
770 // Branch out if NaN inputs
773 (p0) ldfs P_lo = [table_ptr1], 4
775 (p0) fnorm.s1 ArgY = ArgY ;;
779 (p0) ldfs TWO_TO_NEG3 = [table_ptr1], 180
781 // U = max(ArgX_abs,ArgY_abs)
782 // V = min(ArgX_abs,ArgY_abs)
791 // Get exp and sign of ArgX
792 // Get exp and sign of ArgY
793 // Load 2**(-3) and increment ptr to Q_4.
795 (p0) fmerge.s ArgX_abs = f1, ArgX
799 // load single precision low-order part of pi = P_lo
802 (p0) getf.exp sign_X = ArgX
803 (p0) fmerge.s ArgY_abs = f1, ArgY
807 (p0) getf.exp sign_Y = ArgY
809 (p0) shr sign_X = sign_X, 17 ;;
813 (p0) shr sign_Y = sign_Y, 17 ;;
814 (p0) cmp.eq.unc p8, p9 = 0x00000, sign_Y ;;
819 // Is ArgX_abs >= ArgY_abs
822 (p0) fmax.s1 U = ArgX_abs, ArgY_abs
830 // sign_X is sign bit of ArgX
831 // sign_Y is sign bit of ArgY
833 (p0) fcmp.ge.s1 p6, p7 = ArgX_abs, ArgY_abs
838 (p0) fmin.s1 V = ArgX_abs, ArgY_abs
843 (p8) fadd.s1 s_Y = f0, f1
844 (p6) cmp.eq.unc p10, p11 = 0x00000, sign_X
847 (p6) add swap = r0, r0
849 (p7) add swap = 1, r0
858 (p10) fsub.s1 M = M, f1
863 (p9) fsub.s1 s_Y = f0, f1
868 (p0) frcpa.s1 E, p6 = V, U
876 (p6) br.cond.sptk L(ATANL_STEP2)
877 (p0) br.cond.spnt L(ATANL_SPECIAL_HANDLING) ;;
882 (p0) fmpy.s1 Q = E, V
887 (p0) fcmp.eq.s0 p0, p9 = f1, ArgY_orig
895 (p0) fcmp.eq.s0 p0, p8 = f1, ArgX_orig
900 (p11) fadd.s1 M = M, f1
905 // *************************************************
906 // ********************* STEP2 *********************
907 // *************************************************
908 (p0) movl special = 0x8400000000000000
913 // lookup = b_1 b_2 b_3 B_4
915 (p0) movl special1 = 0x0000000000000100 ;;
920 // Do fnorms to raise any denormal operand
923 (p0) fmpy.s1 P_hi = M, P_hi
928 (p0) fmpy.s1 P_lo = M, P_lo
936 (p0) fcmp.lt.unc.s1 p6, p7 = Q, TWO_TO_NEG3
940 (p0) getf.sig significand_Q = Q
941 (p0) getf.exp exponent_Q = Q
946 (p0) andcm k = 0x0003, exponent_Q
947 (p0) extr.u lookup = significand_Q, 59, 4 ;;
951 (p0) dep special = lookup, special, 59, 4
953 // Generate 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
955 (p6) br.cond.spnt L(ATANL_POLY) ;;
958 (p0) cmp.eq.unc p8, p9 = 0x0000, k
959 (p0) fmpy.s1 P_hi = s_Y, P_hi
961 // We waited a few extra cycles so P_lo and P_hi could be calculated.
962 // Load the constant 256 for loading up table entries.
964 // *************************************************
965 // ******************** STEP3 **********************
966 // *************************************************
967 (p0) add table_ptr2 = 16, table_ptr1
970 // Let z_hi have exponent and sign of original Q
971 // Load the Tbl_hi(0) else, increment pointer.
974 (p0) ldfe Q_4 = [table_ptr1], -16
975 (p0) xor swap = sign_X, swap ;;
976 (p9) sub k = k, r0, 1
979 (p0) setf.sig z_hi = special
980 (p0) ldfe Q_3 = [table_ptr1], -16
981 (p9) add table_ptr2 = 16, table_ptr2 ;;
984 // U_hold = U - U_prime_hi
985 // k = k * 256 - Result can be 0, 256, or 512.
988 (p0) ldfe Q_2 = [table_ptr1], -16
989 (p8) ldfd Tbl_hi = [table_ptr2], 8
993 // U_prime_lo = U_hold + V * z_hi
994 // lookup -> lookup * 16 + k
997 (p0) ldfe Q_1 = [table_ptr1], -16 ;;
998 (p8) ldfs Tbl_lo = [table_ptr2], 8
1000 // U_prime_hi = U + V * z_hi
1001 // Load the Tbl_lo(0)
1003 (p9) pmpy2.r k = k, special1 ;;
1023 (p9) shladd lookup = lookup, 0x0004, k ;;
1026 (p9) add table_ptr2 = table_ptr2, lookup ;;
1028 // V_prime = V - U * z_hi
1030 (p9) ldfd Tbl_hi = [table_ptr2], 8
1036 // C_hi = frcpa(1,U_prime_hi)
1038 (p9) ldfs Tbl_lo = [table_ptr2], 8
1040 // z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
1041 // Point to beginning of Tbl_hi entries - k = 0.
1043 (p0) fmerge.se z_hi = Q, z_hi ;;
1047 (p0) fma.s1 U_prime_hi = V, z_hi, U
1052 (p0) fnma.s1 V_prime = U, z_hi, V
1057 (p0) mov A_hi = Tbl_hi
1062 (p0) fsub.s1 U_hold = U, U_prime_hi
1067 (p0) frcpa.s1 C_hi, p6 = f1, U_prime_hi
1071 (p0) cmp.eq.unc p7, p6 = 0x00000, swap
1072 (p0) fmpy.s1 A_hi = s_Y, A_hi
1078 // poly = wsq * poly
1080 (p7) fadd.s1 sigma = f0, f1
1085 (p0) fma.s1 U_prime_lo = z_hi, V, U_hold
1090 (p6) fsub.s1 sigma = f0, f1
1095 (p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1101 // A_lo = A_lo + w_hi
1102 // A_hi = s_Y * A_hi
1104 (p0) fma.s1 Res_hi = sigma, A_hi, P_hi
1110 // C_hi_hold = 1 - C_hi * U_prime_hi (1)
1112 (p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi
1118 // C_hi = C_hi + C_hi * C_hi_hold (1)
1120 (p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1126 // C_hi_hold = 1 - C_hi * U_prime_hi (2)
1128 (p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi
1134 // C_hi = C_hi + C_hi * C_hi_hold (2)
1136 (p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
1142 // C_hi_hold = 1 - C_hi * U_prime_hi (3)
1144 (p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi
1150 // C_hi = C_hi + C_hi * C_hi_hold (3)
1152 (p0) fmpy.s1 w_hi = V_prime, C_hi
1158 // w_hi = V_prime * C_hi
1160 (p0) fmpy.s1 wsq = w_hi, w_hi
1165 (p0) fnma.s1 w_lo = w_hi, U_prime_hi, V_prime
1171 // wsq = w_hi * w_hi
1172 // w_lo = = V_prime - w_hi * U_prime_hi
1174 (p0) fma.s1 poly = wsq, Q_4, Q_3
1179 (p0) fnma.s1 w_lo = w_hi, U_prime_lo, w_lo
1185 // poly = Q_3 + wsq * Q_4
1186 // w_lo = = w_lo - w_hi * U_prime_lo
1188 (p0) fma.s1 poly = wsq, poly, Q_2
1193 (p0) fmpy.s1 w_lo = C_hi, w_lo
1199 // poly = Q_2 + wsq * poly
1200 // w_lo = = w_lo * C_hi
1202 (p0) fma.s1 poly = wsq, poly, Q_1
1207 (p0) fadd.s1 A_lo = Tbl_lo, w_lo
1213 // Result = Res_hi + Res_lo * s_Y (User Supplied Rounding Mode)
1215 (p0) fmpy.s0 Q_1 = Q_1, Q_1
1221 // poly = Q_1 + wsq * poly
1222 // A_lo = Tbl_lo + w_lo
1223 // swap = xor(swap,sign_X)
1225 (p0) fmpy.s1 poly = wsq, poly
1232 // poly = wsq * poly
1235 (p0) fmpy.s1 poly = w_hi, poly
1241 // if (PR_1) sigma = -1.0
1242 // if (PR_2) sigma = 1.0
1244 (p0) fadd.s1 A_lo = A_lo, poly
1250 // P_hi = s_Y * P_hi
1251 // A_lo = A_lo + poly
1253 (p0) fadd.s1 A_lo = A_lo, w_hi
1258 (p0) fma.s1 Res_lo = sigma, A_lo, P_lo
1264 // Res_hi = P_hi + sigma * A_hi
1265 // Res_lo = P_lo + sigma * A_lo
1267 (p0) fma.s0 Result = Res_lo, s_Y, Res_hi
1274 // poly1 = P_5 + zsq * poly1
1275 // poly2 = zsq * poly2
1279 (p0) xor swap = sign_X, swap
1281 (p0) fnma.s1 E_hold = E, U, f1 ;;
1287 // poly1 = P_4 + zsq * poly1
1288 // swap = xor(swap,sign_X)
1292 // poly1 = poly1 <== Done with poly1
1293 // poly1 = P_4 + zsq * poly1
1294 // swap = xor(swap,sign_X)
1296 (p0) cmp.eq.unc p7, p6 = 0x00000, swap
1300 (p0) fmpy.s1 P_hi = s_Y, P_hi
1305 (p6) fsub.s1 sigma = f0, f1
1310 (p7) fadd.s1 sigma = f0, f1
1314 // ***********************************************
1315 // ******************** STEP4 ********************
1316 // ***********************************************
1320 (p0) addl table_ptr1 = @ltoff(Constants_atan#), gp
1326 ld8 table_ptr1 = [table_ptr1]
1335 (p0) fma.s1 E = E, E_hold, E
1338 // Iterate 3 times E = E + E*(1.0 - E*U)
1339 // Also load P_8, P_7, P_6, P_5, P_4
1340 // E_hold = 1.0 - E * U (1)
1343 (p0) add table_ptr1 = 128, table_ptr1 ;;
1348 // E = E + E_hold*E (1)
1351 (p0) ldfe P_8 = [table_ptr1], -16
1353 // poly = z8*poly1 + poly2 (Typo in writeup)
1356 (p0) fnma.s1 z_lo = A_temp, U, V ;;
1361 // E_hold = 1.0 - E * U (2)
1363 (p0) ldfe P_7 = [table_ptr1], -16
1369 // E = E + E_hold*E (2)
1371 (p0) ldfe P_6 = [table_ptr1], -16
1377 // E_hold = 1.0 - E * U (3)
1379 (p0) ldfe P_5 = [table_ptr1], -16
1385 // E = E + E_hold*E (3)
1388 // At this point E approximates 1/U to roughly working precision
1389 // z = V*E approximates V/U
1391 (p0) ldfe P_4 = [table_ptr1], -16
1392 (p0) fnma.s1 E_hold = E, U, f1 ;;
1399 (p0) ldfe P_3 = [table_ptr1], -16
1407 (p0) ldfe P_2 = [table_ptr1], -16
1415 (p0) ldfe P_1 = [table_ptr1], -16
1420 (p0) movl int_temp = 0x24005
1424 (p0) fma.s1 E = E, E_hold, E
1429 (p0) fnma.s1 E_hold = E, U, f1
1434 (p0) fma.s1 E = E, E_hold, E
1439 (p0) fmpy.s1 Z = V, E
1445 // z_lo = V - A_temp * U
1446 // if (PR_2) sigma = 1.0
1448 (p0) fmpy.s1 z_lo = z_lo, E
1453 (p0) fmpy.s1 zsq = Z, Z
1460 // if (PR_1) sigma = -1.0
1462 (p0) fadd.s1 A_hi = A_temp, z_lo
1471 // Now what we want to do is
1472 // poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
1473 // poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3))
1475 (p0) fma.s1 poly1 = zsq, P_8, P_7
1480 (p0) fma.s1 poly2 = zsq, P_3, P_2
1485 (p0) fmpy.s1 z8 = zsq, zsq
1490 (p0) fsub.s1 A_temp = A_temp, A_hi
1496 // A_lo = Z * poly + z_lo
1498 (p0) fmerge.s tmp = A_hi, A_hi
1504 // poly1 = P_7 + zsq * P_8
1505 // poly2 = P_2 + zsq * P_3
1507 (p0) fma.s1 poly1 = zsq, poly1, P_6
1512 (p0) fma.s1 poly2 = zsq, poly2, P_1
1517 (p0) fmpy.s1 z8 = z8, z8
1522 (p0) fadd.s1 z_lo = A_temp, z_lo
1528 // poly1 = P_6 + zsq * poly1
1529 // poly2 = P_2 + zsq * poly2
1531 (p0) fma.s1 poly1 = zsq, poly1, P_5
1536 (p0) fmpy.s1 poly2 = poly2, zsq
1542 // Result = Res_hi + Res_lo (User Supplied Rounding Mode)
1544 (p0) fmpy.s1 P_5 = P_5, P_5
1549 (p0) fma.s1 poly1 = zsq, poly1, P_4
1554 (p0) fma.s1 poly = z8, poly1, poly2
1560 // Fixup added to force inexact later -
1561 // A_hi = A_temp + z_lo
1562 // z_lo = (A_temp - A_hi) + z_lo
1564 (p0) fma.s1 A_lo = Z, poly, z_lo
1569 (p0) fadd.s1 A_hi = tmp, A_lo
1574 (p0) fsub.s1 tmp = tmp, A_hi
1579 (p0) fmpy.s1 A_hi = s_Y, A_hi
1584 (p0) fadd.s1 A_lo = tmp, A_lo
1588 (p0) setf.exp tmp = int_temp
1590 // P_hi = s_Y * P_hi
1591 // A_hi = s_Y * A_hi
1593 (p0) fma.s1 Res_hi = sigma, A_hi, P_hi
1598 (p0) fclass.m.unc p6,p0 = A_lo, 0x007
1609 // Res_hi = P_hi + sigma * A_hi
1611 (p0) fsub.s1 tmp = P_hi, Res_hi
1617 // tmp = P_hi - Res_hi
1619 (p0) fma.s1 tmp = A_hi, sigma, tmp
1624 (p0) fma.s1 sigma = A_lo, sigma, P_lo
1630 // tmp = sigma * A_hi + tmp
1631 // sigma = A_lo * sigma + P_lo
1633 (p0) fma.s1 Res_lo = s_Y, sigma, tmp
1639 // Res_lo = s_Y * sigma + tmp
1641 (p0) fadd.s0 Result = Res_lo, Res_hi
1645 L(ATANL_UNSUPPORTED):
1649 (p0) fmpy.s0 Result = ArgX,ArgY
1650 (p0) br.ret.sptk b0 ;;
1652 L(ATANL_SPECIAL_HANDLING):
1655 (p0) fcmp.eq.s0 p0, p6 = f1, ArgY_orig
1660 (p0) fcmp.eq.s0 p0, p5 = f1, ArgX_orig
1665 (p0) fclass.m.unc p6, p7 = ArgY, 0x007
1670 (p0) movl special = 992
1677 (p0) addl table_ptr1 = @ltoff(Constants_atan#), gp
1683 ld8 table_ptr1 = [table_ptr1]
1691 (p0) add table_ptr1 = table_ptr1, special
1693 (p7) br.cond.spnt L(ATANL_ArgY_Not_ZERO) ;;
1696 (p0) ldfd Result = [table_ptr1], 8
1698 (p6) fclass.m.unc p14, p0 = ArgX, 0x035 ;;
1702 (p0) ldfd Result_lo = [table_ptr1], -8
1703 (p6) fclass.m.unc p15, p0 = ArgX, 0x036 ;;
1707 (p14) fmerge.s Result = ArgY, f0
1712 (p6) fclass.m.unc p13, p0 = ArgX, 0x007
1717 (p14) fmerge.s Result_lo = ArgY, f0
1721 (p13) mov GR_Parameter_TAG = 36
1728 // Return sign_Y * 0 when ArgX > +0
1730 (p15) fmerge.s Result = ArgY, Result
1735 (p15) fmerge.s Result_lo = ArgY, Result_lo
1741 // Return sign_Y * 0 when ArgX < -0
1743 (p0) fadd.s0 Result = Result, Result_lo
1744 (p13) br.cond.spnt __libm_error_region ;;
1750 // Call error support funciton for atan(0,0)
1752 (p0) br.ret.sptk b0 ;;
1754 L(ATANL_ArgY_Not_ZERO):
1757 (p0) fclass.m.unc p9, p10 = ArgY, 0x023
1763 (p10) br.cond.spnt L(ATANL_ArgY_Not_INF) ;;
1767 (p9) fclass.m.unc p6, p0 = ArgX, 0x017
1772 (p9) fclass.m.unc p7, p0 = ArgX, 0x021
1777 (p9) fclass.m.unc p8, p0 = ArgX, 0x022
1781 (p6) add table_ptr1 = 16, table_ptr1 ;;
1782 (p0) ldfd Result = [table_ptr1], 8
1786 (p0) ldfd Result_lo = [table_ptr1], -8
1792 (p6) fmerge.s Result = ArgY, Result
1797 (p6) fmerge.s Result_lo = ArgY, Result_lo
1802 (p6) fadd.s0 Result = Result, Result_lo
1803 (p6) br.ret.sptk b0 ;;
1806 // Load PI/2 and adjust its sign.
1807 // Return +PI/2 when ArgY = +Inf and ArgX = +/-0 or normal
1808 // Return -PI/2 when ArgY = -Inf and ArgX = +/-0 or normal
1811 (p7) add table_ptr1 = 32, table_ptr1 ;;
1812 (p7) ldfd Result = [table_ptr1], 8
1816 (p7) ldfd Result_lo = [table_ptr1], -8
1822 (p7) fmerge.s Result = ArgY, Result
1827 (p7) fmerge.s Result_lo = ArgY, Result_lo
1832 (p7) fadd.s0 Result = Result, Result_lo
1833 (p7) br.ret.sptk b0 ;;
1836 // Load PI/4 and adjust its sign.
1837 // Return +PI/4 when ArgY = +Inf and ArgX = +Inf
1838 // Return -PI/4 when ArgY = -Inf and ArgX = +Inf
1841 (p8) add table_ptr1 = 48, table_ptr1 ;;
1842 (p8) ldfd Result = [table_ptr1], 8
1846 (p8) ldfd Result_lo = [table_ptr1], -8
1852 (p8) fmerge.s Result = ArgY, Result
1857 (p8) fmerge.s Result_lo = ArgY, Result_lo
1862 (p8) fadd.s0 Result = Result, Result_lo
1863 (p8) br.ret.sptk b0 ;;
1865 L(ATANL_ArgY_Not_INF):
1869 // Load PI/4 and adjust its sign.
1870 // Return +3PI/4 when ArgY = +Inf and ArgX = -Inf
1871 // Return -3PI/4 when ArgY = -Inf and ArgX = -Inf
1873 (p0) fclass.m.unc p6, p0 = ArgX, 0x007
1878 (p0) fclass.m.unc p7, p0 = ArgX, 0x021
1883 (p0) fclass.m.unc p8, p0 = ArgX, 0x022
1887 (p6) add table_ptr1 = 16, table_ptr1 ;;
1888 (p6) ldfd Result = [table_ptr1], 8
1892 (p6) ldfd Result_lo = [table_ptr1], -8
1898 (p6) fmerge.s Result = ArgY, Result
1903 (p6) fmerge.s Result_lo = ArgY, Result_lo
1908 (p6) fadd.s0 Result = Result, Result_lo
1909 (p6) br.ret.spnt b0 ;;
1914 // return = sign_Y * PI/2 when ArgX = 0
1916 (p7) fmerge.s Result = ArgY, f0
1921 (p7) fnorm.s0 Result = Result
1922 (p7) br.ret.spnt b0 ;;
1925 // return = sign_Y * 0 when ArgX = Inf
1928 (p8) ldfd Result = [table_ptr1], 8 ;;
1929 (p8) ldfd Result_lo = [table_ptr1], -8
1934 (p8) fmerge.s Result = ArgY, Result
1939 (p8) fmerge.s Result_lo = ArgY, Result_lo
1944 (p8) fadd.s0 Result = Result, Result_lo
1945 (p8) br.ret.sptk b0 ;;
1948 // return = sign_Y * PI when ArgX = -Inf
1951 ASM_SIZE_DIRECTIVE(atan2l)
1952 ASM_SIZE_DIRECTIVE(__atan2l)
1953 ASM_SIZE_DIRECTIVE(__ieee754_atan2l)
1955 .proc __libm_error_region
1956 __libm_error_region:
1959 add GR_Parameter_Y=-32,sp // Parameter 2 value
1961 .save ar.pfs,GR_SAVE_PFS
1962 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
1966 add sp=-64,sp // Create new stack
1968 mov GR_SAVE_GP=gp // Save gp
1971 stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
1972 add GR_Parameter_X = 16,sp // Parameter 1 address
1973 .save b0, GR_SAVE_B0
1974 mov GR_SAVE_B0=b0 // Save b0
1978 stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
1979 add GR_Parameter_RESULT = 0,GR_Parameter_Y
1980 nop.b 0 // Parameter 3 address
1983 stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
1984 add GR_Parameter_Y = -16,GR_Parameter_Y
1985 br.call.sptk b0=__libm_error_support# // Call error handling function
1990 add GR_Parameter_RESULT = 48,sp
1993 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
1995 add sp = 64,sp // Restore stack pointer
1996 mov b0 = GR_SAVE_B0 // Restore return address
1999 mov gp = GR_SAVE_GP // Restore gp
2000 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
2001 br.ret.sptk b0 // Return
2004 .endp __libm_error_region
2005 ASM_SIZE_DIRECTIVE(__libm_error_region)
2006 .type __libm_error_support#,@function
2007 .global __libm_error_support#