1 .file "libm_atan2_reg.s"
3 // Copyright (C) 2000, 2001, Intel Corporation
4 // All rights reserved.
6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 // Intel Corporation is the author of this code, and requests that all
37 // problem reports or change requests be submitted to it directly at
38 // http://developer.intel.com/opensource.
41 //==============================================================
42 // 2/02/00: Initial version
43 // 4/04/00 Unwind support added
45 #include "libm_support.h"
// Constant pool read by __libm_atan2_reg.
// Byte offsets (counted from the first data4 word below) as the code uses them:
//     0  pi/2 as a double, low part of pi/2 as a single, 2**(-3) as a single
//    16  P_1 .. P_8, one 16-byte entry each (read with ldfe)
//   144  Q_1 .. Q_4, one 16-byte entry each (read with ldfe)
//   208  single table entry for B = 1+1/32
//   224  three groups of 16 table entries, B scaled by 2^(-1), 2^(-2), 2^(-3)
//   992  I, I/2, I/4, 3I/4, each stored as a pair of doubles (hi, lo)
// A table entry is 16 bytes: Tbl_hi (double), Tbl_lo (single), 4 bytes of zero.
// NOTE(review): the entries look like atan(B) split into hi/lo parts - confirm.
50 ASM_TYPE_DIRECTIVE(Constants_atan#,@object)
52 data4 0x54442D18, 0x3FF921FB, 0x248D3132, 0x3E000000
53 // (line above) double pi/2, single lo_pi/2, two**(-3)
54 data4 0xAAAAAAA3, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // P_1
55 data4 0xCCCC54B2, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // P_2
56 data4 0x47E4D0C2, 0x92492492, 0x0000BFFC, 0x00000000 // P_3
57 data4 0x58870889, 0xE38E38E0, 0x00003FFB, 0x00000000 // P_4
58 data4 0x290149F8, 0xBA2E895B, 0x0000BFFB, 0x00000000 // P_5
59 data4 0x250F733D, 0x9D88E6D4, 0x00003FFB, 0x00000000 // P_6
60 data4 0xFB8745A0, 0x884E51FF, 0x0000BFFB, 0x00000000 // P_7
61 data4 0x394396BD, 0xE1C7412B, 0x00003FFA, 0x00000000 // P_8
62 data4 0xAAAAA52F, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // Q_1
63 data4 0xC75B60D3, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // Q_2
64 data4 0x011F1940, 0x924923AD, 0x0000BFFC, 0x00000000 // Q_3
65 data4 0x2A5F89BD, 0xE36F716D, 0x00003FFB, 0x00000000 // Q_4
66 // Entries Tbl_hi (double precision)
67 // B = 1+Index/16+1/32 Index = 0
68 // Entries Tbl_lo (single precision)
69 // B = 1+Index/16+1/32 Index = 0
70 data4 0xA935BD8E, 0x3FE9A000, 0x23ACA08F, 0x00000000
71 // Entries Tbl_hi (double precision) Index = 0,1,...,15
72 // B = 2^(-1)*(1+Index/16+1/32)
73 // Entries Tbl_lo (single precision)
74 // Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32)
75 data4 0x7F175A34, 0x3FDE77EB, 0x238729EE, 0x00000000
76 data4 0x73C1A40B, 0x3FE0039C, 0x249334DB, 0x00000000
77 data4 0x5B5B43DA, 0x3FE0C614, 0x22CBA7D1, 0x00000000
78 data4 0x88BE7C13, 0x3FE1835A, 0x246310E7, 0x00000000
79 data4 0xE2CC9E6A, 0x3FE23B71, 0x236210E5, 0x00000000
80 data4 0x8406CBCA, 0x3FE2EE62, 0x2462EAF5, 0x00000000
81 data4 0x1CD41719, 0x3FE39C39, 0x24B73EF3, 0x00000000
82 data4 0x5B795B55, 0x3FE44506, 0x24C11260, 0x00000000
83 data4 0x5BB6EC04, 0x3FE4E8DE, 0x242519EE, 0x00000000
84 data4 0x1F732FBA, 0x3FE587D8, 0x24D4346C, 0x00000000
85 data4 0x115D7B8D, 0x3FE6220D, 0x24ED487B, 0x00000000
86 data4 0x920B3D98, 0x3FE6B798, 0x2495FF1E, 0x00000000
87 data4 0x8FBA8E0F, 0x3FE74897, 0x223D9531, 0x00000000
88 data4 0x289FA093, 0x3FE7D528, 0x242B0411, 0x00000000
89 data4 0x576CC2C5, 0x3FE85D69, 0x2335B374, 0x00000000
90 data4 0xA99CC05D, 0x3FE8E17A, 0x24C27CFB, 0x00000000
92 // Entries Tbl_hi (double precision) Index = 0,1,...,15
93 // B = 2^(-2)*(1+Index/16+1/32)
94 // Entries Tbl_lo (single precision)
95 // Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32)
97 data4 0x510665B5, 0x3FD025FA, 0x24263482, 0x00000000
98 data4 0x362431C9, 0x3FD1151A, 0x242C8DC9, 0x00000000
99 data4 0x67E47C95, 0x3FD20255, 0x245CF9BA, 0x00000000
100 data4 0x7A823CFE, 0x3FD2ED98, 0x235C892C, 0x00000000
101 data4 0x29271134, 0x3FD3D6D1, 0x2389BE52, 0x00000000
102 data4 0x586890E6, 0x3FD4BDEE, 0x24436471, 0x00000000
103 data4 0x175E0F4E, 0x3FD5A2E0, 0x2389DBD4, 0x00000000
104 data4 0x9F5FA6FD, 0x3FD68597, 0x2476D43F, 0x00000000
105 data4 0x52817501, 0x3FD76607, 0x24711774, 0x00000000
106 data4 0xB8DF95D7, 0x3FD84422, 0x23EBB501, 0x00000000
107 data4 0x7CD0C662, 0x3FD91FDE, 0x23883A0C, 0x00000000
108 data4 0x66168001, 0x3FD9F930, 0x240DF63F, 0x00000000
109 data4 0x5422058B, 0x3FDAD00F, 0x23FE261A, 0x00000000
110 data4 0x378624A5, 0x3FDBA473, 0x23A8CD0E, 0x00000000
111 data4 0x0AAD71F8, 0x3FDC7655, 0x2422D1D0, 0x00000000
112 data4 0xC9EC862B, 0x3FDD45AE, 0x2344A109, 0x00000000
114 // Entries Tbl_hi (double precision) Index = 0,1,...,15
115 // B = 2^(-3)*(1+Index/16+1/32)
116 // Entries Tbl_lo (single precision)
117 // Index = 0,1,...,15 B = 2^(-3)*(1+Index/16+1/32)
119 data4 0x84212B3D, 0x3FC068D5, 0x239874B6, 0x00000000
120 data4 0x41060850, 0x3FC16465, 0x2335E774, 0x00000000
121 data4 0x171A535C, 0x3FC25F6E, 0x233E36BE, 0x00000000
122 data4 0xEDEB99A3, 0x3FC359E8, 0x239680A3, 0x00000000
123 data4 0xC6092A9E, 0x3FC453CE, 0x230FB29E, 0x00000000
124 data4 0xBA11570A, 0x3FC54D18, 0x230C1418, 0x00000000
125 data4 0xFFB3AA73, 0x3FC645BF, 0x23F0564A, 0x00000000
126 data4 0xE8A7D201, 0x3FC73DBD, 0x23D4A5E1, 0x00000000
127 data4 0xE398EBC7, 0x3FC8350B, 0x23D4ADDA, 0x00000000
128 data4 0x7D050271, 0x3FC92BA3, 0x23BCB085, 0x00000000
129 data4 0x601081A5, 0x3FCA217E, 0x23BC841D, 0x00000000
130 data4 0x574D780B, 0x3FCB1696, 0x23CF4A8E, 0x00000000
131 data4 0x4D768466, 0x3FCC0AE5, 0x23BECC90, 0x00000000
132 data4 0x4E1D5395, 0x3FCCFE65, 0x2323DCD2, 0x00000000
133 data4 0x864C9D9D, 0x3FCDF110, 0x23F53F3A, 0x00000000
134 data4 0x451D980C, 0x3FCEE2E1, 0x23CCB11F, 0x00000000
// Offset 992: special-value results; the special-handling code adds 0/16/32/48
// to this offset to pick one pair, then sums hi+lo.
135 data4 0x54442D18, 0x400921FB, 0x33145C07, 0x3CA1A626 // I two doubles
136 data4 0x54442D18, 0x3FF921FB, 0x33145C07, 0x3C91A626 // I_by_2 two dbls
137 data4 0x54442D18, 0x3FE921FB, 0x33145C07, 0x3C81A626 // I_by_4 two dbls
138 data4 0x7F3321D2, 0x4002D97C, 0x4C9E8A0A, 0x3C9A7939 // 3I_by_4 two dbls
139 ASM_SIZE_DIRECTIVE(Constants_atan#)
// __libm_atan2_reg - entry: classify the operands, normalise them, pick
// U = max(|X|,|Y|) and V = min(|X|,|Y|), and start the reciprocal of U.
// As used below, f32 is ArgY and f33 is ArgX (see the sign_Y / sign_X code).
// NOTE(review): how f32/f33 are filled from the caller's registers is not
// visible in this listing - confirm against the calling convention.
// Values produced here and consumed by later sections:
//   f34 = s_Y (+1.0 or -1.0)      r36 = sign bit of ArgX
//   f35 = U, f36 = V              r38 = swap flag (0: |X| >= |Y|, 1: otherwise)
//   f37 = frcpa estimate          f50/f51 = pi/2 hi/lo, f78 = 2**(-3)
142 .proc __libm_atan2_reg#
143 .global __libm_atan2_reg#
// Register frame: 0 inputs, 20 locals, 4 outputs, 0 rotating.
149 alloc r32 = ar.pfs,0,20,4,0
155 (p0) addl r39 = @ltoff(Constants_atan#), gp
168 nop 999 // EMbo added ...
172 nop 999 // EMbo added ...
// p9 / p8: ArgY / ArgX is a member of none of the classes in mask 0x1FF.
173 (p0) fclass.nm.unc p9,p0 = f32 ,0x1FF
174 nop 999;; // EMbo added ...
176 nop 999 // EMbo added ...
177 (p0) fclass.nm.unc p8,p0 = f33 ,0x1FF
178 nop 999 // EMbo added ...
180 nop 999 // EMbo added ...
// p6 / p7: ArgX / ArgY is a NatVal (mask 0x103).
181 (p0) fclass.m.unc p6,p0 = f33 ,0x103
182 nop 999;; // EMbo added ...
184 nop 999 // EMbo added ...
185 (p0) fclass.m.unc p7,p0 = f32 ,0x103
186 nop 999 // EMbo added ...
188 nop 999 // EMbo added ...
// p12 / p13: ArgX / ArgY is a NaN of either sign (mask 0x0C3).
189 (p0) fclass.m.unc p12,p0 = f33 ,0x0C3
190 nop 999;; // EMbo added ...
192 nop 999 // EMbo added ...
194 // Check for NatVals.
195 // Check for EM Unsupporteds
198 (p0) fclass.m.unc p13,p0 = f32 ,0x0C3
199 (p6) br.cond.sptk L(ATAN_NATVAL);;
201 nop 999 // EMbo added ...
202 (p7) br.cond.sptk L(ATAN_NATVAL)
203 (p8) br.cond.sptk L(ATAN_UNSUPPORTED);;
205 (p0) add r40 = 96, r39
206 nop 999 // EMbo added ...
207 (p9) br.cond.sptk L(ATAN_UNSUPPORTED);;
// f50 = pi/2 (double); r39 advances by 8.
209 (p0) ldfd f50 = [r39],8
210 nop 999 // EMbo added ...
211 (p12) br.cond.sptk L(ATAN_NAN);;
213 nop 999 // EMbo added ...
214 (p0) fnorm.s1 f33 = f33
215 (p13) br.cond.sptk L(ATAN_NAN);;
// f51 = low part of pi/2 (single); r39 advances by 4.
217 (p0) ldfs f51 = [r39],4
219 // Remove sign bits from exponents
221 // Normalize the input argument.
223 (p0) fnorm.s1 f32 = f32
224 nop 999 // EMbo added ...
226 nop 999 // EMbo added ...
228 nop 999;; // EMbo added ...
230 nop 999;; // EMbo added ...
// f78 = 2**(-3); the post-increment of 180 leaves r39 192 bytes past its
// value before the first load, which is the Q_4 entry of the pool.
231 (p0) ldfs f78 = [r39],180
232 nop 999;; // EMbo added ...
234 (p0) getf.exp r36 = f33;;
236 // Get exp and sign of ArgX
237 // Get exp and sign of ArgY
238 // Load 2**(-3) and increment ptr to Q_4.
240 (p0) getf.exp r37 = f32
// Shifting the getf.exp result right by 17 leaves only the sign bit.
241 (p0) shr.u r36 = r36,17;;
243 nop 999 // EMbo added ...
// f84 = |ArgY| (sign taken from f1, everything else from f32).
244 (p0) fmerge.s f84 = f1,f32
245 (p0) shr.u r37 = r37,17;;
247 nop 999 // EMbo added ...
251 // sign_X is sign bit of ArgX
252 // sign_Y is sign bit of ArgY
// f83 = |ArgX|.
254 (p0) fmerge.s f83 = f1,f33
// p8: ArgY sign bit clear, p9: set.  f34 becomes +1.0 or -1.0 accordingly.
255 (p0) cmp.eq.unc p8,p9 = 0x00000, r37;;
257 nop 999 // EMbo added ...
258 (p8) fadd.s1 f34 = f0, f1
259 nop 999;; // EMbo added ...
261 nop 999 // EMbo added ...
262 (p9) fsub.s1 f34 = f0, f1
263 nop 999;; // EMbo added ...
265 nop 999 // EMbo added ...
// f36 = V = min(|X|,|Y|), f35 = U = max(|X|,|Y|).
266 (p0) fmin.s1 f36 = f83, f84
267 nop 999 // EMbo added ...
269 nop 999 // EMbo added ...
270 (p0) fmax.s1 f35 = f83, f84
271 nop 999;; // EMbo added ...
273 nop 999 // EMbo added ...
275 // Is ArgX_abs >= ArgY_abs
278 (p0) fcmp.ge.s1 p6,p7 = f83,f84
279 nop 999;; // EMbo added ...
// When |X| >= |Y| (p6): p10 = X sign bit clear, p11 = X sign bit set, r38 = 0.
281 (p6) cmp.eq.unc p10, p11 = 0x00000, r36
282 (p6) add r38 = r0, r0;;
284 // U = max(ArgX_abs,ArgY_abs)
285 // V = min(ArgX_abs,ArgY_abs)
// When |X| < |Y| (p7): r38 = 1.
294 (p7) add r38 = 1,r0;;
296 nop 999 // EMbo added ...
// f37 = reciprocal approximation for V/U; p6 is set when the approximation
// is usable and cleared when frcpa produced the quotient itself.
297 (p0) frcpa.s1 f37, p6 = f36, f35
298 nop 999;; // EMbo added ...
300 nop 999 // EMbo added ...
// NOTE(review): f82 is adjusted here (and multiplied into pi/2 later), but no
// instruction that initialises it is visible in this listing - confirm.
304 (p10) fsub.s1 f82 = f82, f1
305 (p6) br.cond.sptk L(ATAN_STEP2);;
307 nop 999 // EMbo added ...
308 nop 999 // EMbo added ...
309 // /**************************************************/
310 // /********************* STEP2 **********************/
311 // /**************************************************/
// Reached only when frcpa cleared p6: handle zero/infinity combinations.
312 (p0) br.cond.spnt L(ATAN_SPECIAL_HANDLING);;
// STEP2: form the quotient estimate f38 = f37 * f36 (E * V), scale pi/2 by
// f82, extract the table index bits, and choose between the polynomial path
// (quotient < 2**(-3)) and the table path.
// NOTE(review): the label that starts this section is not visible in this
// listing; it is presumably the ATAN_STEP2 branch target - confirm.
316 nop 999 // EMbo added ...
// r47: significand template with bit 63 and bit 58 set; bits 59..62 are
// filled in below with the quotient's leading fraction bits.
317 (p0) movl r47 = 0x8400000000000000
319 nop 999 // EMbo added ...
// r48 = 256, used later as the multiplier for the table-group offset.
320 (p0) movl r48 = 0x0000000000000100;;
322 nop 999 // EMbo added ...
323 (p0) fmpy.s1 f38 = f37, f36
324 nop 999 // EMbo added ...
326 nop 999 // EMbo added ...
// The next two compares use status field s0 on f9 and f8; their predicate
// results (p9, p8) are overwritten before any visible use.
327 (p0) fcmp.lt.unc.s0 p0,p9 = f9,f1
328 nop 999;; // EMbo added ...
330 nop 999 // EMbo added ...
331 (p0) fcmp.lt.unc.s0 p0,p8 = f8,f1
332 nop 999 // EMbo added ...
334 nop 999 // EMbo added ...
338 (p11) fadd.s1 f82 = f82, f1
339 nop 999;; // EMbo added ...
341 (p0) getf.sig r46 = f38
// p6: quotient estimate below 2**(-3) (f78) -> polynomial path.
342 (p0) fcmp.lt.unc p6,p7 = f38,f78
343 nop 999;; // EMbo added ...
345 nop 999 // EMbo added ...
346 (p0) fmpy.s1 f38 = f37, f36
// r42 = significand bits 59..62 of the quotient (b_1 b_2 b_3 b_4).
347 (p0) extr.u r42 = r46, 59, 4;;
349 nop 999 // EMbo added ...
// f50, f51 = f82 * (pi/2 hi, pi/2 lo).
350 (p0) fmpy.s1 f50 = f82, f50
351 (p0) dep r47 = r42, r47, 59, 4
353 nop 999 // EMbo added ...
354 (p0) fmpy.s1 f51 = f82, f51
355 nop 999;; // EMbo added ...
357 nop 999;; // EMbo added ...
362 // Do fcmp to raise any denormal operand
365 (p0) getf.exp r45 = f38
366 nop 999;; // EMbo added ...
369 // lookup = b_1 b_2 b_3 B_4
372 // Generate 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
// r41 = 3 AND NOT r45: the inverted low two bits of the quotient's exponent.
374 (p0) andcm r41 = 0x0003, r45
375 nop 999 // EMbo added ...
377 // We waited a few extra cycles so P_lo and P_hi could be calculated.
378 // Load the constant 256 for loading up table entries.
380 // /**************************************************/
381 // /********************* STEP3 **********************/
382 // /**************************************************/
383 (p6) br.cond.spnt L(ATAN_POLY);;
// STEP3 (table path): build z_hi from the quotient's leading bits, fetch the
// matching Tbl_hi/Tbl_lo entry, reduce (U,V) by z_hi, divide by Newton
// iteration on frcpa, and evaluate the Q polynomial in w.
// On entry r39 is expected to point at Q_4 and r41/r42/r47/r48 to hold the
// values prepared in STEP2.  Leaves result_hi in f56 and result_lo in f55,
// then branches to RETURN_ATAN.
385 (p0) setf.sig f39 = r47
// p8: r41 == 0 (use the single entry that follows Q_4); p9: otherwise.
386 (p0) cmp.eq.unc p8, p9 = 0x0000, r41
388 // z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
389 // point to beginning of Tbl_hi entries - k = 0.
391 (p0) add r40 = 16, r39
// f73 = Q_4; r39 steps back one 16-byte entry per ldfe (Q_3, Q_2, Q_1 follow).
393 (p0) ldfe f73 = [r39],-16;;
// r41 = r41 - 0 - 1.
394 (p9) sub r41 = r41,r0,1
395 (p9) add r40 = 16,r40
397 (p8) ldfd f48 = [r40],8
// f50 = s_Y * f50.
398 (p0) fmpy.s1 f50 = f34, f50
// r38 = sign_X xor swap; selects sigma further down.
399 (p0) xor r38 = r36,r38;;
401 (p0) ldfe f71 = [r39],-16;;
402 (p8) ldfs f49 = [r40],8
// r41 = r41 * 256 (r48 holds 0x100).
403 (p9) pmpy2.r r41 = r41,r48;;
405 (p0) ldfe f69 = [r39],-16
407 // Let z_hi have exponent and sign of original Q
408 // Load the Tbl_hi(0) else, increment pointer.
410 (p0) fmerge.se f39 = f38,f39
// r42 = r42 * 16 + r41: byte offset of the selected 16-byte table entry.
411 (p9) shladd r42 = r42,0x0004,r41;;
413 (p9) add r40 = r40, r42;;
414 (p9) ldfd f48 = [r40],8
415 nop 999;; // EMbo added ...
417 (p0) ldfe f67 = [r39],-16;;
418 (p9) ldfs f49 = [r40],8
419 nop 999 // EMbo added ...
421 nop 999 // EMbo added ...
423 // U_prime_hi = U + V * z_hi
424 // Load the Tbl_lo(0)
426 (p0) fma.s1 f40 = f36, f39, f35
427 nop 999;; // EMbo added ...
429 nop 999 // EMbo added ...
// f42 = V_prime = V - U * z_hi.
430 (p0) fnma.s1 f42 = f35, f39, f36
431 nop 999 // EMbo added ...
433 nop 999 // EMbo added ...
435 nop 999;; // EMbo added ...
437 nop 999 // EMbo added ...
// f43 = C_hi, initial approximation of 1 / U_prime_hi.
438 (p0) frcpa.s1 f43, p6 = f1, f40
439 nop 999;; // EMbo added ...
441 nop 999 // EMbo added ...
443 // U_prime_lo = U - U_prime_hi
444 // k = k * 256 - result can be 0, 256, or 512.
446 (p0) fsub.s1 f41 = f35, f40
// p7: r38 == 0 -> sigma = +1.0; p6: otherwise -> sigma = -1.0 (f54 below).
447 (p0) cmp.eq.unc p7, p6 = 0x00000, r38
449 nop 999 // EMbo added ...
// NOTE(review): f48 (Tbl_hi) is loaded above but never read in the visible
// code, and f52 is scaled here without a visible definition on this path -
// confirm against the complete source.
450 (p0) fmpy.s1 f52 = f34, f52
451 nop 999;; // EMbo added ...
453 nop 999 // EMbo added ...
454 (p7) fadd.s1 f54 = f0, f1
455 nop 999;; // EMbo added ...
457 nop 999 // EMbo added ...
458 (p6) fsub.s1 f54 = f0, f1
459 nop 999;; // EMbo added ...
461 nop 999 // EMbo added ...
462 (p0) fnma.s1 f80 = f43, f40, f1
463 nop 999;; // EMbo added ...
465 nop 999 // EMbo added ...
466 (p0) fadd.s1 f79 = f41, f40
467 nop 999 // EMbo added ...
469 nop 999 // EMbo added ...
470 (p0) fma.s1 f41 = f36, f39, f41
471 nop 999;; // EMbo added ...
473 nop 999 // EMbo added ...
// f56 = result_hi = sigma * f52 + f50.
474 (p0) fma.s1 f56 = f54, f52, f50
475 nop 999;; // EMbo added ...
477 nop 999 // EMbo added ...
478 (p0) fma.s1 f43 = f80, f43, f43
479 nop 999;; // EMbo added ...
481 nop 999 // EMbo added ...
483 // U_prime_lo = U - U_hold
484 // lookup -> lookup * 16 + k
487 // V_prime = V - U * z_hi
488 // U_prime_lo = V * z_hi + U_prime_lo
490 (p0) fsub.s1 f79 = f35, f79
491 nop 999;; // EMbo added ...
493 nop 999 // EMbo added ...
494 (p0) fnma.s1 f80 = f43, f40, f1
495 nop 999;; // EMbo added ...
497 nop 999 // EMbo added ...
499 // C_hi = frcpa(1,U_prime_hi)
500 // U_prime_lo = U_prime_lo + U_hold
503 // C_hi_hold = 1 - C_hi * U_prime_hi (1)
506 // C_hi = C_hi + C_hi * C_hi_hold (1)
509 // C_hi_hold = 1 - C_hi * U_prime_hi (2)
511 (p0) fadd.s1 f41 = f41, f79
512 nop 999;; // EMbo added ...
514 nop 999 // EMbo added ...
516 // C_hi = C_hi + C_hi * C_hi_hold (2)
518 (p0) fma.s1 f43 = f80, f43, f43
519 nop 999;; // EMbo added ...
521 nop 999 // EMbo added ...
523 // C_hi_hold = 1 - C_hi * U_prime_hi (3)
525 (p0) fnma.s1 f80 = f43, f40, f1
526 nop 999;; // EMbo added ...
528 nop 999 // EMbo added ...
530 // C_hi = C_hi + C_hi * C_hi_hold (3)
532 (p0) fma.s1 f43 = f80, f43, f43
533 nop 999;; // EMbo added ...
535 nop 999 // EMbo added ...
537 // w_hi = V_prime * C_hi
539 (p0) fmpy.s1 f44 = f42, f43
540 nop 999;; // EMbo added ...
542 nop 999 // EMbo added ...
// f46 = wsq = w_hi * w_hi.
543 (p0) fmpy.s1 f46 = f44, f44
544 nop 999 // EMbo added ...
546 nop 999 // EMbo added ...
549 // w_lo = V_prime - w_hi * U_prime_hi
551 (p0) fnma.s1 f45 = f44, f40, f42
552 nop 999;; // EMbo added ...
554 nop 999 // EMbo added ...
555 (p0) fma.s1 f47 = f46, f73, f71
556 nop 999 // EMbo added ...
558 nop 999 // EMbo added ...
560 // poly = Q_3 + wsq * Q_4
561 // w_lo = w_lo - w_hi * U_prime_lo
563 (p0) fnma.s1 f45 = f44, f41, f45
564 nop 999;; // EMbo added ...
566 nop 999 // EMbo added ...
567 (p0) fma.s1 f47 = f46, f47, f69
568 nop 999 // EMbo added ...
570 nop 999 // EMbo added ...
572 // poly = Q_2 + wsq * poly
573 // w_lo = w_lo * C_hi
575 (p0) fmpy.s1 f45 = f43, f45
576 nop 999;; // EMbo added ...
578 nop 999 // EMbo added ...
579 (p0) fma.s1 f47 = f46, f47, f67
580 nop 999 // EMbo added ...
582 nop 999 // EMbo added ...
584 // poly = Q_1 + wsq * poly
585 // A_lo = Tbl_lo + w_lo
586 // swap = xor(swap,sign_X)
588 (p0) fadd.s1 f53 = f49, f45
589 nop 999;; // EMbo added ...
591 nop 999 // EMbo added ...
// poly = wsq * poly, then poly = w_hi * poly.
597 (p0) fmpy.s1 f47 = f46, f47
598 nop 999;; // EMbo added ...
600 nop 999 // EMbo added ...
605 // if (p6) sigma = -1.0
606 // if (p7) sigma = 1.0
608 (p0) fmpy.s1 f47 = f44, f47
609 nop 999;; // EMbo added ...
611 nop 999 // EMbo added ...
614 // A_lo = A_lo + poly
616 (p0) fadd.s1 f53 = f53, f47
617 nop 999;; // EMbo added ...
619 nop 999 // EMbo added ...
621 // A_lo = A_lo + w_hi
624 (p0) fadd.s1 f53 = f53, f44
625 nop 999;; // EMbo added ...
627 nop 999 // EMbo added ...
629 // result_hi = P_hi + sigma * A_hi
630 // result_lo = P_lo + sigma * A_lo
632 (p0) fma.s1 f55 = f54, f53, f51
633 (p0) br.cond.sptk L(RETURN_ATAN);;
636 // result = result_hi + result_lo * s_Y (User Supplied Rounding Mode)
638 // (p0) fma.d.s0 f57 = f55, f34, f56
640 // /**************************************************/
641 // /********************* STEP4 **********************/
642 // /**************************************************/
// STEP4 (polynomial path): refine E ~ 1/U with three Newton steps, form
// z = V * E, and evaluate the P_1..P_8 polynomial in zsq = z*z.
// Leaves result_hi in f56 and result_lo in f55, then branches to RETURN_ATAN.
// NOTE(review): the label that starts this section is not visible in this
// listing; it is presumably the ATAN_POLY branch target - confirm.
// r38 = sign_X xor swap; selects sigma below.
646 (p0) xor r38 = r36,r38
647 (p0) addl r39 = @ltoff(Constants_atan#), gp
661 nop 999 // EMbo added ...
662 (p0) movl r47 = 0x24005;;
// Offset 128 from the pool base is the P_8 entry; the ldfe loads below walk
// backwards from P_8 to P_1 in 16-byte steps.
664 (p0) add r39 = 128, r39
665 (p0) fnma.s1 f81 = f37, f35, f1
// p7: r38 == 0 -> sigma = +1.0; p6: otherwise -> sigma = -1.0 (f54).
666 (p0) cmp.eq.unc p7, p6 = 0x00000, r38;;
668 nop 999 // EMbo added ...
669 (p0) ldfe f77 = [r39],-16
671 // Iterate 3 times E = E + E*(1.0 - E*U)
672 // Also load P_8, P_7, P_6, P_5, P_4
673 // E_hold = 1.0 - E * U (1)
678 nop 999 // EMbo added ...
679 (p0) ldfe f76 = [r39],-16
680 (p6) fsub.s1 f54 = f0, f1;;
682 nop 999 // EMbo added ...
683 (p0) ldfe f75 = [r39],-16
685 // E = E + E_hold*E (1)
688 (p0) fma.s1 f37 = f37, f81, f37;;
690 nop 999 // EMbo added ...
691 (p0) ldfe f74 = [r39],-16
// f64 = V - f85 * U.
// NOTE(review): no instruction defining f85 on this path is visible in this
// listing - confirm against the complete source.
692 (p0) fnma.s1 f64 = f85, f35, f36;;
694 nop 999 // EMbo added ...
695 (p0) ldfe f72 = [r39],-16
696 (p7) fadd.s1 f54 = f0, f1;;
698 nop 999 // EMbo added ...
699 (p0) ldfe f70 = [r39],-16
701 // E_hold = 1.0 - E * U (2)
703 (p0) fnma.s1 f81 = f37, f35, f1;;
705 nop 999 // EMbo added ...
706 (p0) ldfe f68 = [r39],-16
// f50 = s_Y * f50.
707 (p0) fmpy.s1 f50 = f34, f50;;
709 nop 999 // EMbo added ...
710 (p0) ldfe f66 = [r39],-16
// NOTE(review): f67 is squared in status field s0 here but is neither loaded
// nor read elsewhere on this path in the visible code - purpose unconfirmed.
711 (p0) fmpy.d.s0 f67 = f67, f67
713 nop 999 // EMbo added ...
715 // E = E + E_hold*E (2)
717 (p0) fma.s1 f37 = f37, f81, f37
718 nop 999;; // EMbo added ...
720 nop 999 // EMbo added ...
722 // E_hold = 1.0 - E * U (3)
724 (p0) fnma.s1 f81 = f37, f35, f1
725 nop 999;; // EMbo added ...
727 nop 999 // EMbo added ...
729 // E = E + E_hold*E (3)
730 // At this point E approximates 1/U to roughly working precision
731 // z = V*E approximates V/U
733 (p0) fma.s1 f37 = f37, f81, f37
734 nop 999;; // EMbo added ...
736 nop 999 // EMbo added ...
// f59 = z = V * E.
740 (p0) fmpy.s1 f59 = f36, f37
741 nop 999 // EMbo added ...
743 nop 999 // EMbo added ...
// f64 = z_lo = (V - f85 * U) * E.
744 (p0) fmpy.s1 f64 = f64, f37
745 nop 999;; // EMbo added ...
747 nop 999 // EMbo added ...
// f60 = zsq.
752 (p0) fmpy.s1 f60 = f59, f59
753 nop 999 // EMbo added ...
755 nop 999 // EMbo added ...
// f52 = A_hi = f85 + z_lo.
756 (p0) fadd.s1 f52 = f85, f64
757 nop 999;; // EMbo added ...
759 nop 999 // EMbo added ...
// poly1 (f62) starts as P_7 + zsq * P_8; poly2 (f63) as P_2 + zsq * P_3.
760 (p0) fma.s1 f62 = f60, f77, f76
761 nop 999 // EMbo added ...
763 nop 999 // EMbo added ...
764 (p0) fma.s1 f63 = f60, f70, f68
765 nop 999;; // EMbo added ...
767 nop 999 // EMbo added ...
// f61 = zsq * zsq (squared once more below).
772 (p0) fmpy.s1 f61 = f60, f60
773 nop 999 // EMbo added ...
775 nop 999 // EMbo added ...
776 (p0) fsub.s1 f85 = f85, f52
777 nop 999;; // EMbo added ...
779 nop 999 // EMbo added ...
// f65 = copy of A_hi.
780 (p0) fmerge.s f65 = f52,f52
781 nop 999;; // EMbo added ...
783 nop 999 // EMbo added ...
784 (p0) fma.s1 f62 = f60, f62, f75
785 nop 999 // EMbo added ...
787 nop 999 // EMbo added ...
788 (p0) fma.s1 f63 = f60, f63, f66
789 nop 999;; // EMbo added ...
791 nop 999 // EMbo added ...
795 // poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
796 // poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3))
799 // poly1 = P_7 + zsq * P_8
800 // poly2 = P_2 + zsq * P_3
801 // poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*poly1))
802 // poly2 = zsq*(P_1 + zsq*poly2)
805 // poly1 = P_6 + zsq * poly1
806 // poly2 = P_1 + zsq * poly2
807 // poly1 = P_4 + zsq*(P_5 + zsq*poly1)
// f61 = z8 = (zsq * zsq) squared.
810 (p0) fmpy.s1 f61 = f61, f61
811 nop 999 // EMbo added ...
813 nop 999 // EMbo added ...
814 (p0) fadd.s1 f64 = f85, f64
815 nop 999;; // EMbo added ...
817 nop 999 // EMbo added ...
818 (p0) fma.s1 f62 = f60, f62, f74
819 nop 999 // EMbo added ...
821 nop 999 // EMbo added ...
823 // poly1 = P_5 + zsq * poly1
824 // poly2 = zsq * poly2
825 // poly1 = P_4 + zsq*poly1
827 (p0) fmpy.s1 f63 = f63, f60
828 nop 999;; // EMbo added ...
830 nop 999 // EMbo added ...
832 // poly1 = P_4 + zsq * poly1
833 // swap = xor(swap,sign_X)
835 (p0) fma.s1 f62 = f60, f62, f72
836 nop 999;; // EMbo added ...
838 nop 999 // EMbo added ...
840 // poly = z8*poly1 + poly2 (Typo in writeup)
844 // z_lo = V - A_temp * U
845 // if (p7) sigma = 1.0
846 // Writeup shows A_temp as A_hi
850 // if (p6) sigma = -1.0
851 // z_lo = (V - A_temp * U) *E
854 // Fixup added to force inexact later -
855 // A_hi = A_temp + z_lo
856 // z_lo = (A_temp - A_hi) + z_lo
857 // z_lo = A_hi - z_lo -A_hi + z_lo = about 0
859 (p0) fma.s1 f47 = f61, f62, f63
860 nop 999;; // EMbo added ...
862 nop 999 // EMbo added ...
864 // A_lo = z * poly + z_lo
866 (p0) fma.s1 f53 = f59, f47, f64
867 nop 999;; // EMbo added ...
869 nop 999 // EMbo added ...
// Re-split the sum: f52 = saved A_hi + A_lo, then f53 = (saved A_hi - f52) + A_lo.
870 (p0) fadd.s1 f52 = f65, f53
871 nop 999;; // EMbo added ...
873 nop 999 // EMbo added ...
874 (p0) fsub.s1 f65 = f65, f52
875 nop 999 // EMbo added ...
877 nop 999 // EMbo added ...
// f52 = s_Y * A_hi.
878 (p0) fmpy.s1 f52 = f34, f52
879 nop 999;; // EMbo added ...
881 nop 999 // EMbo added ...
882 (p0) fadd.s1 f53 = f65, f53
883 nop 999 // EMbo added ...
// NOTE(review): f65 is set from r47 (0x24005) here and p6 is set from the
// fclass below, but neither is consumed before being overwritten in the
// visible code - confirm against the complete source.
885 (p0) setf.exp f65 = r47
886 (p0) fma.s1 f56 = f54, f52, f50
887 nop 999;; // EMbo added ...
889 nop 999 // EMbo added ...
890 (p0) fclass.m.unc p6,p0 = f53,0x007
891 nop 999;; // EMbo added ...
893 nop 999 // EMbo added ...
899 // result_hi = P_hi + sigma * A_hi
902 nop 999 // EMbo added ...
904 nop 999 // EMbo added ...
906 // tmp = P_hi - result_hi
908 (p0) fsub.s1 f65 = f50, f56
909 nop 999;; // EMbo added ...
911 nop 999 // EMbo added ...
912 (p0) fma.s1 f65 = f52, f54, f65
913 nop 999 // EMbo added ...
915 nop 999 // EMbo added ...
917 // tmp = sigma * A_hi + tmp
918 // sigma = A_lo * sigma + P_lo
920 (p0) fma.s1 f54 = f53, f54, f51
921 nop 999;; // EMbo added ...
923 nop 999 // EMbo added ...
925 // result_lo = s_Y * sigma + tmp
927 (p0) fma.s1 f55 = f34, f54, f65
928 nop 999;; // EMbo added ...
932 (p0) br.cond.sptk L(RETURN_ATAN);;
935 // result = result_hi + result_lo (User Supplied Rounding Mode)
937 // (p0) fadd.d.s0 f57 = f55, f56
// Exits for NatVal, unsupported and NaN operands: the result is the product
// f8 * f9 computed in status field s0 and written to f57.
// NOTE(review): the ATAN_NATVAL / ATAN_UNSUPPORTED / ATAN_NAN labels are not
// visible in this listing; these two sequences are presumably their targets.
941 nop 999 // EMbo added ...
943 // Deal with the NatVal and unsupported cases.
944 // Raise invalid if warranted.
946 (p0) fmpy.d.s0 f57 = f8, f9
947 br.cond.sptk L(RETURN_ATAN);;
951 nop 999 // EMbo added ...
953 // If only one NaN, then generate the resulting
954 // NaN and return - may raise invalid.
956 (p0) fmpy.d.s0 f57 = f8, f9
957 (p0) br.cond.sptk L(RETURN_ATAN);;
// ATAN_SPECIAL_HANDLING: entered when frcpa could not supply a reciprocal
// approximation.  This section handles ArgY = +/-0 itself and hands every
// other case to ATAN_ArgY_Not_ZERO.  The result is written to f57 as the sum
// of a hi/lo pair (f56 + f55).
959 L(ATAN_SPECIAL_HANDLING):
962 (p0) addl r39 = @ltoff(Constants_atan#), gp
964 (p0) fcmp.lt.s0 p0,p7 = f8,f1
969 // Raise denormal operand faults if necessary
974 (p0) fcmp.lt.s0 p0,p6 = f9,f1
975 nop 999;; // EMbo added ...
982 nop 999 // EMbo added ...
// p6: ArgY is +/-0; p7: ArgY is not zero.
983 (p0) fclass.m.unc p6,p7 = f32,0x007
984 nop 999;; // EMbo added ...
986 nop 999 // EMbo added ...
// 992 is the byte offset of the "I two doubles" entry in the constant pool.
987 (p0) movl r47 = 992;;
989 (p0) add r39 = r39, r47
990 nop 999 // EMbo added ...
991 (p7) br.cond.sptk L(ATAN_ArgY_Not_ZERO);;
993 nop 999 // EMbo added ...
// p14: ArgX is a positive zero, normal or infinity (mask 0x035).
994 (p6) fclass.m.unc p14,p0 = f33,0x035
995 nop 999 // EMbo added ...
997 nop 999 // EMbo added ...
// p15: ArgX is a negative zero, normal or infinity (mask 0x036).
998 (p6) fclass.m.unc p15,p0 = f33,0x036
999 nop 999;; // EMbo added ...
1001 nop 999 // EMbo added ...
// p13: ArgX is +/-0 as well, i.e. both arguments are zero.
1002 (p6) fclass.m.unc p13,p0 = f33,0x007
1003 nop 999 // EMbo added ...
// f56, f55 = hi and lo doubles of the I entry.
1005 (p0) ldfd f56 = [r39],8
1006 nop 999 // EMbo added ...
1007 nop 999;; // EMbo added ...
1009 (p0) ldfd f55 = [r39],-8
// p14: replace both parts with zero carrying ArgY's sign.
1010 (p14) fmerge.s f56 = f32,f0
1011 nop 999;; // EMbo added ...
1013 nop 999 // EMbo added ...
1015 // Return sign_Y * 0 when Y = +/-0 and X > 0
1017 (p14) fmerge.s f55 = f32,f0
1018 nop 999;; // EMbo added ...
1020 nop 999 // EMbo added ...
// p15: keep the loaded hi/lo pair but give both parts ArgY's sign.
1021 (p15) fmerge.s f56 = f32,f56
1022 nop 999;; // EMbo added ...
1024 nop 999 // EMbo added ...
1026 // Return sign_Y * PI when X < -0
1029 (p15) fmerge.s f55 = f32,f55
1030 nop 999;; // EMbo added ...
1032 nop 999 // EMbo added ...
1033 (p0) fadd.d.s0 f57 = f56,f55
1037 // Call error support function for atan(0,0)
1038 // - expected value already computed.
// NOTE(review): no use of p13 or call to an error routine is visible in
// this listing - confirm against the complete source.
1042 (p0) br.cond.sptk L(RETURN_ATAN)
// ATAN_ArgY_Not_ZERO: ArgY is non-zero.  If ArgY is not an infinity, control
// goes to ATAN_ArgY_Not_INF.  Otherwise the result depends on ArgX:
//   ArgX zero or normal -> I/2,  ArgX = +Inf -> I/4,  ArgX = -Inf -> 3I/4,
// each given ArgY's sign.  On entry r39 points at the I entry of the pool;
// adding 16 / 32 / 48 selects I_by_2 / I_by_4 / 3I_by_4.
1044 L(ATAN_ArgY_Not_ZERO):
1046 nop 999 // EMbo added ...
// p9: ArgY is +/-Inf; p10: it is not.
1047 (p0) fclass.m.unc p9,p10 = f32,0x023
1048 nop 999;; // EMbo added ...
1050 nop 999 // EMbo added ...
// p6: ArgX is a zero or a normal number of either sign (mask 0x017).
1051 (p9) fclass.m.unc p6,p0 = f33,0x017
1052 (p10) br.cond.sptk L(ATAN_ArgY_Not_INF);;
1054 (p6) add r39 = 16,r39
// p7: ArgX is +Inf.
1055 (p9) fclass.m.unc p7,p0 = f33,0x021
1056 nop 999;; // EMbo added ...
1058 nop 999 // EMbo added ...
1059 (p0) ldfd f56 = [r39],8
// p8: ArgX is -Inf.
1060 (p9) fclass.m.unc p8,p0 = f33,0x022;;
1062 (p0) ldfd f55 = [r39],-8
1063 nop 999 // EMbo added ...
1064 nop 999;; // EMbo added ...
1066 nop 999 // EMbo added ...
1067 (p6) fmerge.s f56 = f32,f56
1068 nop 999;; // EMbo added ...
1070 nop 999 // EMbo added ...
1071 (p6) fmerge.s f55 = f32,f55
1072 nop 999;; // EMbo added ...
1074 nop 999 // EMbo added ...
1076 // Load I/2 and adjust its sign.
1077 // Return +I/2 when ArgY = +Inf and ArgX = +/-0,normal
1078 // Return -I/2 when ArgY = -Inf and ArgX = +/-0,normal
1080 (p6) fadd.d.s0 f57 = f56, f55
1081 (p6) br.cond.sptk L(RETURN_ATAN);;
// Not taken above, so r39 still points at the I entry; +32 is I_by_4.
1083 (p7) add r39 = 32,r39;;
1084 (p7) ldfd f56 = [r39],8
1085 nop 999;; // EMbo added ...
1087 nop 999;; // EMbo added ...
1088 (p7) ldfd f55 = [r39],-8
1089 nop 999;; // EMbo added ...
1091 nop 999 // EMbo added ...
1092 (p7) fmerge.s f56 = f32,f56
1093 nop 999;; // EMbo added ...
1095 nop 999 // EMbo added ...
1096 (p7) fmerge.s f55 = f32,f55
1097 nop 999;; // EMbo added ...
1099 nop 999 // EMbo added ...
1101 // Load PI/4 and adjust its sign.
1102 // Return +PI/4 when ArgY = +Inf and ArgX = +Inf
1103 // Return -PI/4 when ArgY = -Inf and ArgX = +Inf
1105 (p7) fadd.d.s0 f57 = f56, f55
1106 (p7) br.cond.sptk L(RETURN_ATAN);;
// +48 from the I entry is 3I_by_4.
1108 (p8) add r39 = 48,r39;;
1109 (p8) ldfd f56 =[r39],8
1110 nop 999;; // EMbo added ...
1112 nop 999;; // EMbo added ...
1113 (p8) ldfd f55 =[r39],-8
1114 nop 999;; // EMbo added ...
1116 nop 999 // EMbo added ...
1117 (p8) fmerge.s f56 = f32,f56
1118 nop 999;; // EMbo added ...
1120 nop 999 // EMbo added ...
1121 (p8) fmerge.s f55 = f32,f55
1122 nop 999;; // EMbo added ...
1124 nop 999 // EMbo added ...
1126 // Load 3I/4 and adjust its sign.
1127 // Return +3I/4 when ArgY = +Inf and ArgX = -Inf
1128 // Return -3I/4 when ArgY = -Inf and ArgX = -Inf
1130 (p8) fadd.d.s0 f57 = f56, f55
1131 (p8) br.cond.sptk L(RETURN_ATAN);;
// ATAN_ArgY_Not_INF: ArgY is neither zero nor infinite.  The result depends
// on ArgX:  +/-0 -> I/2,  +Inf -> 0,  -Inf -> I,  each given ArgY's sign.
// r39 points at the I entry of the constant pool on entry; +16 is I_by_2.
1133 L(ATAN_ArgY_Not_INF):
1135 nop 999 // EMbo added ...
// p6: ArgX is +/-0.
1136 (p0) fclass.m.unc p6,p0 = f33,0x007
1137 nop 999 // EMbo added ...
1139 nop 999 // EMbo added ...
// p7: ArgX is +Inf.
1140 (p0) fclass.m.unc p7,p0 = f33,0x021
1141 nop 999;; // EMbo added ...
1143 nop 999 // EMbo added ...
// p8: ArgX is -Inf.
1144 (p0) fclass.m.unc p8,p0 = f33,0x022
1145 (p6) add r39 = 16,r39;;
1147 (p6) ldfd f56 =[r39],8
1148 nop 999 // EMbo added ...
1149 nop 999;; // EMbo added ...
1151 nop 999;; // EMbo added ...
1152 (p6) ldfd f55 =[r39],-8
1153 nop 999;; // EMbo added ...
1155 nop 999 // EMbo added ...
1156 (p6) fmerge.s f56 = f32,f56
1157 nop 999;; // EMbo added ...
1159 nop 999 // EMbo added ...
1160 (p6) fmerge.s f55 = f32,f55
1161 nop 999;; // EMbo added ...
1163 nop 999 // EMbo added ...
1165 // return = sign_Y * I/2 when ArgX = +/-0
1167 (p6) fadd.d.s0 f57 = f56, f55
1168 (p6) br.cond.sptk L(RETURN_ATAN);;
1170 nop 999 // EMbo added ...
// Both parts become zero carrying ArgY's sign.
1171 (p7) fmerge.s f56 = f32,f0
1172 nop 999 // EMbo added ...
1174 nop 999 // EMbo added ...
1175 (p7) fmerge.s f55 = f32,f0
1176 nop 999;; // EMbo added ...
1178 nop 999 // EMbo added ...
1180 // return = sign_Y * 0 when ArgX = Inf
1182 (p7) fadd.d.s0 f57 = f56, f55
1183 (p7) br.cond.sptk L(RETURN_ATAN);;
// Not taken above, so r39 still points at the I entry (hi, lo).
1185 (p8) ldfd f56 = [r39],8
1186 nop 999 // EMbo added ...
1187 nop 999;; // EMbo added ...
1189 nop 999;; // EMbo added ...
1190 (p8) ldfd f55 = [r39],-8
1191 nop 999;; // EMbo added ...
1193 nop 999 // EMbo added ...
1194 (p8) fmerge.s f56 = f32,f56
1195 nop 999;; // EMbo added ...
1197 nop 999 // EMbo added ...
1198 (p8) fmerge.s f55 = f32,f55
1199 nop 999;; // EMbo added ...
1201 nop 999 // EMbo added ...
1203 // return = sign_Y * I when ArgX = -Inf
1205 (p8) fadd.d.s0 f57 = f56, f55
1206 nop 999 // EMbo added ...
// NOTE(review): the RETURN_ATAN label and the return sequence are not
// visible in this listing; execution presumably falls through to them here.
1210 // The answer is in f57.
1214 // W is in f9 and untouched
1233 .endp __libm_atan2_reg
1234 ASM_SIZE_DIRECTIVE(__libm_atan2_reg)