sysdeps/ia64/fpu/s_atan.S

   1 .file "atan.s"
   2
   3
   4 // Copyright (c) 2000 - 2003, Intel Corporation
   5 // All rights reserved.
   6 //
   7 // Contributed 2000 by the Intel Numerics Group, Intel Corporation
   8 //
   9 // Redistribution and use in source and binary forms, with or without
  10 // modification, are permitted provided that the following conditions are
  11 // met:
  12 //
  13 // * Redistributions of source code must retain the above copyright
  14 // notice, this list of conditions and the following disclaimer.
  15 //
  16 // * Redistributions in binary form must reproduce the above copyright
  17 // notice, this list of conditions and the following disclaimer in the
  18 // documentation and/or other materials provided with the distribution.
  19 //
  20 // * The name of Intel Corporation may not be used to endorse or promote
  21 // products derived from this software without specific prior written
  22 // permission.
  23
  24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35 //
  36 // Intel Corporation is the author of this code, and requests that all
  37 // problem reports or change requests be submitted to it directly at
  38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
  39 //
  40 // History
  41 //==============================================================
  42 // 02/02/00  Initial version
  43 // 04/13/00  Improved speed
  44 // 04/19/00  Removed the qualifying predicate from the fmerge.s that
  45 //           takes the absolute value.
  46 // 06/16/00  Reassigned FP registers to eliminate stalls on loads
  47 // 08/30/00  Saved 5 cycles in main path by rearranging large argument logic
  48 //           and delaying use of result of fcmp in load by 1 group
  49 // 05/20/02  Cleaned up namespace and sf0 syntax
  50 // 08/20/02  Use atan2 algorithm with x=1 for better accuracy
  51 // 02/06/03  Reordered header: .section, .global, .proc, .align
  52 //
  53 // API
  54 //==============================================================
  55 // double atan(double Y)
  56 //
  57 // Overview of operation
  58 //==============================================================
  59 //
  60 // The atan function returns values in the interval [-pi/2,+pi/2].
  61 //
  62 // The algorithm used is the atan2(Y,X) algorithm where we fix X=1.0.
  63 //
  64 // There are two basic paths: swap true and swap false.
   65 // atan2(Y,X) ==> atan(V/U) where |U| >= |V|. If |Y| > |X|, we must swap.
  66 //
  67 // p6  swap True    |Y| > |X|
  68 // p7  swap False   |Y| <= |X|
  69 //
  70 //
  71 // Simple trigonometric identities show
  72 //   Region 1
  73 //         |Y|<=1.0, V=Y, U=1.0     atan2(Y,X) = sgnY * (0 + atan(V/U))
  74 //
  75 //   Region 2
  76 //         |Y|>1.0, V=1.0, U=Y      atan2(Y,X) = sgnY * (pi/2 - atan(V/U))
  77 //
  78 //
  79 // We compute atan(V/U) from the identity
  80 //      atan(z) + atan([(V/U)-z] / [1+(V/U)z])
  81 //      where z is a limited precision approximation (16 bits) to V/U
  82 //
  83 // z is calculated with the assistance of the frcpa instruction.
  84 //
  85 // atan(z) is calculated by a polynomial z + z^3 * p(w),  w=z^2
  86 // where p(w) = P0+P1*w+...+P22*w^22
  87 //
   88 // Let d = [(V/U)-z] / [1+(V/U)z] = (V-U*z)/(U+V*z)
  89 //
  90 // Approximate atan(d) by d + P0*d^3
  91 // Let F = 1/(U+V*z) * (1-a), where |a|< 2^-8.8.
  92 // Compute q(a) = 1 + a + ... + a^5.
  93 // Then F*q(a) approximates the reciprocal to more than 50 bits.
  94
  95 // Special values
  96 //==============================================================
  97 // atan(QNAN)  = QNAN
  98 // atan(SNAN)  = quieted SNAN
  99 // atan(+-inf) = +- pi/2
 100 // atan(+-0)   = +-0
 101
 102 // Registers used
 103 //==============================================================
 104
 105 // predicate registers used:
 106 // p6 -> p15
 107
 108 // floating-point registers used:
 109 // f8, input
 110 // f32 -> f116
 111
 112 // general registers used
 113 // r14 -> r16
 114
 115 // Assembly macros
 116 //==============================================================
  117 // Symbolic names for the general and floating-point registers used by atan below.
  118 EXP_AD_P1                    = r14   // pointer that walks atan2_tb1 (post-incremented by the ldfe loads)
  119 EXP_AD_P2                    = r15   // pointer that walks atan2_tb2 (set to EXP_AD_P1 + 0xd0)
  120 rsig_near_one                = r16   // receives the integer 0x8000000000000001
  121
  122 atan2_Y                      = f8    // argument Y on entry; the result is also written here
  123 atan2_X                      = f1    // X is fixed to 1.0 (see "Overview of operation")
  124
  125 atan2_u1_X                   = f32   // frcpa approximation of 1/X
  126 atan2_u1_Y                   = f33   // frcpa approximation of 1/Y
  127 atan2_z2_X                   = f34   // u1_X * Y^2
  128
  129 atan2_two                    = f36   // 2.0, built as f1*f1+f1
  130 atan2_B1sq_Y                 = f37   // B1Y^2
  131 atan2_z1_X                   = f38   // u1_X * Y
  132 atan2_B1X                    = f40   // 2 - u1_X*X
  133
  134 atan2_B1Y                    = f41   // 2 - u1_Y*Y
  135 atan2_wp_X                   = f42   // z1_X^2
  136 atan2_B1sq_X                 = f43   // B1X^2
  137 atan2_z                      = f44   // z of the overview: limited-precision approximation to V/U
  138 atan2_w                      = f45   // w = z^2, the polynomial argument
  139 // P0..P22: polynomial coefficients, loaded with ldfe from atan2_tb1 / atan2_tb2
  140 atan2_P0                     = f46
  141 atan2_P1                     = f47
  142 atan2_P2                     = f48
  143 atan2_P3                     = f49
  144 atan2_P4                     = f50
  145
  146 atan2_P5                     = f51
  147 atan2_P6                     = f52
  148 atan2_P7                     = f53
  149 atan2_P8                     = f54
  150 atan2_P9                     = f55
  151
  152 atan2_P10                    = f56
  153 atan2_P11                    = f57
  154 atan2_P12                    = f58
  155 atan2_P13                    = f59
  156 atan2_P14                    = f60
  157
  158 atan2_P15                    = f61
  159 atan2_P16                    = f62
  160 atan2_P17                    = f63
  161 atan2_P18                    = f64
  162 atan2_P19                    = f65
  163
  164 atan2_P20                    = f66
  165 atan2_P21                    = f67
  166 atan2_P22                    = f68
  167 atan2_pi_by_2                = f69   // pi/2 as loaded from the end of atan2_tb2
  168 atan2_sgn_pi_by_2            = f69   // same register: sgnY * pi/2 overwrites the loaded constant
  169 atan2_V13                    = f70   // V*/W*: partial sums of the polynomial in w
  170
  171 atan2_W11                    = f71
  172 atan2_E                      = f72   // denominator U + V*z
  173 atan2_wp_Y                   = f73   // u1_Y^2
  174 atan2_V11                    = f74
  175 atan2_V12                    = f75   // w^2
  176
  177 atan2_V7                     = f76
  178 atan2_V8                     = f77
  179 atan2_W7                     = f78
  180 atan2_W8                     = f79
  181 atan2_W3                     = f80
  182
  183 atan2_W4                     = f81
  184 atan2_V3                     = f82
  185 atan2_V4                     = f83
  186 atan2_F                      = f84   // frcpa approximation of 1/E
  187 atan2_gV                     = f85   // numerator V - U*z
  188
  189 atan2_V10                    = f86
  190 atan2_zcub                   = f87   // z * w
  191 atan2_V6                     = f88
  192 atan2_V9                     = f89   // V12^2
  193 atan2_W10                    = f90
  194
  195 atan2_W6                     = f91
  196 atan2_W2                     = f92
  197 atan2_V2                     = f93
  198 atan2_alpha                  = f94   // 1 - E*F
  199 atan2_alpha_1                = f95   // 2 - E*F
  200
  201 atan2_gVF                    = f96   // gV * F
  202 atan2_V5                     = f97
  203 atan2_W12                    = f98   // V9^2, then overwritten with V9^3
  204 atan2_W5                     = f99
  205 atan2_alpha_sq               = f100  // alpha^2
  206
  207 atan2_Cp                     = f101  // 1 + alpha*alpha_1
  208 atan2_V1                     = f102
  209 atan2_ysq                    = f103  // Y*Y
  210 atan2_W1                     = f104
  211 atan2_alpha_cub              = f105  // alpha^3
  212
  213 atan2_C                      = f106  // gVF * Cp
  214 atan2_d                      = f108  // d of the overview: C + alpha_cub*C
  215 atan2_A_hi                   = f109  // z + zcub*Pp
  216 atan2_dsq                    = f110  // d^2
  217
  218 atan2_pd                     = f111  // P0 * d
  219 atan2_A_lo                   = f112  // d + pd*dsq
  220 atan2_A                      = f113  // A_hi + A_lo (swap path only)
  221 atan2_Pp                     = f114  // V1 + W12*W1
  222 atan2_sgnY                   = f115  // 1.0 carrying the sign of Y
  223
  224 atan2_sig_near_one           = f116  // target of setf.sig from rsig_near_one
  225 atan2_near_one               = f116  // same register after fmerge.se with f1: constant ~1.0
 226
 227 /////////////////////////////////////////////////////////////
  228 // Coefficient table 1: P11 down to P0, in the order the ldfe loads in atan consume them.
  229 // Each entry is two data8 words (16 bytes): the 64-bit significand, then the sign/exponent word.
  230 RODATA
  231 // atan reaches atan2_tb2 by adding 0xd0 (13 entries * 16 bytes) to this table's address,
  232 .align 16
  233 // so the number of entries here, including the pad entry, must not change.
  234 LOCAL_OBJECT_START(atan2_tb1)
  235 data8 0xA21922DC45605EA1 ,  0x00003FFA // P11
  236 data8 0xB199DD6D2675C40F ,  0x0000BFFA // P10
  237 data8 0xC2F01E5DDD100DBE ,  0x00003FFA // P9
  238 data8 0xD78F28FC2A592781 ,  0x0000BFFA // P8
  239 data8 0xF0F03ADB3FC930D3 ,  0x00003FFA // P7
  240 data8 0x88887EBB209E3543 ,  0x0000BFFB // P6
  241 data8 0x9D89D7D55C3287A5 ,  0x00003FFB // P5
  242 data8 0xBA2E8B9793955C77 ,  0x0000BFFB // P4
  243 data8 0xE38E38E320A8A098 ,  0x00003FFB // P3
  244 data8 0x9249249247E37913 ,  0x0000BFFC // P2
  245 data8 0xCCCCCCCCCCC906CD ,  0x00003FFC // P1
  246 data8 0xAAAAAAAAAAAAA8A9 ,  0x0000BFFD // P0
  247 data8 0x0000000000000000 ,  0x00000000 // pad to avoid bank conflict
  248 LOCAL_OBJECT_END(atan2_tb1)
  249 // Coefficient table 2, in load order: P21 down to P12, then P22, then pi/2 (same 16-byte entry format).
  250 LOCAL_OBJECT_START(atan2_tb2)
  251 data8 0xCE585A259BD8374C ,  0x00003FF0 // P21
  252 data8 0x9F90FB984D8E39D0 ,  0x0000BFF3 // P20
  253 data8 0x9D3436AABE218776 ,  0x00003FF5 // P19
  254 data8 0xDEC343E068A6D2A8 ,  0x0000BFF6 // P18
  255 data8 0xF396268151CFB11C ,  0x00003FF7 // P17
  256 data8 0xD818B4BB43D84BF2 ,  0x0000BFF8 // P16
  257 data8 0xA2270D30A90AA220 ,  0x00003FF9 // P15
  258 data8 0xD5F4F2182E7A8725 ,  0x0000BFF9 // P14
  259 data8 0x80D601879218B53A ,  0x00003FFA // P13
  260 data8 0x9297B23CCFFB291F ,  0x0000BFFA // P12
  261 data8 0xFE7E52D2A89995B3 ,  0x0000BFEC // P22
  262 data8 0xC90FDAA22168C235 ,  0x00003FFF // pi/2
  263 LOCAL_OBJECT_END(atan2_tb2)
  264 // double atan(double Y): the argument arrives in f8 and the result is left in f8.
  265 // Runs the atan2(Y,X) algorithm with X fixed to f1.  NaN, +-inf and +-0 inputs leave
  266 // through early predicated br.ret exits; every other input falls through to the last
  267 // bundle.  Scratch state: r14-r16, f32-f116, p6-p15 (see "Registers used" above).
  268 .section .text
  269 GLOBAL_LIBM_ENTRY(atan)
  270 // Start the reciprocal approximations and fetch the address of the coefficient table.
  271 { .mfi
  272            nop.m 999
  273            frcpa.s1     atan2_u1_Y,p7 = f1,atan2_Y   // u1_Y ~ 1/Y; p7 is overwritten by the fcmp below
  274            nop.i 999
  275 }
  276 { .mfi
  277            addl         EXP_AD_P1   = @ltoff(atan2_tb1), gp   // gp-relative linkage-table slot for atan2_tb1
  278            fma.s1       atan2_two  = f1,f1,f1   // two = 1*1 + 1
  279            nop.i 999
  280 ;;
  281 }
  282
  283 { .mfi
  284            ld8  EXP_AD_P1 = [EXP_AD_P1]   // load the table address from that slot
  285            frcpa.s1     atan2_u1_X,p6 = f1,atan2_X   // u1_X ~ 1/X; p6 is overwritten by the fcmp below
  286            nop.i 999
  287 }
  288 { .mfi
  289            nop.m 999
  290            fma.s1       atan2_ysq  = atan2_Y,atan2_Y,f0   // ysq = Y*Y
  291            nop.i 999
  292 }
  293 ;;
  294
  295 { .mfi
  296            add  EXP_AD_P2 = 0xd0,EXP_AD_P1   // second pointer 0xd0 bytes in: start of the P21.. entries
  297            fmerge.s     atan2_sgnY = atan2_Y,f1   // sgnY = 1.0 with the sign of Y
  298            nop.i 999
  299 }
  300 ;;
  301
  302 // Coefficient loads (both pointers post-increment by 16) are interleaved with the setup arithmetic.
  303 { .mfi
  304            ldfe         atan2_P11  = [EXP_AD_P1],16
  305            fclass.m p10,p0 = atan2_Y, 0xc3     // Test for y=nan
  306            nop.i 999
  307 }
  308 { .mfi
  309            ldfe         atan2_P21  = [EXP_AD_P2],16
  310            nop.f 999
  311            nop.i 999
  312 ;;
  313 }
  314
  315
  316 { .mfi
  317            ldfe         atan2_P10  = [EXP_AD_P1],16
  318            fnma.s1      atan2_B1Y  = atan2_u1_Y, atan2_Y, atan2_two   // B1Y = 2 - u1_Y*Y
  319            nop.i 999
  320 }
  321 { .mfi
  322            ldfe         atan2_P20  = [EXP_AD_P2],16
  323            fma.s1       atan2_wp_Y   = atan2_u1_Y, atan2_u1_Y, f0   // wp_Y = u1_Y^2
  324            nop.i 999
  325 ;;
  326 }
  327
  328 { .mfi
  329            ldfe         atan2_P9   = [EXP_AD_P1],16
  330            fma.s1       atan2_z1_X = atan2_u1_X, atan2_Y, f0   // z1_X = u1_X*Y
  331            nop.i 999
  332 }
  333 { .mfi
  334            ldfe         atan2_P19  = [EXP_AD_P2],16
  335            fnma.s1      atan2_B1X  = atan2_u1_X, atan2_X, atan2_two   // B1X = 2 - u1_X*X
  336            nop.i 999
  337 }
  338 ;;
  339
  340 { .mfi
  341            ldfe         atan2_P8   = [EXP_AD_P1],16
  342            fma.s1       atan2_z2_X = atan2_u1_X, atan2_ysq, f0   // z2_X = u1_X*Y^2
  343            nop.i 999
  344 }
  345 { .mfb
  346            ldfe         atan2_P18  = [EXP_AD_P2],16
  347 (p10)      fma.d.s0 f8 = atan2_Y,atan2_X,f0   // If y=nan, result quietized y
  348 (p10)      br.ret.spnt b0        // Exit if y=nan
  349 }
  350 ;;
  351
  352 // p6 true if swap,    means |y| >  1.0    or ysq > 1.0
  353 // p7 true if no swap, means 1.0 >= |y|    or 1.0 >= ysq
  354 { .mfi
  355            ldfe         atan2_P7   = [EXP_AD_P1],16
  356            fcmp.ge.s1 p7,p6    = f1, atan2_ysq   // replaces the p7/p6 values written by the frcpa instructions
  357            nop.i 999
  358 }
  359 { .mmf
  360            ldfe         atan2_P17  = [EXP_AD_P2],16
  361            nop.m 999
  362            nop.f 999
  363 }
  364 ;;
  365
  366 { .mfi
  367            ldfe         atan2_P6   = [EXP_AD_P1],16
  368            fma.s1       atan2_E         = atan2_u1_Y, atan2_B1Y, atan2_Y   // E = Y + u1_Y*B1Y (swap value; replaced below under p7)
  369            nop.i 999
  370 }
  371 { .mfi
  372            ldfe         atan2_P16  = [EXP_AD_P2],16
  373            fma.s1       atan2_B1sq_Y = atan2_B1Y, atan2_B1Y, f0   // B1sq_Y = B1Y^2
  374            nop.i 999
  375 ;;
  376 }
  377
  378 { .mfi
  379            ldfe         atan2_P5   = [EXP_AD_P1],16
  380 (p7)       fma.s1       atan2_wp_X   = atan2_z1_X, atan2_z1_X, f0   // no swap: wp_X = z1_X^2
  381            nop.i 999
  382 }
  383 { .mfi
  384            ldfe         atan2_P15       = [EXP_AD_P2],16
  385 (p7)       fma.s1       atan2_B1sq_X = atan2_B1X, atan2_B1X, f0   // no swap: B1sq_X = B1X^2
  386            nop.i 999
  387 ;;
  388 }
  389
  390 { .mfi
  391            ldfe         atan2_P4   = [EXP_AD_P1],16
  392 (p6)       fma.s1       atan2_z         = atan2_u1_Y, atan2_B1Y, f0   // swap: z = u1_Y*B1Y
  393            nop.i 999
  394 }
  395 { .mfi
  396            ldfe         atan2_P14  = [EXP_AD_P2],16
  397 (p7)       fma.s1       atan2_E         = atan2_z2_X, atan2_B1X, atan2_X   // no swap: E = X + z2_X*B1X
  398            nop.i 999
  399 ;;
  400 }
  401
  402
  403 { .mfi
  404            ldfe         atan2_P3        = [EXP_AD_P1],16
  405            fcmp.eq.s0  p14,p15=atan2_X,atan2_Y  // Dummy for denorm and invalid
  406            nop.i 999
  407 }
  408 { .mmf
  409            ldfe         atan2_P13  = [EXP_AD_P2],16
  410            nop.m 999
  411 (p7)       fma.s1       atan2_z         = atan2_z1_X, atan2_B1X, f0   // no swap: z = z1_X*B1X
  412 ;;
  413 }
  414
  415 { .mfi
  416            ldfe         atan2_P2        = [EXP_AD_P1],16
  417 (p6)       fma.s1       atan2_w         = atan2_wp_Y, atan2_B1sq_Y,f0   // swap: w = u1_Y^2 * B1Y^2
  418            nop.i 999
  419 }
  420 { .mlx
  421            ldfe         atan2_P12  = [EXP_AD_P2],16
  422            movl         rsig_near_one = 0x8000000000000001 // signif near 1.0
  423 ;;
  424 }
  425
  426 { .mfi
  427            ldfe         atan2_P1        = [EXP_AD_P1],16
  428            fclass.m p9,p0 = atan2_Y, 0x23  // test if y inf
  429            nop.i 999
  430 }
  431 { .mfi
  432            ldfe         atan2_P22       = [EXP_AD_P2],16
  433 (p7)       fma.s1       atan2_w         = atan2_wp_X, atan2_B1sq_X,f0   // no swap: w = z1_X^2 * B1X^2
  434            nop.i 999
  435 ;;
  436 }
  437
  438 { .mfi
  439            ldfe         atan2_P0        = [EXP_AD_P1],16
  440            frcpa.s1     atan2_F,p0     = f1, atan2_E   // F ~ 1/E; predicate result discarded
  441            nop.i 999
  442 }
  443 { .mfi
  444            ldfe         atan2_pi_by_2   = [EXP_AD_P2],16
  445 (p6)       fnma.s1       atan2_gV        = atan2_Y, atan2_z, atan2_X   // swap: gV = X - Y*z
  446            nop.i 999
  447 ;;
  448 }
  449
  450 { .mfi
  451            setf.sig      atan2_sig_near_one = rsig_near_one   // put the integer constant in f116's significand
  452 (p7)       fnma.s1       atan2_gV        = atan2_X, atan2_z, atan2_Y   // no swap: gV = Y - X*z
  453            nop.i 999
  454 }
  455 { .mfb
  456            nop.m 999
  457 (p9)       fma.d.s0  f8 = atan2_sgnY, atan2_pi_by_2, f0  // +-pi/2 if y inf
  458 (p9)       br.ret.spnt b0      // exit if y inf, result is +-pi/2
  459 ;;
  460 }
  461 // Evaluate p(w) as pairwise sums combined with w^2, w^4 and w^12, alongside the correction term d.
  462 { .mfi
  463            nop.m 999
  464            fma.s1       atan2_V13       = atan2_w, atan2_P11, atan2_P10   // V13 = P10 + w*P11
  465            nop.i 999
  466 }
  467 { .mfi
  468            nop.m 999
  469            fma.s1       atan2_W11       = atan2_w, atan2_P21, atan2_P20   // W11 = P20 + w*P21
  470            nop.i 999
  471 ;;
  472 }
  473
  474 { .mfi
  475            nop.m 999
  476            fma.s1       atan2_V11       = atan2_w, atan2_P9, atan2_P8   // V11 = P8 + w*P9
  477            nop.i 999
  478 }
  479 { .mfi
  480            nop.m 999
  481            fma.s1       atan2_V12       = atan2_w, atan2_w, f0   // V12 = w^2
  482            nop.i 999
  483 ;;
  484 }
  485
  486 { .mfi
  487            nop.m 999
  488            fma.s1       atan2_V8        = atan2_w, atan2_P7 , atan2_P6   // V8 = P6 + w*P7
  489            nop.i 999
  490 }
  491 { .mfi
  492            nop.m 999
  493            fma.s1       atan2_W8        = atan2_w, atan2_P19, atan2_P18   // W8 = P18 + w*P19
  494            nop.i 999
  495 ;;
  496 }
  497
  498 { .mfi
  499            nop.m 999
  500            fnma.s1      atan2_alpha     = atan2_E, atan2_F, f1   // alpha = 1 - E*F
  501            nop.i 999
  502 }
  503 { .mfi
  504            nop.m 999
  505            fnma.s1      atan2_alpha_1   = atan2_E, atan2_F, atan2_two   // alpha_1 = 2 - E*F
  506            nop.i 999
  507 ;;
  508 }
  509
  510
  511 { .mfi
  512            nop.m 999
  513            fma.s1       atan2_V7        = atan2_w, atan2_P5 , atan2_P4   // V7 = P4 + w*P5
  514            nop.i 999
  515 }
  516 { .mfi
  517            nop.m 999
  518            fma.s1       atan2_W7        = atan2_w, atan2_P17, atan2_P16   // W7 = P16 + w*P17
  519            nop.i 999
  520 ;;
  521 }
  522
  523 { .mfi
  524            nop.m 999
  525            fma.s1       atan2_V4        = atan2_w, atan2_P3 , atan2_P2   // V4 = P2 + w*P3
  526            nop.i 999
  527 }
  528 { .mfi
  529            nop.m 999
  530            fma.s1       atan2_W4        = atan2_w, atan2_P15, atan2_P14   // W4 = P14 + w*P15
  531            nop.i 999
  532 ;;
  533 }
  534
  535 { .mfi
  536            nop.m 999
  537            fma.s1       atan2_V3        = atan2_w, atan2_P1 , atan2_P0   // V3 = P0 + w*P1
  538            nop.i 999
  539 }
  540 { .mfi
  541            nop.m 999
  542            fma.s1       atan2_W3        = atan2_w, atan2_P13, atan2_P12   // W3 = P12 + w*P13
  543            nop.i 999
  544 ;;
  545 }
  546
  547 { .mfi
  548            nop.m 999
  549            fma.s1       atan2_V10       = atan2_V12, atan2_V13, atan2_V11   // V10 = V11 + V12*V13
  550            nop.i 999
  551 }
  552 { .mfi
  553            nop.m 999
  554            fma.s1       atan2_gVF       = atan2_gV, atan2_F, f0   // gVF = gV*F
  555            nop.i 999
  556 ;;
  557 }
  558
  559 { .mfi
  560            nop.m 999
  561            fma.s1       atan2_alpha_sq  = atan2_alpha, atan2_alpha, f0   // alpha_sq = alpha^2
  562            nop.i 999
  563 }
  564 { .mfi
  565            nop.m 999
  566            fma.s1       atan2_Cp        = atan2_alpha, atan2_alpha_1, f1   // Cp = 1 + alpha*alpha_1
  567            nop.i 999
  568 ;;
  569 }
  570
  571 { .mfi
  572            nop.m 999
  573            fma.s1       atan2_V9        = atan2_V12, atan2_V12, f0   // V9 = V12^2
  574            nop.i 999
  575 }
  576 { .mfi
  577            nop.m 999
  578            fma.s1       atan2_W10       = atan2_V12, atan2_P22 , atan2_W11   // W10 = W11 + V12*P22
  579            nop.i 999
  580 ;;
  581 }
  582
  583 { .mfi
  584            nop.m 999
  585            fma.s1       atan2_V6        = atan2_V12, atan2_V8 , atan2_V7   // V6 = V7 + V12*V8
  586            nop.i 999
  587 }
  588 { .mfi
  589            nop.m 999
  590            fma.s1       atan2_W6        = atan2_V12, atan2_W8 , atan2_W7   // W6 = W7 + V12*W8
  591            nop.i 999
  592 ;;
  593 }
  594
  595 { .mfi
  596            nop.m 999
  597            fma.s1       atan2_V2        = atan2_V12, atan2_V4 , atan2_V3   // V2 = V3 + V12*V4
  598            nop.i 999
  599 }
  600 { .mfi
  601            nop.m 999
  602            fma.s1       atan2_W2        = atan2_V12, atan2_W4  , atan2_W3   // W2 = W3 + V12*W4
  603            nop.i 999
  604 ;;
  605 }
  606
  607 { .mfi
  608            nop.m 999
  609            fma.s1       atan2_alpha_cub = atan2_alpha, atan2_alpha_sq, f0   // alpha_cub = alpha^3
  610            nop.i 999
  611 }
  612 { .mfi
  613            nop.m 999
  614            fma.s1       atan2_C         = atan2_gVF, atan2_Cp, f0   // C = gVF*Cp
  615            nop.i 999
  616 ;;
  617 }
  618
  619 { .mfi
  620            nop.m 999
  621            fma.s1       atan2_W12       = atan2_V9, atan2_V9, f0   // W12 = V9^2 (multiplied by V9 once more below)
  622            nop.i 999
  623 ;;
  624 }
  625
  626 { .mfi
  627            nop.m 999
  628            fma.s1       atan2_V5        = atan2_V9, atan2_V10, atan2_V6   // V5 = V6 + V9*V10
  629            nop.i 999
  630 }
  631 { .mfi
  632            nop.m 999
  633            fma.s1       atan2_W5        = atan2_V9, atan2_W10, atan2_W6   // W5 = W6 + V9*W10
  634            nop.i 999
  635 ;;
  636 }
  637
  638 { .mfi
  639            nop.m 999
  640            fclass.m p8,p0 = atan2_Y, 0x07  // Test for y=0
  641            nop.i 999
  642 }
  643 { .mfi
  644            nop.m 999
  645            fma.s1       atan2_d         = atan2_alpha_cub, atan2_C, atan2_C   // d = C + alpha_cub*C
  646            nop.i 999
  647 }
  648 ;;
  649
  650 { .mfi
  651            nop.m 999
  652            fma.s1       atan2_W12       = atan2_V9, atan2_W12, f0   // W12 = V9^3
  653            nop.i 999
  654 }
  655 ;;
  656
  657 { .mfi
  658            nop.m 999
  659            fma.s1       atan2_V1        = atan2_V9, atan2_V5, atan2_V2   // V1 = V2 + V9*V5
  660            nop.i 999
  661 }
  662 { .mfi
  663            nop.m 999
  664            fma.s1       atan2_W1        = atan2_V9, atan2_W5, atan2_W2   // W1 = W2 + V9*W5
  665            nop.i 999
  666 ;;
  667 }
  668
  669 { .mfi
  670            nop.m 999
  671 (p8)       fmerge.s     f8              = atan2_sgnY, f0  // +-0 if y=0
  672            nop.i 999
  673 }
  674 { .mfb
  675            nop.m 999
  676            fma.s1       atan2_zcub      = atan2_z, atan2_w, f0   // zcub = z*w
  677 (p8)       br.ret.spnt b0      // Exit if y=0
  678 ;;
  679 }
  680
  681 { .mfi
  682            nop.m 999
  683            fma.s1       atan2_pd        = atan2_P0, atan2_d, f0   // pd = P0*d
  684            nop.i 999
  685 }
  686 { .mfi
  687            nop.m 999
  688            fma.s1       atan2_dsq       = atan2_d, atan2_d, f0   // dsq = d^2
  689            nop.i 999
  690 ;;
  691 }
  692
  693
  694 { .mfi
  695            nop.m 999
  696            fmerge.se    atan2_near_one = f1, atan2_sig_near_one // Const ~1.0
  697            nop.i 999
  698 }
  699 { .mfi
  700            nop.m 999
  701            fma.s1       atan2_Pp        = atan2_W12, atan2_W1, atan2_V1   // Pp = V1 + W12*W1
  702            nop.i 999
  703 ;;
  704 }
  705
  706 { .mfi
  707            nop.m 999
  708            fma.s1       atan2_sgn_pi_by_2 = atan2_pi_by_2, atan2_sgnY, f0   // sgnY*pi/2, written over the loaded pi/2 (same register)
  709            nop.i 999
  710 }
  711 { .mfi
  712            nop.m 999
  713            fma.s1       atan2_A_lo      = atan2_pd, atan2_dsq, atan2_d   // A_lo = d + P0*d^3
  714            nop.i 999
  715 ;;
  716 }
  717
  718
  719 { .mfi
  720            nop.m 999
  721            fma.s1       atan2_A_hi      = atan2_zcub, atan2_Pp, atan2_z   // A_hi = z + zcub*Pp
  722            nop.i 999
  723 ;;
  724 }
  725
  726
  727 { .mfi
  728            nop.m 999
  729 (p6)       fma.s1       atan2_A         = atan2_A_hi, f1, atan2_A_lo   // swap: A = A_hi + A_lo, consumed by the final fnma
  730            nop.i 999
  731 }
  732 // For |Y| <= |X| and X > 0, result is A_hi + A_lo
  733 { .mfi
  734            nop.m 999
  735 (p7)       fma.d.s0       f8         = atan2_A_hi, f1, atan2_A_lo
  736            nop.i 999
  737 ;;
  738 }
  739
  740 // For |Y| > |X|, result is  +- pi/2 - (A_hi + A_lo)
  741 // We perturb A by multiplying by 1.0+1ulp as we produce the result
  742 // in order to get symmetrically rounded results in directed rounding modes.
  743 // If we don't do this, there are a few cases where the trailing 11 bits of
  744 // the significand of the result, before converting to double, are zero.  These
  745 // cases do not round symmetrically in round to +infinity or round to -infinity.
  746 { .mfb
  747            nop.m 999
  748 (p6)       fnma.d.s0      f8        = atan2_A, atan2_near_one, atan2_sgn_pi_by_2   // f8 = sgn_pi_by_2 - A*near_one
  749            br.ret.sptk  b0
  750 ;;
  751 }
  752
  753 GLOBAL_LIBM_END(atan)