sysdeps/ia64/fpu/e_atan2.S

   1 .file "atan2.s"
   2
   3 // Copyright (C) 2000, 2001, Intel Corporation
   4 // All rights reserved.
   5 //
   6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
   7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
   8 //
   9 // Redistribution and use in source and binary forms, with or without
  10 // modification, are permitted provided that the following conditions are
  11 // met:
  12 //
  13 // * Redistributions of source code must retain the above copyright
  14 // notice, this list of conditions and the following disclaimer.
  15 //
  16 // * Redistributions in binary form must reproduce the above copyright
  17 // notice, this list of conditions and the following disclaimer in the
  18 // documentation and/or other materials provided with the distribution.
  19 //
  20 // * The name of Intel Corporation may not be used to endorse or promote
  21 // products derived from this software without specific prior written
  22 // permission.
  23 //
  24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35 //
  36 // Intel Corporation is the author of this code, and requests that all
  37 // problem reports or change requests be submitted to it directly at
  38 // http://developer.intel.com/opensource.
  39 //
  40 // History
  41 //==============================================================
  42 // 2/02/00  Initial version
  43 // 4/04/00  Unwind support added
  44 // 8/15/00  Bundle added after call to __libm_error_support to properly
  45 //          set [the previously overwritten] GR_Parameter_RESULT.
  46 // 8/17/00  Changed predicate register macro-usage to direct predicate
  47 //          names due to an assembler bug.
  48 // 9/28/00  Updated to set invalid on SNaN inputs
  49 // 1/19/01  Fixed flags for small results
  50 //
  51 // API
  52 //==============================================================
  53 // double atan2(double Y, double X)
  54 //
  55 // Overview of operation
  56 //==============================================================
  57 //
  58 // There are two basic paths: swap true and swap false.
  59 // atan2(Y,X) ==> atan(V/U) where |U| >= |V|. If |Y| > |X|, we must swap.
  60 //
  61 // p6  swap True    |Y| > |X|
  62 // p7  swap False   |Y| <= |X|
  63 // p8  X+   (If swap=True p8=p9=0)
  64 // p9  X-
  65 //
  66 // all the other predicates p10 thru p15 are false for the main path
  67 //
  68 // Simple trigonometric identities show
  69 //   Region 1 (-45 to +45 degrees):
  70 //         X>0, |Y|<=X, V=Y, U=X     atan2(Y,X) = sgnY * (0 + atan(V/U))
  71 //
  72 //   Region 2 (-90 to -45 degrees, and +45 to +90 degrees):
  73 //         X>0, |Y|>X, V=X, U=Y      atan2(Y,X) = sgnY * (pi/2 - atan(V/U))
  74 //
  75 //   Region 3 (-135 to -90 degrees, and +90 to +135 degrees):
  76 //         X<0, |Y|>|X|, V=X, U=Y    atan2(Y,X) = sgnY * (pi/2 + atan(V/U))
  77 //
  78 //   Region 4 (-180 to -135 degrees, and +135 to +180 degrees):
  79 //         X<0, |Y|<=|X|, V=Y, U=X    atan2(Y,X) = sgnY * (pi - atan(V/U))
  80 //
  81 // So the result is always of the form atan2(Y,X) = P + sgnXY * atan(V/U)
  82 //
  83 // We compute atan(V/U) from the identity
  84 //      atan(z) + atan([(V/U)-z] / [1+(V/U)z])
  85 //      where z is a limited precision approximation (16 bits) to V/U
  86 //
  87 // z is calculated with the assistance of the frcpa instruction.
  88 //
  89 // atan(z) is calculated by a polynomial z + z^3 * p(w),  w=z^2
  90 // where p(w) = P0+P1*w+...+P22*w^22
  91 //
  92 // Let d = [(V/U)-z] / [1+(V/U)z] = (V-U*z)/(U+V*z)
  93 //
  94 // Approximate atan(d) by d + P0*d^3
  95 // Let F = 1/(U+V*z) * (1-a), where |a|< 2^-8.8.
  96 // Compute q(a) = 1 + a + ... + a^5.
  97 // Then F*q(a) approximates the reciprocal to more than 50 bits.
  98
  99 // Special values
 100 //==============================================================
 101 //              Y                 x          Result
 102 //             +number           +inf        +0
 103 //             -number           +inf        -0
 104 //             +number           -inf        +pi
 105 //             -number           -inf        -pi
 106 //
 107 //             +inf              +number     +pi/2
 108 //             -inf              +number     -pi/2
 109 //             +inf              -number     +pi/2
 110 //             -inf              -number     -pi/2
 111 //
 112 //             +inf              +inf        +pi/4
 113 //             -inf              +inf        -pi/4
 114 //             +inf              -inf        +3pi/4
 115 //             -inf              -inf        -3pi/4
 116 //
 117 //             +1                +1          +pi/4
 118 //             -1                +1          -pi/4
 119 //             +1                -1          +3pi/4
 120 //             -1                -1          -3pi/4
 121 //
 122 //             +number           +0          +pi/2
 123 //             -number           +0          -pi/2
 124 //             +number           -0          +pi/2
 125 //             -number           -0          -pi/2
 126 //
 127 //             +0                +number     +0
 128 //             -0                +number     -0
 129 //             +0                -number     +pi
 130 //             -0                -number     -pi
 131 //
 132 //             +0                +0          +0
 133 //             -0                +0          -0
 134 //             +0                -0          +pi
 135 //             -0                -0          -pi
 136 //
 137 //            Nan             anything      quiet Y
 138 //            anything        NaN           quiet X
 139
 140 // atan2(+-0/+-0) sets double error tag to 37
 141 // atan2(+-0/+-0) sets single error tag to 38
 142
 143 #include "libm_support.h"
 144
 145 // Assembly macros
 146 //==============================================================
 147
 148 EXP_AD_P1                    = r33   // GOT slot, then running pointer into atan2_tb1
 149 EXP_AD_P2                    = r34   // GOT slot, then running pointer into atan2_tb2
 150 atan2_GR_sml_exp             = r35   // exponent for the tiny normal; shares r35 with GR_SAVE_B0
 151 // The GR_SAVE_* registers are only written and read inside __libm_error_region.
 152
 153 GR_SAVE_B0                   = r35   // same register as atan2_GR_sml_exp (used at different times)
 154 GR_SAVE_GP                   = r36
 155 GR_SAVE_PFS                  = r37
 156 // r38-r41 are the four output registers of the alloc (1 in, 5 local, 4 out) at atan2 entry.
 157 GR_Parameter_X               = r38
 158 GR_Parameter_Y               = r39
 159 GR_Parameter_RESULT          = r40
 160 atan2_GR_tag                 = r41   // error tag (set to 37 on the 0/0 path)
 161
 162 // Per the API comment the call is atan2(Y, X); the result is written to f8.
 163 atan2_X                      = f9
 164 atan2_Y                      = f8
 165
 166 atan2_u1_X                   = f32
 167 atan2_u1_Y                   = f33
 168 atan2_Umax                   = f34
 169 atan2_Vmin                   = f35
 170 atan2_two                    = f36
 171 atan2_absX                   = f37   // written once in atan2, never read in this file
 172 atan2_z1_X                   = f38
 173 atan2_z1_Y                   = f39
 174 atan2_B1X                    = f40
 175 atan2_B1Y                    = f41
 176 atan2_wp                     = f42
 177 atan2_B1sq                   = f43
 178 atan2_z                      = f44
 179 atan2_w                      = f45
 180 // Polynomial coefficients and constants loaded from atan2_tb1 / atan2_tb2.
 181 atan2_P0                     = f46
 182 atan2_P1                     = f47
 183 atan2_P2                     = f48
 184 atan2_P3                     = f49
 185 atan2_P4                     = f50
 186 atan2_P5                     = f51
 187 atan2_P6                     = f52
 188 atan2_P7                     = f53
 189 atan2_P8                     = f54
 190 atan2_P9                     = f55
 191 atan2_P10                    = f56
 192 atan2_P11                    = f57
 193 atan2_P12                    = f58
 194 atan2_P13                    = f59
 195 atan2_P14                    = f60
 196 atan2_P15                    = f61
 197 atan2_P16                    = f62
 198 atan2_P17                    = f63
 199 atan2_P18                    = f64
 200 atan2_P19                    = f65
 201 atan2_P20                    = f66
 202 atan2_P21                    = f67
 203 atan2_P22                    = f68
 204 atan2_Pi_by_2                = f69
 205 // Intermediate values of the polynomial evaluation and of the reciprocal refinement.
 206 atan2_V13                    = f70
 207 atan2_W11                    = f71
 208 atan2_E                      = f72
 209 atan2_gamma                  = f73   // written once in atan2, never read in this file
 210 atan2_V11                    = f74
 211 atan2_V12                    = f75
 212 atan2_V7                     = f76
 213 atan2_V8                     = f77
 214 atan2_W7                     = f78
 215 atan2_W8                     = f79
 216 atan2_W3                     = f80
 217 atan2_W4                     = f81
 218 atan2_V3                     = f82
 219 atan2_V4                     = f83
 220 atan2_F                      = f84
 221 atan2_gV                     = f85
 222 atan2_V10                    = f86
 223 atan2_zcub                   = f87
 224 atan2_V6                     = f88
 225 atan2_V9                     = f89
 226 atan2_W10                    = f90
 227 atan2_W6                     = f91
 228 atan2_W2                     = f92
 229 atan2_V2                     = f93
 230
 231 atan2_alpha                  = f94
 232 atan2_alpha_1                = f95
 233 atan2_gVF                    = f96
 234 atan2_V5                     = f97
 235 atan2_W12                    = f98
 236 atan2_W5                     = f99
 237 atan2_alpha_sq               = f100
 238 atan2_Cp                     = f101
 239 atan2_V1                     = f102
 240 // The next two names alias the same register, f103.
 241 atan2_sml_norm               = f103
 242 atan2_FR_tmp                 = f103
 243
 244 atan2_W1                     = f104
 245 atan2_alpha_cub              = f105
 246 atan2_C                      = f106
 247 atan2_P                      = f107
 248 atan2_d                      = f108
 249 atan2_A_hi                   = f109
 250 atan2_dsq                    = f110
 251 atan2_pd                     = f111
 252 atan2_A_lo                   = f112
 253 atan2_A                      = f113
 254
 255 atan2_Pp                     = f114
 256
 257 atan2_sgnY                   = f116
 258 atan2_pi                     = f117
 259 atan2_sgnX                   = f118
 260 atan2_sgnXY                  = f119
 261
 262 atan2_3pi_by_4               = f120
 263 atan2_pi_by_4                = f121
 264 // Predicate aliases left disabled; the code names p7/p6 directly (see the 8/17/00 History note).
 265 //atan2_sF                     = p7
 266 //atan2_sT                     = p6
 267
 268 // These coefficients are for atan2.
 269 // You can also use this set to substitute those used in the |X| <= 1 case for atan;
 270 // BUT NOT vice versa.
 271
 272 /////////////////////////////////////////////////////////////
 273
 274
 275 #ifdef _LIBC
 276 .rodata
 277 #else
 278 .data
 279 #endif
 280 // Constant tables read by atan2 with ldfe and a 16-byte post-increment.
 281 .align 16
 282 // Each entry is 16 bytes: a 64-bit significand, then a word holding sign and exponent.
 283 atan2_tb1:
 284 ASM_TYPE_DIRECTIVE(atan2_tb1,@object)
 285 data8 0xB199DD6D2675C40F ,  0x0000BFFA // P10
 286 data8 0xA21922DC45605EA1 ,  0x00003FFA // P11
 287 data8 0xD78F28FC2A592781 ,  0x0000BFFA // P8
 288 data8 0xC2F01E5DDD100DBE ,  0x00003FFA // P9
 289 data8 0x9D89D7D55C3287A5 ,  0x00003FFB // P5
 290 data8 0xF0F03ADB3FC930D3 ,  0x00003FFA // P7
 291 data8 0xF396268151CFB11C ,  0x00003FF7 // P17
 292 data8 0x9D3436AABE218776 ,  0x00003FF5 // P19
 293 data8 0x80D601879218B53A ,  0x00003FFA // P13
 294 data8 0xA2270D30A90AA220 ,  0x00003FF9 // P15
 295 data8 0xCCCCCCCCCCC906CD ,  0x00003FFC // P1
 296 data8 0xE38E38E320A8A098 ,  0x00003FFB // P3
 297 data8 0xFE7E52D2A89995B3 ,  0x0000BFEC // P22
 298 data8 0xC90FDAA22168C235 ,  0x00003FFE // pi/4
 299 ASM_SIZE_DIRECTIVE(atan2_tb1)
 300 // Entry order in both tables matches the order of the ldfe loads in atan2; do not reorder.
 301 atan2_tb2:
 302 ASM_TYPE_DIRECTIVE(atan2_tb2,@object)
 303 data8 0x9F90FB984D8E39D0 ,  0x0000BFF3 // P20
 304 data8 0xCE585A259BD8374C ,  0x00003FF0 // P21
 305 data8 0xBA2E8B9793955C77 ,  0x0000BFFB // P4
 306 data8 0x88887EBB209E3543 ,  0x0000BFFB // P6
 307 data8 0xD818B4BB43D84BF2 ,  0x0000BFF8 // P16
 308 data8 0xDEC343E068A6D2A8 ,  0x0000BFF6 // P18
 309 data8 0x9297B23CCFFB291F ,  0x0000BFFA // P12
 310 data8 0xD5F4F2182E7A8725 ,  0x0000BFF9 // P14
 311 data8 0xAAAAAAAAAAAAA8A9 ,  0x0000BFFD // P0
 312 data8 0x9249249247E37913 ,  0x0000BFFC // P2
 313 data8 0xC90FDAA22168C235 ,  0x00003FFF // pi/2
 314 data8 0xC90FDAA22168C235 ,  0x00004000 // pi
 315 data8 0x96cbe3f9990e91a8 ,  0x00004000 // 3pi/4
 316 ASM_SIZE_DIRECTIVE(atan2_tb2)
 317
 318
 319
 320
 321 .align 32
 322 .global atan2#
 323 #ifdef _LIBC
 324 .global __atan2#
 325 .global __ieee754_atan2#
 326 #endif
 327
 328 ////////////////////////////////////////////////////////
 329 // Entry of double atan2(double Y, double X): Y is in f8, X is in f9, result goes to f8.
 330 .section .text
 331 .align 32
 332
 333 .proc  atan2#
 334 atan2:
 335 #ifdef _LIBC
 336 .proc  __atan2#
 337 __atan2:
 338 .proc  __ieee754_atan2#
 339 __ieee754_atan2:
 340 #endif
 341 // fclass mask bit legend:  qnan snan inf norm     unorm 0 -+
 342 // e.g. 0x23 (+-inf) is:    0    0    1   0        0     0 11
 343 // 0xc3, used below, sets the qnan and snan bits with both signs, i.e. any NaN.
 344
 345 // p10 ==> Y is NaN: quiet it and return
 346 // p11 ==> Y is not NaN
 347 // Later, under p11, X is tested for NaN into p12,p13;
 348 // p12 ==> X is NaN: quiet it and return
 349
 350 { .mfi
 351            alloc        r32           = ar.pfs,1,5,4,0
 352            frcpa.s1     atan2_u1_X,p6 = f1,atan2_X    // u1_X ~ 1/X
 353            addl         EXP_AD_P2   = @ltoff(atan2_tb2), gp   // address of the GOT slot for atan2_tb2
 354 }
 355 { .mfi
 356            addl         EXP_AD_P1   = @ltoff(atan2_tb1), gp   // address of the GOT slot for atan2_tb1
 357            fclass.m.unc p10,p11 = f8, 0xc3    // p10 = Y is NaN, p11 = Y is not NaN
 358            nop.i 999
 359 ;;
 360 }
 361
 362 { .mfi
 363            ld8  EXP_AD_P1 = [EXP_AD_P1]    // EXP_AD_P1 = &atan2_tb1
 364            frcpa.s1     atan2_u1_Y,p7 = f1,atan2_Y    // u1_Y ~ 1/Y
 365            nop.i 999
 366 }
 367 { .mfi
 368            nop.m 999
 369            fma.s1       atan2_two  = f1,f1,f1    // two = 1*1 + 1
 370            nop.i 999
 371 ;;
 372 }
 373
 374
 375 { .mfi
 376            ld8 EXP_AD_P2 = [ EXP_AD_P2]    // EXP_AD_P2 = &atan2_tb2
 377            famax.s1     atan2_Umax =  f8,f9    // operand of larger magnitude (sign still attached)
 378            nop.i 999
 379 }
 380 ;;
 381
 382 { .mfi
 383            nop.m 999
 384            fmerge.s     atan2_absX = f0,atan2_X    // |X|; not read again in this file
 385            nop.i 999
 386 }
 387 ;;
 388
 389 // p10 Y NAN, quiet and return
 390 { .mfi
 391            ldfe         atan2_P10  = [EXP_AD_P1],16
 392            fmerge.s     atan2_sgnY = atan2_Y,f1    // +-1.0 carrying the sign of Y
 393            nop.i 999
 394 }
 395 { .mfb
 396            nop.m 999
 397 (p10)      fma.d f8 = f8,f9,f0    // Y*X + 0: arithmetic on the NaN yields the quiet result
 398 (p10)      br.ret.spnt b0
 399 ;;
 400 }
 401 // Load coefficients from both tables while forming z ~ V/U, choosing swap/no-swap, and testing X for NaN.
 402
 403 { .mmf
 404            ldfe         atan2_P11  = [EXP_AD_P1],16
 405            ldfe         atan2_P20  = [EXP_AD_P2],16
 406            fmerge.s     atan2_sgnX = atan2_X,f1    // +-1.0 carrying the sign of X
 407 ;;
 408 }
 409
 410
 411 { .mfi
 412            ldfe         atan2_P8   = [EXP_AD_P1],16
 413            fma.s1       atan2_z1_X = atan2_u1_X, atan2_Y, f0    // Y * (1/X approx)
 414            nop.i 999
 415 }
 416 { .mfi
 417
 418            ldfe         atan2_P21  = [EXP_AD_P2],16
 419            fma.s1       atan2_z1_Y = atan2_u1_Y, atan2_X, f0    // X * (1/Y approx)
 420            nop.i 999
 421 ;;
 422 }
 423
 424 { .mfi
 425            ldfe         atan2_P9   = [EXP_AD_P1],16
 426            fnma.s1      atan2_B1X  = atan2_u1_X, atan2_X, atan2_two    // 2 - u1_X*X
 427            nop.i 999
 428 }
 429 { .mfi
 430
 431            ldfe         atan2_P4   = [EXP_AD_P2],16
 432            fnma.s1      atan2_B1Y  = atan2_u1_Y, atan2_Y, atan2_two    // 2 - u1_Y*Y
 433            nop.i 999
 434 ;;
 435 }
 436
 437 // p6 (atan2_sT) true if swap
 438 // p7 (atan2_sF) true if no swap
 439 // p11 ==> Y !NAN;  X NAN?
 440
 441 { .mfi
 442            ldfe         atan2_P5   = [EXP_AD_P1],16
 443 //           fcmp.eq.unc.s1 atan2_sF,atan2_sT    = atan2_Umax, atan2_X
 444            fcmp.eq.unc.s1 p7,p6    = atan2_Umax, atan2_X    // p7 when the larger-magnitude operand equals X
 445            nop.i 999
 446 }
 447 { .mfi
 448            ldfe         atan2_P6   = [EXP_AD_P2],16
 449 (p11)      fclass.m.unc p12,p13    = f9, 0xc3    // p12 = X is NaN (only evaluated when Y is not NaN)
 450            nop.i 999
 451 ;;
 452 }
 453
 454 { .mmf
 455            ldfe         atan2_P7   = [EXP_AD_P1],16
 456            ldfe         atan2_P16  = [EXP_AD_P2],16
 457            famin.s1     atan2_Vmin =  f8,f9    // operand of smaller magnitude (sign still attached)
 458 ;;
 459 }
 460
 461 // p8 true if X positive
 462 // p9 true if X negative
 463 // both are false if swap is true
 464 { .mfi
 465            ldfe         atan2_P17  = [EXP_AD_P1],16
 466 //(atan2_sF) fcmp.eq.unc.s1 p8,p9    = atan2_sgnX,f1
 467 (p7) fcmp.eq.unc.s1 p8,p9    = atan2_sgnX,f1
 468            nop.i 999
 469 }
 470 { .mfi
 471            ldfe         atan2_P18  = [EXP_AD_P2],16
 472            fma.s1       atan2_sgnXY     = atan2_sgnX, atan2_sgnY, f0    // product of the two signs, +-1.0
 473            nop.i 999
 474 ;;
 475 }
 476
 477 // From here each p7/p6 pair writes the same destination: p7 takes the Y/X form, p6 the X/Y form.
 478 { .mfi
 479            ldfe         atan2_P19  = [EXP_AD_P1],16
 480 //(atan2_sF) fma.s1       atan2_wp   = atan2_z1_X, atan2_z1_X, f0
 481 (p7) fma.s1       atan2_wp   = atan2_z1_X, atan2_z1_X, f0
 482            nop.i 999
 483 }
 484 { .mfi
 485            ldfe         atan2_P12  = [EXP_AD_P2],16
 486 //(atan2_sT) fma.s1       atan2_wp   = atan2_z1_Y, atan2_z1_Y, f0
 487 (p6) fma.s1       atan2_wp   = atan2_z1_Y, atan2_z1_Y, f0
 488            nop.i 999
 489 ;;
 490 }
 491
 492 // z = z1 * B1: the first quotient estimate times its correction factor (2 - u1*denominator).
 493 { .mfi
 494            ldfe         atan2_P13  = [EXP_AD_P1],16
 495 //(atan2_sF) fma.s1       atan2_z         = atan2_z1_X, atan2_B1X, f0
 496 (p7) fma.s1       atan2_z         = atan2_z1_X, atan2_B1X, f0
 497            nop.i 999
 498 }
 499 { .mfi
 500            ldfe         atan2_P14  = [EXP_AD_P2],16
 501 //(atan2_sT) fma.s1       atan2_z         = atan2_z1_Y, atan2_B1Y, f0
 502 (p6) fma.s1       atan2_z         = atan2_z1_Y, atan2_B1Y, f0
 503            nop.i 999
 504 ;;
 505 }
 506
 507 // B1sq = B1^2, multiplied later by wp = z1^2 to give w = z^2.
 508 { .mfi
 509            ldfe         atan2_P15       = [EXP_AD_P1],16
 510 //(atan2_sF) fma.s1       atan2_B1sq = atan2_B1X, atan2_B1X, f0
 511 (p7) fma.s1       atan2_B1sq = atan2_B1X, atan2_B1X, f0
 512            nop.i 999
 513 }
 514 { .mfi
 515            ldfe         atan2_P0        = [EXP_AD_P2],16
 516 //(atan2_sT) fma.s1       atan2_B1sq = atan2_B1Y, atan2_B1Y, f0
 517 (p6) fma.s1       atan2_B1sq = atan2_B1Y, atan2_B1Y, f0
 518            nop.i 999
 519 ;;
 520 }
 521
 522
 523 // p12 ==> X NAN, quiet and return
 524 { .mfi
 525            ldfe         atan2_P1        = [EXP_AD_P1],16
 526            fmerge.s     atan2_Umax      = f0,atan2_Umax    // Umax = |Umax|
 527            nop.i 999
 528 }
 529 { .mfb
 530            ldfe         atan2_P2        = [EXP_AD_P2],16
 531 (p12)      fma.d        f8 = f9,f8,f0    // X*Y + 0: arithmetic on the NaN yields the quiet result
 532 (p12)      br.ret.spnt b0
 533 ;;
 534 }
 535 // Finish the table loads, classify infinities, and return +-pi/2 when Y is infinite and X is not.
 536
 537 // p10 ==> x  inf     y ?
 538 // p11 ==> x !inf     y ?
 539 { .mfi
 540            ldfe         atan2_P3        = [EXP_AD_P1],16
 541            fmerge.s     atan2_Vmin      = f0,atan2_Vmin    // Vmin = |Vmin|
 542            nop.i 999
 543 }
 544 { .mfi
 545            ldfe         atan2_Pi_by_2   = [EXP_AD_P2],16
 546            fclass.m.unc p10,p11 = f9, 0x23    // 0x23 = infinity of either sign
 547            nop.i 999
 548 ;;
 549 }
 550
 551
 552 { .mmf
 553            ldfe         atan2_P22       = [EXP_AD_P1],16
 554            ldfe         atan2_pi        = [EXP_AD_P2],16
 555            nop.f 999
 556 ;;
 557 }
 558
 559 { .mfi
 560            nop.m 999
 561            fcmp.eq.s0  p12,p13=f9,f8   // Dummy to catch denormal and invalid; p12,p13 are rewritten below before any use
 562            nop.i 999
 563 ;;
 564 }
 565
 566
 567 { .mfi
 568            ldfe         atan2_pi_by_4       = [EXP_AD_P1],16
 569 //(atan2_sT) fmerge.ns    atan2_sgnXY     = atan2_sgnXY, atan2_sgnXY
 570 (p6) fmerge.ns    atan2_sgnXY     = atan2_sgnXY, atan2_sgnXY
 571            nop.i 999
 572 }
 573 { .mfi
 574            ldfe         atan2_3pi_by_4       = [EXP_AD_P2],16
 575            fma.s1       atan2_w         = atan2_wp, atan2_B1sq,f0    // w = z1^2 * B1^2 = z^2
 576            nop.i 999
 577 ;;
 578 }
 579 // In the swap case (p6) the fmerge.ns above flips the sign of sgnXY.
 580 // p12 ==> x  inf     y inf
 581 // p13 ==> x  inf     y !inf
 582 { .mfi
 583            nop.m 999
 584            fmerge.s     atan2_z         = f0, atan2_z    // z = |z|
 585            nop.i 999
 586 ;;
 587 }
 588
 589 { .mfi
 590            nop.m 99
 591 (p10)      fclass.m.unc p12,p13 = f8, 0x23    // .unc: both cleared when p10 (X inf) is false
 592            nop.i 999
 593 }
 594 { .mfi
 595            nop.m 99
 596 (p11)      fclass.m.unc p14,p15 = f8, 0x23    // p14 = Y inf while X is not inf
 597            nop.i 999
 598 ;;
 599 }
 600
 601 { .mfi
 602            nop.m 999
 603 (p12)      fcmp.eq.unc.s1 p10,p11       = atan2_sgnX,f1    // both inf: p10 = X positive, p11 = X negative; else both cleared
 604            nop.i 99
 605 ;;
 606 }
 607
 608
 609 { .mfb
 610            mov atan2_GR_sml_exp = 0x1  // Small exponent for making small norm
 611 (p14)      fma.d       f8 = atan2_sgnY, atan2_Pi_by_2, f0    // Y inf, X not inf: result is sgnY * pi/2
 612 (p14)      br.ret.spnt b0
 613 ;;
 614 }
 615 // First level of the p(w) evaluation (pairs of coefficients), plus E = U + V*z, gV = V - U*z and F ~ 1/E.
 616 // Make a very small normal in case need to force inexact and underflow
 617 { .mfi
 618            setf.exp atan2_sml_norm = atan2_GR_sml_exp
 619            fma.s1       atan2_V13       = atan2_w, atan2_P11, atan2_P10    // P10 + P11*w
 620            nop.i 999
 621 }
 622 { .mfi
 623            nop.m 999
 624            fma.s1       atan2_W11       = atan2_w, atan2_P21, atan2_P20    // P20 + P21*w
 625            nop.i 999
 626 ;;
 627 }
 628
 629
 630 { .mfi
 631            nop.m 999
 632            fma.s1       atan2_E         = atan2_Vmin, atan2_z, atan2_Umax    // E = Umax + Vmin*z
 633            nop.i 999
 634 }
 635 { .mfi
 636            nop.m 999
 637            fnma.s1      atan2_gamma     = atan2_Umax, atan2_z, f1    // 1 - Umax*z; not read again in this file
 638            nop.i 999
 639 ;;
 640 }
 641
 642 { .mfi
 643            nop.m 999
 644            fma.s1       atan2_V11       = atan2_w, atan2_P9, atan2_P8    // P8 + P9*w
 645            nop.i 999
 646 }
 647 { .mfi
 648            nop.m 999
 649            fma.s1       atan2_V12       = atan2_w, atan2_w, f0    // w^2
 650            nop.i 999
 651 ;;
 652 }
 653
 654 { .mfi
 655            nop.m 999
 656            fma.s1       atan2_V7        = atan2_w, atan2_P5 , atan2_P4    // P4 + P5*w
 657            nop.i 999
 658 }
 659 { .mfi
 660            nop.m 999
 661            fma.s1       atan2_V8        = atan2_w, atan2_P7 , atan2_P6    // P6 + P7*w
 662            nop.i 999
 663 ;;
 664 }
 665
 666 { .mfi
 667            nop.m 999
 668            fma.s1       atan2_W7        = atan2_w, atan2_P17, atan2_P16    // P16 + P17*w
 669            nop.i 999
 670 }
 671 { .mfi
 672            nop.m 999
 673            fma.s1       atan2_W8        = atan2_w, atan2_P19, atan2_P18    // P18 + P19*w
 674            nop.i 999
 675 ;;
 676 }
 677
 678 { .mfi
 679            nop.m 999
 680            fma.s1       atan2_W3        = atan2_w, atan2_P13, atan2_P12    // P12 + P13*w
 681            nop.i 999
 682 }
 683 { .mfi
 684            nop.m 999
 685            fma.s1       atan2_W4        = atan2_w, atan2_P15, atan2_P14    // P14 + P15*w
 686            nop.i 999
 687 ;;
 688 }
 689
 690 { .mfi
 691            nop.m 999
 692            fma.s1       atan2_V3        = atan2_w, atan2_P1 , atan2_P0    // P0 + P1*w
 693            nop.i 999
 694 }
 695 { .mfi
 696            nop.m 999
 697            fma.s1       atan2_V4        = atan2_w, atan2_P3 , atan2_P2    // P2 + P3*w
 698            nop.i 999
 699 ;;
 700 }
 701
 702 { .mfi
 703            nop.m 999
 704            fma.s1       atan2_zcub      = atan2_z, atan2_w, f0    // z^3
 705            nop.i 999
 706 }
 707 { .mfi
 708            nop.m 999
 709            fnma.s1       atan2_gV        = atan2_Umax, atan2_z, atan2_Vmin    // gV = Vmin - Umax*z
 710            nop.i 999
 711 ;;
 712 }
 713
 714 { .mfi
 715            nop.m 999
 716            frcpa.s1     atan2_F,p15     = f1, atan2_E    // F ~ 1/E
 717            nop.i 999
 718 }
 719 { .mfi
 720            nop.m 999
 721            fma.s1       atan2_V10       = atan2_V12, atan2_V13, atan2_V11    // P8..P11 terms
 722            nop.i 999
 723 ;;
 724 }
 725
 726 { .mfi
 727            nop.m 999
 728            fma.s1       atan2_V6        = atan2_V12, atan2_V8 , atan2_V7    // P4..P7 terms
 729            nop.i 999
 730 }
 731 { .mfi
 732            nop.m 999
 733            fma.s1       atan2_V9        = atan2_V12, atan2_V12, f0    // w^4
 734            nop.i 999
 735 ;;
 736 }
 737
 738 { .mfi
 739            nop.m 999
 740            fma.s1       atan2_W10       = atan2_V12, atan2_P22 , atan2_W11    // P20 + P21*w + P22*w^2
 741            nop.i 999
 742 }
 743 { .mfi
 744            nop.m 999
 745            fma.s1       atan2_W6        = atan2_V12, atan2_W8 , atan2_W7    // P16..P19 terms
 746            nop.i 999
 747 ;;
 748 }
 749
 750 { .mfi
 751            nop.m 999
 752            fma.s1       atan2_W2        = atan2_V12, atan2_W4  , atan2_W3    // P12..P15 terms
 753            nop.i 999
 754 }
 755 { .mfi
 756            nop.m 999
 757            fma.s1       atan2_V2        = atan2_V12, atan2_V4 , atan2_V3    // P0..P3 terms
 758            nop.i 999
 759 ;;
 760 }
 761 // Returns for infinite X, refinement of F ~ 1/E, choice of the additive constant P, and the next polynomial level.
 762 // p10/p11 tested first below were set by the sign-of-X compare that runs only under p12 (X and Y both inf).
 763 // Both X and Y are INF
 764 // p10 ==> X +
 765 // p11 ==> X -
 766 .pred.rel "mutex",p10,p11
 767 { .mfb
 768            nop.m 999
 769 (p10)      fma.d       f8              = atan2_sgnY, atan2_pi_by_4, f0    // sgnY * pi/4
 770 (p10)      br.ret.spnt b0
 771 }
 772 { .mfb
 773            nop.m 999
 774 (p11)      fma.d       f8              = atan2_sgnY, atan2_3pi_by_4, f0    // sgnY * 3pi/4
 775 (p11)      br.ret.spnt b0
 776 ;;
 777 }
 778
 779
 780 .pred.rel "mutex",p8,p9,p6
 781 { .mfi
 782            nop.m 999
 783            fnma.s1      atan2_alpha     = atan2_E, atan2_F, f1    // alpha = 1 - E*F
 784            nop.i 999
 785 }
 786 { .mfi
 787            nop.m 999
 788            fnma.s1      atan2_alpha_1   = atan2_E, atan2_F, atan2_two    // 2 - E*F = 1 + alpha
 789            nop.i 999
 790 ;;
 791 }
 792
 793 // P is written by exactly one of p6, p8, p9 (declared mutually exclusive above).
 794 { .mfi
 795            nop.m 999
 796 //(atan2_sT) fmerge.s     atan2_P         = atan2_Y, atan2_Pi_by_2
 797 (p6) fmerge.s     atan2_P         = atan2_Y, atan2_Pi_by_2
 798            nop.i 999
 799 }
 800 { .mfi
 801            nop.m 999
 802            fma.s1       atan2_gVF       = atan2_gV, atan2_F, f0    // gV * F
 803            nop.i 999
 804 ;;
 805 }
 806
 807 // Swap case above: P = pi/2 with the sign of Y.
 808 { .mfi
 809            nop.m 999
 810            fma.s1       atan2_V5        = atan2_V9, atan2_V10, atan2_V6    // P4..P11 terms
 811            nop.i 999
 812 }
 813 { .mfi
 814            nop.m 999
 815            fma.s1       atan2_W12       = atan2_V9, atan2_V9, f0    // w^8
 816            nop.i 999
 817 ;;
 818 }
 819
 820
 821
 822 { .mfi
 823            nop.m 999
 824 (p8)       fmerge.s     atan2_P         = atan2_sgnY, f0    // no swap, X positive: P = 0 with the sign of Y
 825            nop.i 999
 826 }
 827 { .mfi
 828            nop.m 999
 829            fma.s1       atan2_W5        = atan2_V9, atan2_W10, atan2_W6    // P16..P22 terms
 830            nop.i 999
 831 ;;
 832 }
 833
 834
 835
 836
 837 { .mfi
 838            nop.m 999
 839 (p9)       fmerge.s     atan2_P         = atan2_sgnY, atan2_pi    // no swap, X negative: P = pi with the sign of Y
 840            nop.i 999
 841 ;;
 842 }
 843
 844
 845 { .mfi
 846            nop.m 999
 847            fma.s1       atan2_alpha_sq  = atan2_alpha, atan2_alpha, f0    // alpha^2
 848            nop.i 999
 849 }
 850 { .mfi
 851            nop.m 999
 852            fma.s1       atan2_Cp        = atan2_alpha, atan2_alpha_1, f1    // 1 + alpha + alpha^2
 853            nop.i 999
 854 ;;
 855 }
 856
 857
 858 { .mfi
 859            nop.m 999
 860            fma.s1       atan2_V1        = atan2_V9, atan2_V5, atan2_V2    // P0..P11 terms
 861            nop.i 999
 862 }
 863 { .mfi
 864            nop.m 999
 865            fma.s1       atan2_W12       = atan2_V9, atan2_W12, f0    // w^12
 866            nop.i 999
 867 ;;
 868 }
 869
 870
 871 // p13 ==> x  inf     y !inf
 872 { .mfi
 873            nop.m 999
 874            fma.s1       atan2_W1        = atan2_V9, atan2_W5, atan2_W2    // P12..P22 terms
 875            nop.i 999
 876 }
 877 { .mfi
 878            nop.m 999
 879 (p13)      fcmp.eq.unc.s1 p10,p11       = atan2_sgnX,f1    // X inf, Y not inf: p10 = X positive, p11 = X negative
 880            nop.i 999
 881 ;;
 882 }
 883
 884
 885 { .mfi
 886            nop.m 999
 887            fma.s1       atan2_alpha_cub = atan2_alpha, atan2_alpha_sq, f0    // alpha^3
 888            nop.i 999
 889 }
 890 { .mfi
 891            nop.m 999
 892            fma.s1       atan2_C         = atan2_gVF, atan2_Cp, f0    // gV*F*(1 + alpha + alpha^2)
 893            nop.i 999
 894 ;;
 895 }
 896
 897 .pred.rel "mutex",p10,p11
 898 // x inf y !inf
 899 { .mfb
 900            nop.m 999
 901 (p10)      fmerge.s     f8              = atan2_sgnY, f0    // X = +inf: zero with the sign of Y
 902 (p10)      br.ret.spnt b0
 903 }
 904 { .mfb
 905            nop.m 999
 906 (p11)      fma.d        f8              = atan2_sgnY, atan2_pi, f0    // X = -inf: sgnY * pi
 907 (p11)      br.ret.spnt b0
 908 ;;
 909 }
 910 // Zero-argument handling, then the final combination f8 = P + sgnXY * (atan(z) + atan(d)) and return.
 911
 912
 913 // p10 ==> y   0     x?
 914 // p11 ==> y  !0     x?
 915 { .mfi
 916            nop.m 999
 917            fclass.m.unc p10,p11 = f8, 0x07    // 0x07 = zero of either sign
 918            nop.i 999
 919 ;;
 920 }
 921
 922 { .mfi
 923            nop.m 999
 924 (p8)       fmerge.s     atan2_sml_norm  = atan2_sgnY, atan2_sml_norm    // give the tiny normal the sign of Y
 925            nop.i 999
 926 ;;
 927 }
 928
 929 { .mfi
 930            nop.m 999
 931            fma.s1       atan2_Pp        = atan2_W12, atan2_W1, atan2_V1    // full p(w): low terms + w^12 * high terms
 932            nop.i 999
 933 }
 934 { .mfi
 935            nop.m 999
 936            fma.s1       atan2_d         = atan2_alpha_cub, atan2_C, atan2_C    // d = C * (1 + alpha^3)
 937            nop.i 999
 938 ;;
 939 }
 940
 941 // p12 ==>  y0     x0
 942 // p13 ==>  y0     x!0
 943 // p14 ==>  y!0    x0
 944 // p15 ==>  y!0    x!0
 945 { .mfi
 946            nop.m 999
 947 (p10)      fclass.m.unc p12,p13 = f9, 0x07
 948            nop.i 999
 949 }
 950 { .mfi
 951            nop.m 999
 952 (p11)      fclass.m.unc p14,p15 = f9, 0x07
 953            nop.i 999
 954 ;;
 955 }
 956
 957
 958
 959
 960 { .mfb
 961            nop.m 999
 962 (p13)      fcmp.eq.unc.s1 p10,p11       = atan2_sgnX,f1    // Y zero, X nonzero: p10 = X positive, p11 = X negative; else both cleared
 963 (p12)      br.spnt ATAN2_ERROR    // both arguments are zero
 964 ;;
 965 }
 966
 967
 968
 969 { .mfi
 970            nop.m 999
 971            fma.s1       atan2_pd        = atan2_P0, atan2_d, f0    // P0 * d
 972            nop.i 999
 973 }
 974 { .mfi
 975            nop.m 999
 976            fma.s1       atan2_dsq       = atan2_d, atan2_d, f0    // d^2
 977            nop.i 999
 978 ;;
 979 }
 980
 981 { .mfi
 982            nop.m 999
 983            fma.s1       atan2_A_hi      = atan2_zcub, atan2_Pp, atan2_z    // z + z^3 * p(w)
 984            nop.i 999
 985 }
 986 { .mfb
 987            nop.m 999
 988 (p14)      fma.d       f8 = atan2_sgnY, atan2_Pi_by_2, f0    // Y nonzero, X zero: sgnY * pi/2
 989 (p14)      br.ret.spnt b0
 990 ;;
 991 }
 992
 993
 994
 995 { .mfb
 996            nop.m 999
 997 (p10)      fmerge.s     f8              = atan2_sgnY, f0    // Y zero, X positive: zero with the sign of Y
 998 (p10)      br.ret.spnt b0
 999 }
1000 { .mfb
1001            nop.m 999
1002 (p11)      fma.d        f8              = atan2_sgnY, atan2_pi, f0    // Y zero, X negative: sgnY * pi
1003 (p11)      br.ret.spnt b0
1004 ;;
1005 }
1006
1007
1008
1009 { .mfi
1010            nop.m 999
1011            fma.s1       atan2_A_lo      = atan2_pd, atan2_dsq, atan2_d    // d + P0 * d^3
1012            nop.i 999
1013 ;;
1014 }
1015
1016
1017 { .mfi
1018            nop.m 999
1019            fma.s1       atan2_A         = atan2_A_hi, f1, atan2_A_lo    // A = A_hi + A_lo
1020            nop.i 999
1021 ;;
1022 }
1023
1024 // Force inexact and possibly underflow if very small results
1025 { .mfi
1026            nop.m 999
1027 (p8)       fma.d        atan2_FR_tmp    = atan2_sgnXY, atan2_A, atan2_sml_norm    // value is discarded; FR_tmp and sml_norm are the same register
1028            nop.i 999
1029 }
1030 { .mfb
1031            nop.m 999
1032            fma.d        f8              = atan2_sgnXY, atan2_A, atan2_P    // result = P + sgnXY * A
1033            br.ret.sptk  b0
1034 ;;
1035 }
1036
1037 ATAN2_ERROR:
1038 // Reached when X and Y are both zero: put the default result in f10 and error tag 37 in atan2_GR_tag.
1039 { .mfi
1040           nop.m 999
1041           fcmp.eq.unc.s1 p10,p11       = atan2_sgnX,f1    // p10 = X is +0, p11 = X is -0
1042           nop.i 999
1043 }
1044 ;;
1045
1046 { .mfi
1047           mov        atan2_GR_tag     = 37
1048 (p10)     fmerge.s     f10             = atan2_sgnY, f0    // zero with the sign of Y
1049           nop.i 999
1050 }
1051 { .mfi
1052           nop.m 999
1053 (p11)     fma.d        f10            = atan2_sgnY, atan2_pi, f0    // sgnY * pi
1054           nop.i 999
1055 ;;
1056 }
1057 .endp atan2#
1058 ASM_SIZE_DIRECTIVE(atan2#)
1059
1060
1061 // Stack operations when calling error support.
1062 //       (1)               (2)                          (3) (call)              (4)
1063 //   sp   -> +          psp -> +                     psp -> +                   sp -> +
1064 //           |                 |                            |                         |
1065 //           |                 | <- GR_Y               R3 ->| <- GR_RESULT            | -> f8
1066 //           |                 |                            |                         |
1067 //           | <-GR_Y      Y2->|                       Y2 ->| <- GR_Y                 |
1068 //           |                 |                            |                         |
1069 //           |                 | <- GR_X               X1 ->|                         |
1070 //           |                 |                            |                         |
1071 //  sp-64 -> +          sp ->  +                     sp ->  +                         +
1072 //    save ar.pfs          save b0                                               restore gp
1073 //    save gp                                                                    restore ar.pfs
1074
1075 // Spills f8, f9 and f10 to a 64-byte stack frame, calls __libm_error_support, and returns the value it leaves in the result slot.
1076 .proc __libm_error_region
1077 __libm_error_region:
1078 .prologue
1079 // (1)
1080 { .mfi
1081         add   GR_Parameter_Y=-32,sp             // Address (old sp - 32) where f8 is stored below
1082         nop.f 999
1083 .save   ar.pfs,GR_SAVE_PFS
1084         mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
1085 }
1086 { .mfi
1087 .fframe 64
1088         add sp=-64,sp                          // Create new stack
1089         nop.f 0
1090         mov GR_SAVE_GP=gp                      // Save gp
1091 };;
1092
1093
1094 // (2)
1095 { .mmi
1096         stfd [GR_Parameter_Y] = f8,16         // Store f8 (Y), then advance the pointer by 16 to the result slot
1097         add GR_Parameter_X = 16,sp            // Address (new sp + 16) where f9 is stored below
1098 .save   b0, GR_SAVE_B0
1099         mov GR_SAVE_B0=b0                     // Save b0
1100 };;
1101
1102 .body
1103 // (3)
1104 { .mib
1105         stfd [GR_Parameter_X] = f9                   // Store f9 (X) on stack
1106         add   GR_Parameter_RESULT = 0,GR_Parameter_Y // Result slot address (new sp + 48)
1107         nop.b 0
1108 }
1109 { .mib
1110         stfd [GR_Parameter_Y] = f10                  // Store the default result (f10) in the result slot
1111         add   GR_Parameter_Y = -16,GR_Parameter_Y    // Point GR_Parameter_Y back at the stored f8
1112         br.call.sptk b0=__libm_error_support#        // Call error handling function
1113 };;
1114 { .mmi
1115         nop.m 0
1116         nop.m 0
1117         add   GR_Parameter_RESULT = 48,sp      // Recompute the result slot address after the call
1118 };;
1119
1120 // (4)
1121 { .mmi
1122         ldfd  f8 = [GR_Parameter_RESULT]       // Get return result off stack
1123 .restore sp
1124         add   sp = 64,sp                       // Restore stack pointer
1125         mov   b0 = GR_SAVE_B0                  // Restore return address
1126 };;
1127 { .mib
1128         mov   gp = GR_SAVE_GP                  // Restore gp
1129         mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
1130         br.ret.sptk     b0                     // Return
1131 };;
1132
1133 .endp __libm_error_region
1134 ASM_SIZE_DIRECTIVE(__libm_error_region)
1135
1136 .type   __libm_error_support#,@function
1137 .global __libm_error_support#