sysdeps/ia64/fpu/s_erff.S

   1 .file "erff.s"
   2
   3
   4 // Copyright (c) 2001 - 2005, Intel Corporation
   5 // All rights reserved.
   6 //
   7 // Contributed 2001 by the Intel Numerics Group, Intel Corporation
   8 //
   9 // Redistribution and use in source and binary forms, with or without
  10 // modification, are permitted provided that the following conditions are
  11 // met:
  12 //
  13 // * Redistributions of source code must retain the above copyright
  14 // notice, this list of conditions and the following disclaimer.
  15 //
  16 // * Redistributions in binary form must reproduce the above copyright
  17 // notice, this list of conditions and the following disclaimer in the
  18 // documentation and/or other materials provided with the distribution.
  19 //
  20 // * The name of Intel Corporation may not be used to endorse or promote
  21 // products derived from this software without specific prior written
  22 // permission.
  23
  24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35 //
  36 // Intel Corporation is the author of this code, and requests that all
  37 // problem reports or change requests be submitted to it directly at
  38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
  39 //
  40 // History
  41 //==============================================================
  42 // 08/14/01 Initial version
  43 // 05/20/02 Cleaned up namespace and sf0 syntax
  44 // 02/06/03 Reordered header: .section, .global, .proc, .align
  45 // 03/31/05 Reformatted delimiters between data tables
  46 //
  47 // API
  48 //==============================================================
  49 // float erff(float)
  50 //
  51 // Overview of operation
  52 //==============================================================
  53 // Background
  54 //
  55 //
  56 // There are 8 paths:
  57 // 1. x = +/-0.0
  58 //    Return erff(x) = +/-0.0
  59 //
  60 // 2. 0.0 < |x| < 0.125
  61 //    Return erff(x) = x *Pol3(x^2),
  62 //    where Pol3(x^2) = C3*x^6 + C2*x^4 + C1*x^2 + C0
  63 //
  64 // 3. 0.125 <= |x| < 4.0
  65 //    Return erff(x) = sign(x)*PolD(x)*PolC(|x|) + sign(x)*PolA(|x|),
  66 //    where sign(x)*PolD(x) = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4),
  67 //          PolC(|x|) = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0,
  68 //          PolA(|x|) = A3*|x|^3 + A2*x^2 + A1*|x| + A0
  69 //
  70 //    Actually, the range 0.125 <= |x| < 4.0 is split into 5 subranges.
  71 //    For each subrange there is a particular set of coefficients.
  72 //    Below is the list of subranges:
  73 //    3.1 0.125 <= |x| < 0.25
  74 //    3.2 0.25 <= |x| < 0.5
  75 //    3.3 0.5 <= |x| < 1.0
  76 //    3.4 1.0 <= |x| < 2.0
  77 //    3.5 2.0 <= |x| < 4.0
  78 //
  79 // 4. 4.0 <= |x| < +INF
  80 //    Return erff(x) = sign(x)*(1.0d - 2^(-53))
  81 //
  82 // 5. |x| = INF
  83 //    Return erff(x) = sign(x) * 1.0
  84 //
  85 // 6. x = [S,Q]NaN
  86 //    Return erff(x) = QNaN
  87 //
  88 // 7. x is positive denormal
  89 //    Return erff(x) = C0*x - x^2,
  90 //    where C0 = 2.0/sqrt(Pi)
  91 //
  92 // 8. x is negative denormal
  93 //    Return erff(x) = C0*x + x^2,
  94 //    where C0 = 2.0/sqrt(Pi)
  95 //
  96 // Registers used
  97 //==============================================================
  98 // Floating Point registers used:
  99 // f8, input
 100 // f32 -> f59
 101
 102 // General registers used:
 103 // r32 -> r45, r2, r3
 104
 105 // Predicate registers used:
 106 // p0, p6 -> p12, p14, p15
 107
 108 // p6           to filter out case when x = [Q,S]NaN or +/-0
 109 // p7           to filter out case when x = denormal
 110 // p8           set if |x| >= 0.125, used also to process denormal input
 111 // p9           to filter out case when |x| = inf
 112 // p10          to filter out case when |x| < 0.125
 113 // p11          to filter out case when 0.125 <= |x| < 4.0
 114 // p12          to filter out case when |x| >= 4.0
 115 // p14          set to 1 for positive x
 116 // p15          set to 1 for negative x
 117
 118 // Assembly macros (symbolic names for the general and floating-point registers used by erff)
 119 //==============================================================
 120 rDataPtr           = r2  // address of erff_data, later advanced by 512 or 520 to reach the scalar constants
 121 rDataPtr1          = r3  // not referenced in the visible code
 122 // r33-r45 below lie inside the 14 local registers (r32-r45) requested by the alloc at function entry
 123 rBias              = r33 // 4 * (biased exponent of x - 124) when |x| >= 0.125, or 0x14 when |x| < 0.125
 124 rCoeffAddr3        = r34 // address of the C2,C3 pair, post-incremented to the D0,D1 pair
 125 rCoeffAddr1        = r35 // address of the C0,C1 pair, then advanced by 48 to the D2,B0 pair
 126 rCoeffAddr2        = r36 // address of the A0,A1 pair, post-incremented to the A2,A3 pair
 127 rOffset2           = r37 // bits of x with sign and top 2 significand bits cleared, then shifted right by 21
 128 rBias2             = r38 // 0x1F0 = 4 * 124 (124 is the biased exponent of 0.125f)
 129 rMask              = r39 // 0x806 << 20 = 0x80600000: sign bit and 2 top significand bits
 130 rArg               = r40 // single-precision bit pattern of x (from getf.s)
 131 rBound             = r41 // 0x3E0 << 20 = 0x3E000000: bit pattern of 0.125f
 132 rSignBit           = r42 // 1 << 31: sign-bit mask
 133 rAbsArg            = r43 // bit pattern of |x| (rArg with the sign bit cleared)
 134 rDataPtr2          = r44 // not referenced in the visible code
 135 rSaturation        = r45 // 0x408 << 20 = 0x40800000: bit pattern of 4.0f
 136
 137 //==============================================================
 138 fA0                = f32 // A0 of PolA, also holds the 1.0 - epsilon constant on the saturation path
 139 fA1                = f33 // A1 of PolA
 140 fA2                = f34 // A2 of PolA
 141 fA3                = f35 // A3 of PolA
 142 fC0                = f36 // C0 of PolC, reused on the denormal path for 2.0/sqrt(Pi) and then C0*x
 143 fC1                = f37 // C1 of PolC
 144 fC2                = f38 // C2 of PolC
 145 fC3                = f39 // C3 of PolC
 146 fD0                = f40 // D0 of PolD
 147 fD1                = f41 // D1 of PolD
 148 fD2                = f42 // D2 of PolD
 149 fB0                = f43 // B0, later overwritten with B0*x^4
 150 fArgSqr            = f44 // x^2
 151 fAbsArg            = f45 // |x|
 152 fSignumX           = f46 // signum(x): sign of x merged with 1.0
 153 fArg4              = f47 // x^4
 154 fArg4Sgn           = f48 // sign(x)*|x|^4
 155 fArg3              = f49 // |x|^3
 156 fArg3Sgn           = f50 // sign(x)*|x|^3
 157 fArg7Sgn           = f51 // sign(x)*|x|^7
 158 fArg6Sgn           = f52 // sign(x)*|x|^6
 159 fPolC              = f53 // running value of PolC
 160 fPolCTmp           = f54 // C1*|x| + C0 (C1*x^2 + C0 on the near-zero path)
 161 fPolA              = f55 // running value of PolA
 162 fPolATmp           = f56 // A3*|x| + A2
 163 fPolD              = f57 // running value of PolD
 164 fPolDTmp           = f58 // sign(x)*(|x|^7 + D2*x^6)
 165 fArgSqrSgn         = f59 // sign(x)*|x|^2, written once and not read in the visible code
 166
 167 // Data tables
 168 //==============================================================
 169
 170 RODATA
 171 // erff_data layout in byte offsets from the label: 0-319 five 64-byte C/D/B sets, 320-351 near-zero set, 352-511 five 32-byte A sets, 512 and 520 scalar constants
 172 .align 16
 173 // Each data8 entry is 8 bytes and the code fetches entries with ldfpd (pairs) or ldfd, so the order below must match the hard-coded offsets in erff
 174 LOCAL_OBJECT_START(erff_data)
 175 // Polynomial coefficients for the erf(x), 0.125 <= |x| < 0.25 (C, D, B set at offset 0)
 176 data8 0xBE4218BB56B49E66 // C0
 177 data8 0x3F7AFB8315DA322B // C1
 178 data8 0x3F615D6EBEE0CA32 // C2
 179 data8 0xBF468D71CF4F0918 // C3
 180 data8 0x40312115B0932F24 // D0
 181 data8 0xC0160D6CD0991EA3 // D1
 182 data8 0xBFE04A567A6DBE4A // D2
 183 data8 0xBF4207BC640D1509 // B0
 184 // Polynomial coefficients for the erf(x), 0.25 <= |x| < 0.5 (C, D, B set at offset 64)
 185 data8 0x3F90849356383F58 // C0
 186 data8 0x3F830BD5BA240F09 // C1
 187 data8 0xBF3FA4970E2BCE23 // C2
 188 data8 0xBF6061798E58D0FD // C3
 189 data8 0xBF68C0D83DD22E02 // D0
 190 data8 0x401C0A9EE4108F94 // D1
 191 data8 0xC01056F9B5E387F5 // D2
 192 data8 0x3F1C9744E36A5706 // B0
 193 // Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0 (C, D, B set at offset 128)
 194 data8 0x3F85F7D419A13DE3 // C0
 195 data8 0x3F791A13FF66D45A // C1
 196 data8 0x3F46B17B16B5929F // C2
 197 data8 0xBF5124947A8BF45E // C3
 198 data8 0x3FA1B3FD95EA9564 // D0
 199 data8 0x40250CECD79A020A // D1
 200 data8 0xC0190DC96FF66CCD // D2
 201 data8 0x3F4401AE28BA4DD5 // B0
 202 // Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0 (C, D, B set at offset 192)
 203 data8 0xBF49E07E3584C3AE // C0
 204 data8 0x3F3166621131445C // C1
 205 data8 0xBF65B7FC1EAC2099 // C2
 206 data8 0x3F508C6BD211D736 // C3
 207 data8 0xC053FABD70601067 // D0
 208 data8 0x404A06640EE87808 // D1
 209 data8 0xC0283F30817A3F08 // D2
 210 data8 0xBF2F6DBBF4D6257F // B0
 211 // Polynomial coefficients for the erf(x), 2.0 <= |x| < 4.0 (C, D, B set at offset 256)
 212 data8 0xBF849855D67E9407 // C0
 213 data8 0x3F5ECA5FEC01C70C // C1
 214 data8 0xBF483110C30FABA4 // C2
 215 data8 0x3F1618DA72860403 // C3
 216 data8 0xC08A5C9D5FE8B9F6 // D0
 217 data8 0x406EFF5F088CEC4B // D1
 218 data8 0xC03A5743DF38FDE0 // D2
 219 data8 0xBEE397A9FA5686A2 // B0
 220 // Polynomial coefficients for the erf(x), -0.125 < x < 0.125 (offset 320, selected with rBias = 0x14)
 221 data8 0x3FF20DD7504270CB // C0
 222 data8 0xBFD8127465AFE719 // C1
 223 data8 0x3FBCE2D77791DD77 // C2
 224 data8 0xBF9B582755CDF345 // C3
 225 // Polynomial coefficients for the erf(x), 0.125 <= |x| < 0.25 (A set at offset 352)
 226 data8 0xBD54E7E451AF0E36 // A0
 227 data8 0x3FF20DD75043FE20 // A1
 228 data8 0xBE05680ACF8280E4 // A2
 229 data8 0xBFD812745E92C3D3 // A3
 230 // Polynomial coefficients for the erf(x), 0.25 <= |x| < 0.5 (A set at offset 384)
 231 data8 0xBE1ACEC2859CB55F // A0
 232 data8 0x3FF20DD75E8D2B64 // A1
 233 data8 0xBEABC6A83208FCFC // A2
 234 data8 0xBFD81253E42E7B99 // A3
 235 // Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0 (A set at offset 416)
 236 data8 0x3EABD5A2482B4979 // A0
 237 data8 0x3FF20DCAA52085D5 // A1
 238 data8 0x3F13A994A348795B // A2
 239 data8 0xBFD8167B2DFCDE44 // A3
 240 // Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0 (A set at offset 448)
 241 data8 0xBF5BA377DDAB4E17 // A0
 242 data8 0x3FF2397F1D8FC0ED // A1
 243 data8 0xBF9945BFC1915C21 // A2
 244 data8 0xBFD747AAABB690D8 // A3
 245 // Polynomial coefficients for the erf(x), 2.0 <= |x| < 4.0 (A set at offset 480)
 246 data8 0x3FF0E2920E0391AF // A0
 247 data8 0xC00D249D1A95A5AE // A1
 248 data8 0x40233905061C3803 // A2
 249 data8 0xC027560B851F7690 // A3
 250 // Scalar constants: offset 512 is used by erff_saturation, offset 520 by erff_denormal
 251 data8 0x3FEFFFFFFFFFFFFF // 1.0 - epsilon (this bit pattern is 1.0 - 2^(-53), the largest double below 1.0)
 252 data8 0x3FF20DD750429B6D // C0 = 2.0/sqrt(Pi)
 253 LOCAL_OBJECT_END(erff_data)
 254
 255
 256 .section .text
 257 GLOBAL_LIBM_ENTRY(erff)
 258 // float erff(float): x arrives in f8 and every exit path writes the result to f8 before br.ret b0
 259 { .mfi
 260       alloc          r32 = ar.pfs, 0, 14, 0, 0    // 14 local registers: r32-r45
 261       fmerge.s       fAbsArg = f1, f8             // |x|
 262       addl           rMask = 0x806, r0            // becomes 0x80600000 after the shl by 20 below
 263 }
 264 { .mfi
 265       addl           rDataPtr = @ltoff(erff_data), gp // gp-relative slot, dereferenced by the ld8 below
 266       fma.s1         fArgSqr = f8, f8, f0         // x^2
 267       adds           rSignBit = 0x1, r0           // becomes 0x80000000 after the shl by 31 below
 268 }
 269 ;;
 270
 271 { .mfi
 272       getf.s         rArg = f8                    // x in GR
 273       fclass.m       p7,p0 = f8, 0x0b             // is x denormal ?
 274       // sign bit and 2 most bits in significand
 275       shl            rMask = rMask, 20
 276 }
 277 { .mfi
 278       ld8            rDataPtr = [rDataPtr]        // rDataPtr = address of erff_data
 279       nop.f          0
 280       adds           rBias2 = 0x1F0, r0           // 4 * 124, 124 = biased exponent of 0.125f
 281 }
 282 ;;
 283
 284 { .mfi
 285       nop.m          0
 286       fmerge.s       fSignumX = f8, f1            // signum(x)
 287       shl            rSignBit = rSignBit, 31      // mask for sign bit
 288 }
 289 { .mfi
 290       adds           rBound = 0x3E0, r0           // shifted left by 20 below
 291       nop.f          0
 292       adds           rSaturation = 0x408, r0      // shifted left by 20 two groups later
 293 }
 294 ;;
 295
 296 { .mfi
 297       andcm          rOffset2 = rArg, rMask       // clear sign bit and top 2 significand bits
 298       fclass.m       p6,p0 = f8, 0xc7             // is x [S,Q]NaN or +/-0 ?
 299       shl            rBound = rBound, 20          // 0.125f in GR
 300 }
 301 { .mfb
 302       andcm          rAbsArg = rArg, rSignBit     // |x| in GR
 303       nop.f          0
 304 (p7)  br.cond.spnt   erff_denormal               // branch out if x is denormal
 305 }
 306 ;;
 307
 308 { .mfi
 309       adds           rCoeffAddr2 = 352, rDataPtr  // offset 352: first A0 entry of erff_data
 310       fclass.m       p9,p0 = f8, 0x23            // is x +/- inf?
 311       shr            rOffset2 = rOffset2, 21      // 4 * biased exponent of x
 312 }
 313 { .mfi
 314       cmp.lt         p10, p8 = rAbsArg, rBound   // |x| < 0.125?
 315       nop.f          0
 316       adds           rCoeffAddr3 = 16, rDataPtr   // offset 16: first C2 entry of erff_data
 317 }
 318 ;;
 319 // rBias selects the coefficient set: it is scaled by 16 for the 64-byte C/D/B sets and by 8 for the 32-byte A sets
 320 { .mfi
 321 (p8)  sub            rBias = rOffset2, rBias2     // |x| >= 0.125: 4 * (biased exponent - 124)
 322       fma.s1         fArg4 = fArgSqr, fArgSqr, f0 // x^4
 323       shl            rSaturation = rSaturation, 20// 4.0 in GR (saturation bound)
 324 }
 325 { .mfb
 326 (p10) adds           rBias = 0x14, r0             // |x| < 0.125: 0x14 * 16 = 320, the near-zero set
 327 (p6)  fma.s.s0       f8 = f8,f1,f8                // NaN or +/-0
 328 (p6)  br.ret.spnt    b0                           // exit for x = NaN or +/-0
 329 }
 330 ;;
 331
 332 { .mfi
 333       shladd         rCoeffAddr1 = rBias, 4, rDataPtr // rDataPtr + 16*rBias: C0,C1 of the selected set
 334       fma.s1         fArg3Sgn = fArgSqr, f8, f0  // sign(x)*|x|^3
 335       // is |x| < 4.0?
 336       cmp.lt         p11, p12 = rAbsArg, rSaturation
 337 }
 338 { .mfi
 339       shladd         rCoeffAddr3 = rBias, 4, rCoeffAddr3 // rDataPtr + 16 + 16*rBias: C2,C3
 340       fma.s1         fArg3 = fArgSqr, fAbsArg, f0 // |x|^3
 341       shladd         rCoeffAddr2 = rBias, 3, rCoeffAddr2 // rDataPtr + 352 + 8*rBias: A0,A1
 342 }
 343 ;;
 344 // Coefficient loads are predicated on p11 (|x| < 4.0), p12 instead prepares the saturation constant
 345 { .mfi
 346 (p11) ldfpd          fC0, fC1 = [rCoeffAddr1]
 347 (p9)  fmerge.s       f8 = f8,f1                   // +/- inf
 348 (p12) adds           rDataPtr = 512, rDataPtr     // offset 512: the 1.0 - epsilon constant
 349 }
 350 { .mfb
 351 (p11) ldfpd          fC2, fC3 = [rCoeffAddr3], 16 // post-increment to the D0,D1 pair
 352       nop.f          0
 353 (p9)  br.ret.spnt    b0                           // exit for x = +/- inf
 354 }
 355 ;;
 356
 357 { .mfi
 358 (p11) ldfpd          fA0, fA1 = [rCoeffAddr2], 16 // post-increment to the A2,A3 pair
 359       nop.f          0
 360       nop.i          0
 361 }
 362 { .mfi
 363       add            rCoeffAddr1 = 48, rCoeffAddr1 // advance from the C0,C1 pair to the D2,B0 pair
 364       nop.f          0
 365       nop.i          0
 366 }
 367 ;;
 368
 369 { .mfi
 370 (p11) ldfpd          fD0, fD1 = [rCoeffAddr3]
 371       nop.f          0
 372       nop.i          0
 373 }
 374 { .mfb
 375 (p11) ldfpd          fD2, fB0 = [rCoeffAddr1]
 376       // sign(x)*|x|^2
 377       fma.s1         fArgSqrSgn = fArgSqr, fSignumX, f0
 378 (p10) br.cond.spnt   erff_near_zero               // |x| < 0.125 is handled separately
 379 }
 380 ;;
 381
 382 { .mfi
 383 (p11) ldfpd          fA2, fA3 = [rCoeffAddr2], 16
 384       fcmp.lt.s1     p15, p14 = f8,f0             // p15: x < 0, p14: otherwise
 385       nop.i          0
 386 }
 387 { .mfb
 388 (p12) ldfd           fA0 = [rDataPtr]             // 1.0 - epsilon, consumed by erff_saturation
 389       fma.s1         fArg4Sgn = fArg4, fSignumX, f0 // sign(x)*|x|^4
 390 (p12) br.cond.spnt   erff_saturation
 391 }
 392 ;;
 393 { .mfi
 394       nop.m          0
 395       fma.s1         fArg7Sgn = fArg4, fArg3Sgn, f0  // sign(x)*|x|^7
 396       nop.i          0
 397 }
 398 { .mfi
 399       nop.m          0
 400       fma.s1         fArg6Sgn = fArg3, fArg3Sgn, f0  // sign(x)*|x|^6
 401       nop.i          0
 402 }
 403 ;;
 404
 405 { .mfi
 406       nop.m          0
 407       fma.s1         fPolC = fC3, fAbsArg, fC2    // C3*|x| + C2
 408       nop.i          0
 409 }
 410 { .mfi
 411       nop.m          0
 412       fma.s1         fPolCTmp = fC1, fAbsArg, fC0 // C1*|x| + C0
 413       nop.i          0
 414 };;
 415
 416 { .mfi
 417       nop.m          0
 418       fma.s1         fPolA = fA1, fAbsArg, fA0    // A1*|x| + A0
 419       nop.i          0
 420 }
 421 ;;
 422
 423 { .mfi
 424       nop.m          0
 425       fma.s1         fPolD = fD1, fAbsArg, fD0    // D1*|x| + D0
 426       nop.i          0
 427 }
 428 { .mfi
 429       nop.m          0
 430       // sign(x)*(|x|^7 + D2*x^6)
 431       fma.s1         fPolDTmp = fArg6Sgn, fD2, fArg7Sgn
 432       nop.i          0
 433 };;
 434
 435 { .mfi
 436       nop.m          0
 437       fma.s1         fPolATmp = fA3, fAbsArg, fA2  // A3*|x| + A2
 438       nop.i          0
 439 }
 440 { .mfi
 441       nop.m          0
 442       fma.s1         fB0 = fB0, fArg4, f0          // B0*x^4
 443       nop.i          0
 444 };;
 445
 446 { .mfi
 447       nop.m          0
 448       // C3*|x|^3 + C2*x^2 + C1*|x| + C0
 449       fma.s1         fPolC = fPolC, fArgSqr, fPolCTmp
 450       nop.i          0
 451 }
 452 ;;
 453
 454 { .mfi
 455       nop.m          0
 456       // PolD = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4)
 457       fma.d.s1       fPolD = fPolD, fArg4Sgn, fPolDTmp
 458       nop.i          0
 459 }
 460 ;;
 461
 462 { .mfi
 463       nop.m          0
 464       // PolA = A3|x|^3 + A2*x^2 + A1*|x| + A0
 465       fma.d.s1       fPolA = fPolATmp, fArgSqr, fPolA
 466       nop.i          0
 467 }
 468 ;;
 469
 470 { .mfi
 471       nop.m          0
 472       // PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0
 473       fma.d.s1       fPolC = fPolC, f1, fB0
 474       nop.i          0
 475 }
 476 ;;
 477 // fPolD already carries sign(x), so PolA is added for positive x and subtracted for negative x
 478 { .mfi
 479       nop.m          0
 480 (p14) fma.s.s0       f8 = fPolC, fPolD, fPolA     // for positive x
 481       nop.i          0
 482 }
 483 { .mfb
 484       nop.m          0
 485 (p15) fms.s.s0       f8 = fPolC, fPolD, fPolA     // for negative x
 486       br.ret.sptk    b0                           // Exit for 0.125 <=|x|< 4.0
 487 };;
 488
 489
 490 // Here if |x| < 0.125 (zero, NaN and denormal inputs left earlier), with fC0-fC3 loaded from the near-zero set, fArgSqr = x^2 and fArg4 = x^4
 491 erff_near_zero:
 492 { .mfi
 493       nop.m          0
 494       fma.s1         fPolC = fC3, fArgSqr, fC2    // C3*x^2 + C2
 495       nop.i          0
 496 }
 497 { .mfi
 498       nop.m          0
 499       fma.s1         fPolCTmp = fC1, fArgSqr, fC0  // C1*x^2 + C0
 500       nop.i          0
 501 };;
 502
 503 { .mfi
 504       nop.m          0
 505       fma.s1         fPolC = fPolC, fArg4, fPolCTmp // C3*x^6 + C2*x^4 + C1*x^2 + C0
 506       nop.i          0
 507 };;
 508
 509 { .mfb
 510       nop.m          0
 511       // x*(C3*x^6 + C2*x^4 + C1*x^2 + C0)
 512       fma.s.s0       f8 = fPolC, f8, f0
 513       br.ret.sptk    b0                           // Exit for |x| < 0.125
 514 };;
 515
 516 // Here if 4.0 <= |x| < +inf, with fA0 already loaded from offset 512 of erff_data (the 1.0 - epsilon constant)
 517 erff_saturation:
 518 { .mfb
 519       nop.m          0
 520       fma.s.s0       f8 = fA0, fSignumX, f0       // sign(x)*(1.0d - 2^(-53))
 521       // Exit for 4.0 <= |x| < +inf
 522       br.ret.sptk    b0                           // Exit for 4.0 <=|x|< +inf
 523 }
 524 ;;
 525
 526 // Here if x is single precision denormal, the result is C0*x - x^2 for positive x and C0*x + x^2 for negative x
 527 erff_denormal:
 528 { .mfi
 529       adds           rDataPtr = 520, rDataPtr     // address of C0
 530       fclass.m       p7,p8 = f8, 0x0a             // is x -denormal ? (p7: yes, p8: no)
 531       nop.i          0
 532 }
 533 ;;
 534 { .mfi
 535       ldfd           fC0 = [rDataPtr]             // C0
 536       nop.f          0
 537       nop.i          0
 538 }
 539 ;;
 540 { .mfi
 541       nop.m          0
 542       fma.s1         fC0 = fC0,f8,f0              // C0*x
 543       nop.i          0
 544 }
 545 ;;
 546 { .mfi
 547       nop.m          0
 548 (p7)  fma.s.s0       f8 = f8,f8,fC0               // -denormal
 549       nop.i          0
 550 }
 551 { .mfb
 552       nop.m          0
 553 (p8)  fnma.s.s0      f8 = f8,f8,fC0               // +denormal
 554       br.ret.sptk    b0                           // Exit for denormal
 555 }
 556 ;;
 557 // End of the erff procedure opened by GLOBAL_LIBM_ENTRY(erff)
 558 GLOBAL_LIBM_END(erff)