math.c

   1 /*
   2  * "slow math" ripping/stitching/cosmetics from musl lib 1.2.2, power of open
   3  * source
   4  */
   5 #include <float.h>
   6 #include <ulinux/compiler_types.h>
   7 #include <ulinux/types.h>
   8 #define u32 ulinux_u32
   9 #define s32 ulinux_s32
  10 #define f32 ulinux_f32
  11 #define s64 ulinux_s64
  12 #define u64 ulinux_u64
  13 #define f64 ulinux_f64
  14 /* cosmetized locally */
  15 #define asu64(f) ((union{f64 _f; u64 _i;}){f})._i
  16 #define asf64(i) ((union{u64 _i; f64 _f;}){i})._f
  17 #define WANT_ROUNDING 1
  18 #ifdef __GNUC__
  19 #define predict_true(x) __builtin_expect(!!(x), 1)
  20 #define predict_false(x) __builtin_expect(x, 0)
  21 #else
  22 #define predict_true(x) (x)
  23 #define predict_false(x) (x)
  24 #endif
  25 #define NAN             (0.0f/0.0f)
  26 #define INFINITY        1e5000f
  27 /*
  28  * the following is to workaround some compiler operations/optimizations
  29  *
  30  * XXX: OMFG! we have to stop that insanity non-sense and write 100% assembly
  31  * (fast) or 100% C ("slow") for this kind of code. Hope riscv will be a success.
  32  */
  33 static inline f64 eval_as_f64(f64 x)
  34 {
  35         f64 y = x;
  36         return y;
  37 }
  38 static inline f64 fp_barrier(f64 x)
  39 {
  40         volatile f64 y = x;
  41         return y;
  42 }
  43 static f64 __math_divzero(u32 sign)
  44 {
  45         return fp_barrier(sign ? -1.0 : 1.0) / 0.0;
  46 }
  47 static f64 __math_invalid(f64 x)
  48 {
  49         return (x - x) / (x - x);
  50 }
  51 static void fp_force_eval_f32(f32 x)
  52 {
  53         volatile f32 y;
  54         y = x;
  55 }
  56 static void fp_force_eval(f64 x)
  57 {
  58         volatile f64 y;
  59         y = x;
  60 }
  61 static f64 __math_xflow(u32 sign, f64 y)
  62 {
  63         return eval_as_f64(fp_barrier(sign ? -y : y) * y);
  64 }
  65 static f64 __math_uflow(u32 sign)
  66 {
  67         return __math_xflow(sign, 0x1p-767);
  68 }
  69 static f64 __math_oflow(u32 sign)
  70 {
  71         return __math_xflow(sign, 0x1p769);
  72 }
  73 static u64 F64_BITS(f64 __f)
  74 {
  75         union {f64 __f; u64 __i;} __u;
  76         __u.__f = __f;
  77         return __u.__i;
  78 }
  79 #define isnan(x) ((F64_BITS(x) & -1UL>>1) > 0x7ffUL<<52)
  80 #define EPS DBL_EPSILON
  81 #define toint 1/EPS
  82 static f64 floor(f64 x)
  83 {
  84         union {f64 f; u64 i;} u = {x};
  85         s32 e = u.i >> 52 & 0x7ff;
  86         f64 y;
  87
  88         if (e >= 0x3ff+52 || x == 0)
  89                 return x;
  90         /* y = int(x) - x, where int(x) is an integer neighbor of x */
  91         if (u.i >> 63)
  92                 y = x - toint + toint - x;
  93         else
  94                 y = x + toint - toint - x;
  95         /* special case because of non-nearest rounding modes */
  96         if (e <= 0x3ff-1) {
  97                 fp_force_eval(y);
  98                 return u.i >> 63 ? -1 : 0;
  99         }
 100         if (y > 0)
 101                 return x + y - 1;
 102         return x + y;
 103 }
 104 #undef toint
 105 static f64 trunc(f64 x)
 106 {
 107         union {f64 f; u64 i;} u;
 108         s32 e;
 109         u64 m;
 110
 111         u.f = x;
 112         e = (s32)(u.i >> 52 & 0x7ff) - 0x3ff + 12;
 113
 114         if (e >= 52 + 12)
 115                 return x;
 116         if (e < 12)
 117                 e = 1;
 118         m = -1UL >> e;
 119         if ((u.i & m) == 0)
 120                 return x;
 121         fp_force_eval(x + 0x1p120f);
 122         u.i &= ~m;
 123         return u.f;
 124 }
 125 #define toint 1/EPS
 126 static f64 round(f64 x)
 127 {
 128         union {f64 f; u64 i;} u;
 129         s32 e;
 130         f64 y;
 131
 132         u.f = x;
 133          e = u.i >> 52 & 0x7ff;
 134
 135         if (e >= 0x3ff+52)
 136                 return x;
 137         if (u.i >> 63)
 138                 x = -x;
 139         if (e < 0x3ff-1) {
 140                 /* raise inexact if x!=0 */
 141                 fp_force_eval(x + toint);
 142                 return 0*u.f;
 143         }
 144         y = x + toint - toint - x;
 145         if (y > 0.5)
 146                 y = y + x - 1;
 147         else if (y <= -0.5)
 148                 y = y + x + 1;
 149         else
 150                 y = y + x;
 151         if (u.i >> 63)
 152                 y = -y;
 153         return y;
 154 }
 155 #undef toint
 156 static f64 modf(f64 x, f64 *iptr)
 157 {
 158         union {f64 f; u64 i;} u = {x};
 159         u64 mask;
 160         s32 e = (s32)(u.i>>52 & 0x7ff) - 0x3ff;
 161
 162         /* no fractional part */
 163         if (e >= 52) {
 164                 *iptr = x;
 165                 if (e == 0x400 && u.i<<12 != 0) /* nan */
 166                         return x;
 167                 u.i &= 1UL<<63;
 168                 return u.f;
 169         }
 170
 171         /* no integral part*/
 172         if (e < 0) {
 173                 u.i &= 1UL<<63;
 174                 *iptr = u.f;
 175                 return x;
 176         }
 177
 178         mask = -1UL>>12>>e;
 179         if ((u.i & mask) == 0) {
 180                 *iptr = x;
 181                 u.i &= 1UL<<63;
 182                 return u.f;
 183         }
 184         u.i &= ~mask;
 185         *iptr = u.f;
 186         return x - u.f;
 187 }
 188 static f64 fmod(f64 x, f64 y)
 189 {
 190         union {f64 f; u64 i;} ux = {x}, uy = {y};
 191         s32 ex = ux.i>>52 & 0x7ff;
 192         s32 ey = uy.i>>52 & 0x7ff;
 193         s32 sx = ux.i>>63;
 194         u64 i;
 195
 196         /* in the followings uxi should be ux.i, but then gcc wrongly adds */
 197         /* float load/store to inner loops ruining performance and code size */
 198         u64 uxi = ux.i;
 199
 200         if (uy.i<<1 == 0 || isnan(y) || ex == 0x7ff)
 201                 return (x*y)/(x*y);
 202         if (uxi<<1 <= uy.i<<1) {
 203                 if (uxi<<1 == uy.i<<1)
 204                         return 0*x;
 205                 return x;
 206         }
 207
 208         /* normalize x and y */
 209         if (!ex) {
 210                 for (i = uxi<<12; i>>63 == 0; ex--, i <<= 1);
 211                 uxi <<= -ex + 1;
 212         } else {
 213                 uxi &= -1UL >> 12;
 214                 uxi |= 1UL << 52;
 215         }
 216         if (!ey) {
 217                 for (i = uy.i<<12; i>>63 == 0; ey--, i <<= 1);
 218                 uy.i <<= -ey + 1;
 219         } else {
 220                 uy.i &= -1UL >> 12;
 221                 uy.i |= 1UL << 52;
 222         }
 223
 224         /* x mod y */
 225         for (; ex > ey; ex--) {
 226                 i = uxi - uy.i;
 227                 if (i >> 63 == 0) {
 228                         if (i == 0)
 229                                 return 0*x;
 230                         uxi = i;
 231                 }
 232                 uxi <<= 1;
 233         }
 234         i = uxi - uy.i;
 235         if (i >> 63 == 0) {
 236                 if (i == 0)
 237                         return 0*x;
 238                 uxi = i;
 239         }
 240         for (; uxi>>52 == 0; uxi <<= 1, ex--);
 241
 242         /* scale result */
 243         if (ex > 0) {
 244                 uxi -= 1UL << 52;
 245                 uxi |= (u64)ex << 52;
 246         } else {
 247                 uxi >>= -ex + 1;
 248         }
 249         uxi |= (u64)sx << 63;
 250         ux.i = uxi;
 251         return ux.f;
 252 }
 253 /*
 254  * Double-precision log(x) function.
 255  *
 256  * Copyright (c) 2018, Arm Limited.
 257  * SPDX-License-Identifier: MIT
 258  */
 259 #define LOG_TABLE_BITS 7
 260 #define LOG_POLY_ORDER 6
 261 #define LOG_POLY1_ORDER 12
 262 #define N (1 << LOG_TABLE_BITS)
 263 struct log_data {
 264         f64 ln2hi;
 265         f64 ln2lo;
 266         f64 poly[LOG_POLY_ORDER - 1]; /* First coefficient is 1.  */
 267         f64 poly1[LOG_POLY1_ORDER - 1];
 268         struct {
 269                 f64 invc, logc;
 270         } tab[1 << LOG_TABLE_BITS];
 271 #if !__FP_FAST_FMA
 272         struct {
 273                 f64 chi, clo;
 274         } tab2[1 << LOG_TABLE_BITS];
 275 #endif
 276 } __log_data = {
 277 .ln2hi = 0x1.62e42fefa3800p-1,
 278 .ln2lo = 0x1.ef35793c76730p-45,
 279 .poly1 = {
 280 /* relative error: 0x1.c04d76cp-63 */
 281 /* in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval) */
 282 -0x1p-1,
 283 0x1.5555555555577p-2,
 284 -0x1.ffffffffffdcbp-3,
 285 0x1.999999995dd0cp-3,
 286 -0x1.55555556745a7p-3,
 287 0x1.24924a344de3p-3,
 288 -0x1.fffffa4423d65p-4,
 289 0x1.c7184282ad6cap-4,
 290 -0x1.999eb43b068ffp-4,
 291 0x1.78182f7afd085p-4,
 292 -0x1.5521375d145cdp-4,
 293 },
 294 .poly = {
 295 /* relative error: 0x1.926199e8p-56 */
 296 /* abs error: 0x1.882ff33p-65 */
 297 /* in -0x1.fp-9 0x1.fp-9 */
 298 -0x1.0000000000001p-1,
 299 0x1.555555551305bp-2,
 300 -0x1.fffffffeb459p-3,
 301 0x1.999b324f10111p-3,
 302 -0x1.55575e506c89fp-3,
 303 },
 304 /* Algorithm:
 305
 306         x = 2^k z
 307         log(x) = k ln2 + log(c) + log(z/c)
 308         log(z/c) = poly(z/c - 1)
 309
 310 where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls
 311 into the ith one, then table entries are computed as
 312
 313         tab[i].invc = 1/c
 314         tab[i].logc = (double)log(c)
 315         tab2[i].chi = (double)c
 316         tab2[i].clo = (double)(c - (double)c)
 317
 318 where c is near the center of the subinterval and is chosen by trying +-2^29
 319 floating point invc candidates around 1/center and selecting one for which
 320
 321         1) the rounding error in 0x1.8p9 + logc is 0,
 322         2) the rounding error in z - chi - clo is < 0x1p-66 and
 323         3) the rounding error in (double)log(c) is minimized (< 0x1p-66).
 324
 325 Note: 1) ensures that k*ln2hi + logc can be computed without rounding error,
 326 2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to
 327 a single rounding error when there is no fast fma for z*invc - 1, 3) ensures
 328 that logc + poly(z/c - 1) has small error, however near x == 1 when
 329 |log(x)| < 0x1p-4, this is not enough so that is special cased.  */
 330 .tab = {
 331 {0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2},
 332 {0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2},
 333 {0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2},
 334 {0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2},
 335 {0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2},
 336 {0x1.69147332f0cbap+0, -0x1.602d076180000p-2},
 337 {0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2},
 338 {0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2},
 339 {0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2},
 340 {0x1.614b36b9ddc14p+0, -0x1.49da7fda85000p-2},
 341 {0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2},
 342 {0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2},
 343 {0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2},
 344 {0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2},
 345 {0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2},
 346 {0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2},
 347 {0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2},
 348 {0x1.52aff42064583p+0, -0x1.1e9e129279000p-2},
 349 {0x1.50f22dbb2bddfp+0, -0x1.1956d2b48f800p-2},
 350 {0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2},
 351 {0x1.4d843cfde2840p+0, -0x1.0edd094ef9800p-2},
 352 {0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2},
 353 {0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2},
 354 {0x1.4880524d48434p+0, -0x1.feb224586f000p-3},
 355 {0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3},
 356 {0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3},
 357 {0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3},
 358 {0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3},
 359 {0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3},
 360 {0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3},
 361 {0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3},
 362 {0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3},
 363 {0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3},
 364 {0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3},
 365 {0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3},
 366 {0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3},
 367 {0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3},
 368 {0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3},
 369 {0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3},
 370 {0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3},
 371 {0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3},
 372 {0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3},
 373 {0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3},
 374 {0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3},
 375 {0x1.293726014b530p+0, -0x1.31b996b490000p-3},
 376 {0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3},
 377 {0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3},
 378 {0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3},
 379 {0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3},
 380 {0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3},
 381 {0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4},
 382 {0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4},
 383 {0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4},
 384 {0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4},
 385 {0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4},
 386 {0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4},
 387 {0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4},
 388 {0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4},
 389 {0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4},
 390 {0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4},
 391 {0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4},
 392 {0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4},
 393 {0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4},
 394 {0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4},
 395 {0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5},
 396 {0x1.0f5ee0f4e6bb3p+0, -0x1.dda8d4a774000p-5},
 397 {0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5},
 398 {0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5},
 399 {0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5},
 400 {0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5},
 401 {0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5},
 402 {0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5},
 403 {0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6},
 404 {0x1.06ab58c358f19p+0, -0x1.a55f152528000p-6},
 405 {0x1.059eea5ecf92cp+0, -0x1.63d62cf818000p-6},
 406 {0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6},
 407 {0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7},
 408 {0x1.02865137932a9p+0, -0x1.419355daa0000p-7},
 409 {0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8},
 410 {0x1.008040614b195p+0, -0x1.0040979240000p-9},
 411 {0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9},
 412 {0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7},
 413 {0x1.f6310b081992ep-1, 0x1.3cea3c4c28000p-6},
 414 {0x1.f25f63ceeadcdp-1, 0x1.b9fc114890000p-6},
 415 {0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5},
 416 {0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5},
 417 {0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5},
 418 {0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5},
 419 {0x1.e01e009609a56p-1, 0x1.07598e598c000p-4},
 420 {0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4},
 421 {0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4},
 422 {0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4},
 423 {0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4},
 424 {0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4},
 425 {0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4},
 426 {0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4},
 427 {0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4},
 428 {0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3},
 429 {0x1.bf583eeece73fp-1, 0x1.147858292b000p-3},
 430 {0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3},
 431 {0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3},
 432 {0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3},
 433 {0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3},
 434 {0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3},
 435 {0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3},
 436 {0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3},
 437 {0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3},
 438 {0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3},
 439 {0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3},
 440 {0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3},
 441 {0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3},
 442 {0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3},
 443 {0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3},
 444 {0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3},
 445 {0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3},
 446 {0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3},
 447 {0x1.8e6528effd79dp-1, 0x1.00e6c121fb800p-2},
 448 {0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2},
 449 {0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2},
 450 {0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2},
 451 {0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2},
 452 {0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2},
 453 {0x1.8060195f40260p-1, 0x1.2595fd7636800p-2},
 454 {0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2},
 455 {0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2},
 456 {0x1.79baa679725c2p-1, 0x1.377266dec1800p-2},
 457 {0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2},
 458 {0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2},
 459 },
 460 #if !__FP_FAST_FMA
 461 .tab2 = {
 462 {0x1.61000014fb66bp-1, 0x1.e026c91425b3cp-56},
 463 {0x1.63000034db495p-1, 0x1.dbfea48005d41p-55},
 464 {0x1.650000d94d478p-1, 0x1.e7fa786d6a5b7p-55},
 465 {0x1.67000074e6fadp-1, 0x1.1fcea6b54254cp-57},
 466 {0x1.68ffffedf0faep-1, -0x1.c7e274c590efdp-56},
 467 {0x1.6b0000763c5bcp-1, -0x1.ac16848dcda01p-55},
 468 {0x1.6d0001e5cc1f6p-1, 0x1.33f1c9d499311p-55},
 469 {0x1.6efffeb05f63ep-1, -0x1.e80041ae22d53p-56},
 470 {0x1.710000e86978p-1, 0x1.bff6671097952p-56},
 471 {0x1.72ffffc67e912p-1, 0x1.c00e226bd8724p-55},
 472 {0x1.74fffdf81116ap-1, -0x1.e02916ef101d2p-57},
 473 {0x1.770000f679c9p-1, -0x1.7fc71cd549c74p-57},
 474 {0x1.78ffffa7ec835p-1, 0x1.1bec19ef50483p-55},
 475 {0x1.7affffe20c2e6p-1, -0x1.07e1729cc6465p-56},
 476 {0x1.7cfffed3fc9p-1, -0x1.08072087b8b1cp-55},
 477 {0x1.7efffe9261a76p-1, 0x1.dc0286d9df9aep-55},
 478 {0x1.81000049ca3e8p-1, 0x1.97fd251e54c33p-55},
 479 {0x1.8300017932c8fp-1, -0x1.afee9b630f381p-55},
 480 {0x1.850000633739cp-1, 0x1.9bfbf6b6535bcp-55},
 481 {0x1.87000204289c6p-1, -0x1.bbf65f3117b75p-55},
 482 {0x1.88fffebf57904p-1, -0x1.9006ea23dcb57p-55},
 483 {0x1.8b00022bc04dfp-1, -0x1.d00df38e04b0ap-56},
 484 {0x1.8cfffe50c1b8ap-1, -0x1.8007146ff9f05p-55},
 485 {0x1.8effffc918e43p-1, 0x1.3817bd07a7038p-55},
 486 {0x1.910001efa5fc7p-1, 0x1.93e9176dfb403p-55},
 487 {0x1.9300013467bb9p-1, 0x1.f804e4b980276p-56},
 488 {0x1.94fffe6ee076fp-1, -0x1.f7ef0d9ff622ep-55},
 489 {0x1.96fffde3c12d1p-1, -0x1.082aa962638bap-56},
 490 {0x1.98ffff4458a0dp-1, -0x1.7801b9164a8efp-55},
 491 {0x1.9afffdd982e3ep-1, -0x1.740e08a5a9337p-55},
 492 {0x1.9cfffed49fb66p-1, 0x1.fce08c19bep-60},
 493 {0x1.9f00020f19c51p-1, -0x1.a3faa27885b0ap-55},
 494 {0x1.a10001145b006p-1, 0x1.4ff489958da56p-56},
 495 {0x1.a300007bbf6fap-1, 0x1.cbeab8a2b6d18p-55},
 496 {0x1.a500010971d79p-1, 0x1.8fecadd78793p-55},
 497 {0x1.a70001df52e48p-1, -0x1.f41763dd8abdbp-55},
 498 {0x1.a90001c593352p-1, -0x1.ebf0284c27612p-55},
 499 {0x1.ab0002a4f3e4bp-1, -0x1.9fd043cff3f5fp-57},
 500 {0x1.acfffd7ae1ed1p-1, -0x1.23ee7129070b4p-55},
 501 {0x1.aefffee510478p-1, 0x1.a063ee00edea3p-57},
 502 {0x1.b0fffdb650d5bp-1, 0x1.a06c8381f0ab9p-58},
 503 {0x1.b2ffffeaaca57p-1, -0x1.9011e74233c1dp-56},
 504 {0x1.b4fffd995badcp-1, -0x1.9ff1068862a9fp-56},
 505 {0x1.b7000249e659cp-1, 0x1.aff45d0864f3ep-55},
 506 {0x1.b8ffff987164p-1, 0x1.cfe7796c2c3f9p-56},
 507 {0x1.bafffd204cb4fp-1, -0x1.3ff27eef22bc4p-57},
 508 {0x1.bcfffd2415c45p-1, -0x1.cffb7ee3bea21p-57},
 509 {0x1.beffff86309dfp-1, -0x1.14103972e0b5cp-55},
 510 {0x1.c0fffe1b57653p-1, 0x1.bc16494b76a19p-55},
 511 {0x1.c2ffff1fa57e3p-1, -0x1.4feef8d30c6edp-57},
 512 {0x1.c4fffdcbfe424p-1, -0x1.43f68bcec4775p-55},
 513 {0x1.c6fffed54b9f7p-1, 0x1.47ea3f053e0ecp-55},
 514 {0x1.c8fffeb998fd5p-1, 0x1.383068df992f1p-56},
 515 {0x1.cb0002125219ap-1, -0x1.8fd8e64180e04p-57},
 516 {0x1.ccfffdd94469cp-1, 0x1.e7ebe1cc7ea72p-55},
 517 {0x1.cefffeafdc476p-1, 0x1.ebe39ad9f88fep-55},
 518 {0x1.d1000169af82bp-1, 0x1.57d91a8b95a71p-56},
 519 {0x1.d30000d0ff71dp-1, 0x1.9c1906970c7dap-55},
 520 {0x1.d4fffea790fc4p-1, -0x1.80e37c558fe0cp-58},
 521 {0x1.d70002edc87e5p-1, -0x1.f80d64dc10f44p-56},
 522 {0x1.d900021dc82aap-1, -0x1.47c8f94fd5c5cp-56},
 523 {0x1.dafffd86b0283p-1, 0x1.c7f1dc521617ep-55},
 524 {0x1.dd000296c4739p-1, 0x1.8019eb2ffb153p-55},
 525 {0x1.defffe54490f5p-1, 0x1.e00d2c652cc89p-57},
 526 {0x1.e0fffcdabf694p-1, -0x1.f8340202d69d2p-56},
 527 {0x1.e2fffdb52c8ddp-1, 0x1.b00c1ca1b0864p-56},
 528 {0x1.e4ffff24216efp-1, 0x1.2ffa8b094ab51p-56},
 529 {0x1.e6fffe88a5e11p-1, -0x1.7f673b1efbe59p-58},
 530 {0x1.e9000119eff0dp-1, -0x1.4808d5e0bc801p-55},
 531 {0x1.eafffdfa51744p-1, 0x1.80006d54320b5p-56},
 532 {0x1.ed0001a127fa1p-1, -0x1.002f860565c92p-58},
 533 {0x1.ef00007babcc4p-1, -0x1.540445d35e611p-55},
 534 {0x1.f0ffff57a8d02p-1, -0x1.ffb3139ef9105p-59},
 535 {0x1.f30001ee58ac7p-1, 0x1.a81acf2731155p-55},
 536 {0x1.f4ffff5823494p-1, 0x1.a3f41d4d7c743p-55},
 537 {0x1.f6ffffca94c6bp-1, -0x1.202f41c987875p-57},
 538 {0x1.f8fffe1f9c441p-1, 0x1.77dd1f477e74bp-56},
 539 {0x1.fafffd2e0e37ep-1, -0x1.f01199a7ca331p-57},
 540 {0x1.fd0001c77e49ep-1, 0x1.181ee4bceacb1p-56},
 541 {0x1.feffff7e0c331p-1, -0x1.e05370170875ap-57},
 542 {0x1.00ffff465606ep+0, -0x1.a7ead491c0adap-55},
 543 {0x1.02ffff3867a58p+0, -0x1.77f69c3fcb2ep-54},
 544 {0x1.04ffffdfc0d17p+0, 0x1.7bffe34cb945bp-54},
 545 {0x1.0700003cd4d82p+0, 0x1.20083c0e456cbp-55},
 546 {0x1.08ffff9f2cbe8p+0, -0x1.dffdfbe37751ap-57},
 547 {0x1.0b000010cda65p+0, -0x1.13f7faee626ebp-54},
 548 {0x1.0d00001a4d338p+0, 0x1.07dfa79489ff7p-55},
 549 {0x1.0effffadafdfdp+0, -0x1.7040570d66bcp-56},
 550 {0x1.110000bbafd96p+0, 0x1.e80d4846d0b62p-55},
 551 {0x1.12ffffae5f45dp+0, 0x1.dbffa64fd36efp-54},
 552 {0x1.150000dd59ad9p+0, 0x1.a0077701250aep-54},
 553 {0x1.170000f21559ap+0, 0x1.dfdf9e2e3deeep-55},
 554 {0x1.18ffffc275426p+0, 0x1.10030dc3b7273p-54},
 555 {0x1.1b000123d3c59p+0, 0x1.97f7980030188p-54},
 556 {0x1.1cffff8299eb7p+0, -0x1.5f932ab9f8c67p-57},
 557 {0x1.1effff48ad4p+0, 0x1.37fbf9da75bebp-54},
 558 {0x1.210000c8b86a4p+0, 0x1.f806b91fd5b22p-54},
 559 {0x1.2300003854303p+0, 0x1.3ffc2eb9fbf33p-54},
 560 {0x1.24fffffbcf684p+0, 0x1.601e77e2e2e72p-56},
 561 {0x1.26ffff52921d9p+0, 0x1.ffcbb767f0c61p-56},
 562 {0x1.2900014933a3cp+0, -0x1.202ca3c02412bp-56},
 563 {0x1.2b00014556313p+0, -0x1.2808233f21f02p-54},
 564 {0x1.2cfffebfe523bp+0, -0x1.8ff7e384fdcf2p-55},
 565 {0x1.2f0000bb8ad96p+0, -0x1.5ff51503041c5p-55},
 566 {0x1.30ffffb7ae2afp+0, -0x1.10071885e289dp-55},
 567 {0x1.32ffffeac5f7fp+0, -0x1.1ff5d3fb7b715p-54},
 568 {0x1.350000ca66756p+0, 0x1.57f82228b82bdp-54},
 569 {0x1.3700011fbf721p+0, 0x1.000bac40dd5ccp-55},
 570 {0x1.38ffff9592fb9p+0, -0x1.43f9d2db2a751p-54},
 571 {0x1.3b00004ddd242p+0, 0x1.57f6b707638e1p-55},
 572 {0x1.3cffff5b2c957p+0, 0x1.a023a10bf1231p-56},
 573 {0x1.3efffeab0b418p+0, 0x1.87f6d66b152bp-54},
 574 {0x1.410001532aff4p+0, 0x1.7f8375f198524p-57},
 575 {0x1.4300017478b29p+0, 0x1.301e672dc5143p-55},
 576 {0x1.44fffe795b463p+0, 0x1.9ff69b8b2895ap-55},
 577 {0x1.46fffe80475ep+0, -0x1.5c0b19bc2f254p-54},
 578 {0x1.48fffef6fc1e7p+0, 0x1.b4009f23a2a72p-54},
 579 {0x1.4afffe5bea704p+0, -0x1.4ffb7bf0d7d45p-54},
 580 {0x1.4d000171027dep+0, -0x1.9c06471dc6a3dp-54},
 581 {0x1.4f0000ff03ee2p+0, 0x1.77f890b85531cp-54},
 582 {0x1.5100012dc4bd1p+0, 0x1.004657166a436p-57},
 583 {0x1.530001605277ap+0, -0x1.6bfcece233209p-54},
 584 {0x1.54fffecdb704cp+0, -0x1.902720505a1d7p-55},
 585 {0x1.56fffef5f54a9p+0, 0x1.bbfe60ec96412p-54},
 586 {0x1.5900017e61012p+0, 0x1.87ec581afef9p-55},
 587 {0x1.5b00003c93e92p+0, -0x1.f41080abf0ccp-54},
 588 {0x1.5d0001d4919bcp+0, -0x1.8812afb254729p-54},
 589 {0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54},
 590 },
 591 #endif
 592 };
 593 #define T __log_data.tab
 594 #define T2 __log_data.tab2
 595 #define B __log_data.poly1
 596 #define A __log_data.poly
 597 #define Ln2hi __log_data.ln2hi
 598 #define Ln2lo __log_data.ln2lo
 599 #define OFF 0x3fe6000000000000
 600 /* Top 16 bits of a double.  */
 601 static inline u32 top16(f64 x)
 602 {
 603         return asu64(x) >> 48;
 604 }
 605 static f64 log(f64 x)
 606 {
 607         f64 w, z, r, r2, r3, y, invc, logc, kd, hi, lo;
 608         u64 ix, iz, tmp;
 609         u32 top;
 610         s32 k, i;
 611
 612         ix = asu64(x);
 613         top = top16(x);
 614 #define LO asu64(1.0 - 0x1p-4)
 615 #define HI asu64(1.0 + 0x1.09p-4)
 616         if (predict_false(ix - LO < HI - LO)) {
 617                 f64 rhi;
 618                 f64 rlo;
 619
 620                 /* Handle close to 1.0 inputs separately.  */
 621                 /* Fix sign of zero with downward rounding when x==1.  */
 622                 if (WANT_ROUNDING && predict_false(ix == asu64(1.0)))
 623                         return 0;
 624                 r = x - 1.0;
 625                 r2 = r * r;
 626                 r3 = r * r2;
 627                 y = r3 *
 628                     (B[1] + r * B[2] + r2 * B[3] +
 629                      r3 * (B[4] + r * B[5] + r2 * B[6] +
 630                            r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10])));
 631                 /* Worst-case error is around 0.507 ULP.  */
 632                 w = r * 0x1p27;
 633                 rhi = r + w - w;
 634                 rlo = r - rhi;
 635                 w = rhi * rhi * B[0]; /* B[0] == -0.5.  */
 636                 hi = r + w;
 637                 lo = r - hi + w;
 638                 lo += B[0] * rlo * (rhi + r);
 639                 y += lo;
 640                 y += hi;
 641                 return eval_as_f64(y);
 642         }
 643         if (predict_false(top - 0x0010 >= 0x7ff0 - 0x0010)) {
 644                 /* x < 0x1p-1022 or inf or nan.  */
 645                 if (ix * 2 == 0)
 646                         return __math_divzero(1);
 647                 if (ix == asu64(INFINITY)) /* log(inf) == inf.  */
 648                         return x;
 649                 if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0)
 650                         return __math_invalid(x);
 651                 /* x is subnormal, normalize it.  */
 652                 ix = asu64(x * 0x1p52);
 653                 ix -= 52UL << 52;
 654         }
 655
 656         /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
 657            The range is split into N subintervals.
 658            The ith subinterval contains z and c is near its center.  */
 659         tmp = ix - OFF;
 660         i = (s32)((tmp >> (52 - LOG_TABLE_BITS)) % N);
 661         k = (s32)((s64)tmp >> 52); /* arithmetic shift */
 662         iz = ix - (tmp & 0xfffUL << 52);
 663         invc = T[i].invc;
 664         logc = T[i].logc;
 665         z = asf64(iz);
 666
 667         /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
 668         /* r ~= z/c - 1, |r| < 1/(2*N).  */
 669 #if __FP_FAST_FMA
 670         /* rounding error: 0x1p-55/N.  */
 671         r = __builtin_fma(z, invc, -1.0);
 672 #else
 673         /* rounding error: 0x1p-55/N + 0x1p-66.  */
 674         r = (z - T2[i].chi - T2[i].clo) * invc;
 675 #endif
 676         kd = (f64)k;
 677
 678         /* hi + lo = r + log(c) + k*Ln2.  */
 679         w = kd * Ln2hi + logc;
 680         hi = w + r;
 681         lo = w - hi + r + kd * Ln2lo;
 682
 683         /* log(x) = lo + (log1p(r) - r) + hi.  */
 684         r2 = r * r; /* rounding error: 0x1p-54/N^2.  */
 685         /* Worst case error if |y| > 0x1p-5:
 686            0.5 + 4.13/N + abs-poly-error*2^57 ULP (+ 0.002 ULP without fma)
 687            Worst case error if |y| > 0x1p-4:
 688            0.5 + 2.06/N + abs-poly-error*2^56 ULP (+ 0.001 ULP without fma).  */
 689         y = lo + r2 * A[0] +
 690             r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi;
 691         return eval_as_f64(y);
 692 }
 693 #undef LOG_TABLE_BITS
 694 #undef LOG_POLY_ORDER
 695 #undef LOG_POLY1_ORDER
 696 #undef T
 697 #undef T2
 698 #undef B
 699 #undef A
 700 #undef Ln2hi
 701 #undef Ln2lo
 702 #undef N
 703 #undef OFF
 704 #undef LO
 705 #undef HI
 706 static f64 scalbn(f64 x, s32 n)
 707 {
 708         union {f64 f; u64 i;} u;
 709         f64 y = x;
 710
 711         if (n > 1023) {
 712                 y *= 0x1p1023;
 713                 n -= 1023;
 714                 if (n > 1023) {
 715                         y *= 0x1p1023;
 716                         n -= 1023;
 717                         if (n > 1023)
 718                                 n = 1023;
 719                 }
 720         } else if (n < -1022) {
 721                 /* make sure final n < -53 to avoid double
 722                    rounding in the subnormal range */
 723                 y *= 0x1p-1022 * 0x1p53;
 724                 n += 1022 - 53;
 725                 if (n < -1022) {
 726                         y *= 0x1p-1022 * 0x1p53;
 727                         n += 1022 - 53;
 728                         if (n < -1022)
 729                                 n = -1022;
 730                 }
 731         }
 732         u.i = (u64)(0x3ff+n)<<52;
 733         x = y * u.f;
 734         return x;
 735 }
 736 /* origin: FreeBSD /usr/src/lib/msun/src/k_rem_pio2.c */
 737 /*
 738  * ====================================================
 739  * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 740  *
 741  * Developed at SunSoft, a Sun Microsystems, Inc. business.
 742  * Permission to use, copy, modify, and distribute this
 743  * software is freely granted, provided that this notice
 744  * is preserved.
 745  * ====================================================
 746  */
 747 /*
 748  * __rem_pio2_large(x,y,e0,nx,prec)
 749  * double x[],y[]; int e0,nx,prec;
 750  *
 751  * __rem_pio2_large return the last three digits of N with
 752  *              y = x - N*pi/2
 753  * so that |y| < pi/2.
 754  *
 755  * The method is to compute the integer (mod 8) and fraction parts of
 756  * (2/pi)*x without doing the full multiplication. In general we
 757  * skip the part of the product that are known to be a huge integer (
 758  * more accurately, = 0 mod 8 ). Thus the number of operations are
 759  * independent of the exponent of the input.
 760  *
 761  * (2/pi) is represented by an array of 24-bit integers in ipio2[].
 762  *
 763  * Input parameters:
 764  *      x[]     The input value (must be positive) is broken into nx
 765  *              pieces of 24-bit integers in double precision format.
 766  *              x[i] will be the i-th 24 bit of x. The scaled exponent
 767  *              of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
 768  *              match x's up to 24 bits.
 769  *
 770  *              Example of breaking a double positive z into x[0]+x[1]+x[2]:
 771  *                      e0 = ilogb(z)-23
 772  *                      z  = scalbn(z,-e0)
 773  *              for i = 0,1,2
 774  *                      x[i] = floor(z)
 775  *                      z    = (z-x[i])*2**24
 776  *
 777  *
 778  *      y[]     ouput result in an array of double precision numbers.
 779  *              The dimension of y[] is:
 780  *                      24-bit  precision       1
 781  *                      53-bit  precision       2
 782  *                      64-bit  precision       2
 783  *                      113-bit precision       3
 784  *              The actual value is the sum of them. Thus for 113-bit
 785  *              precison, one may have to do something like:
 786  *
 787  *              long double t,w,r_head, r_tail;
 788  *              t = (long double)y[2] + (long double)y[1];
 789  *              w = (long double)y[0];
 790  *              r_head = t+w;
 791  *              r_tail = w - (r_head - t);
 792  *
 793  *      e0      The exponent of x[0]. Must be <= 16360 or you need to
 794  *              expand the ipio2 table.
 795  *
 796  *      nx      dimension of x[]
 797  *
 798  *      prec    an integer indicating the precision:
 799  *                      0       24  bits (single)
 800  *                      1       53  bits (double)
 801  *                      2       64  bits (extended)
 802  *                      3       113 bits (quad)
 803  *
 804  * External function:
 805  *      double scalbn(), floor();
 806  *
 807  *
 808  * Here is the description of some local variables:
 809  *
 810  *      jk      jk+1 is the initial number of terms of ipio2[] needed
 811  *              in the computation. The minimum and recommended value
 812  *              for jk is 3,4,4,6 for single, double, extended, and quad.
 813  *              jk+1 must be 2 larger than you might expect so that our
 814  *              recomputation test works. (Up to 24 bits in the integer
 815  *              part (the 24 bits of it that we compute) and 23 bits in
 816  *              the fraction part may be lost to cancelation before we
 817  *              recompute.)
 818  *
 819  *      jz      local integer variable indicating the number of
 820  *              terms of ipio2[] used.
 821  *
 822  *      jx      nx - 1
 823  *
 824  *      jv      index for pointing to the suitable ipio2[] for the
 825  *              computation. In general, we want
 826  *                      ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
 827  *              is an integer. Thus
 828  *                      e0-3-24*jv >= 0 or (e0-3)/24 >= jv
 829  *              Hence jv = max(0,(e0-3)/24).
 830  *
 831  *      jp      jp+1 is the number of terms in PIo2[] needed, jp = jk.
 832  *
 833  *      q[]     double array with integral value, representing the
 834  *              24-bits chunk of the product of x and 2/pi.
 835  *
 836  *      q0      the corresponding exponent of q[0]. Note that the
 837  *              exponent for q[i] would be q0-24*i.
 838  *
 839  *      PIo2[]  double precision array, obtained by cutting pi/2
 840  *              into 24 bits chunks.
 841  *
 842  *      f[]     ipio2[] in floating point
 843  *
 844  *      iq[]    integer array by breaking up q[] in 24-bits chunk.
 845  *
 846  *      fq[]    final product of x*(2/pi) in fq[0],..,fq[jk]
 847  *
 848  *      ih      integer. If >0 it indicates q[] is >= 0.5, hence
 849  *              it also indicates the *sign* of the result.
 850  *
 851  */
 852 /*
 853  * Constants:
 854  * The hexadecimal values are the intended ones for the following
 855  * constants. The decimal values may be used, provided that the
 856  * compiler will convert from decimal to binary accurately enough
 857  * to produce the hexadecimal values shown.
 858  */
 859 static s32 init_jk[] = {3,4,4,6}; /* initial value for jk */
 860 /*
 861  * Table of constants for 2/pi, 396 Hex digits (476 decimal) of 2/pi
 862  *
 863  *              integer array, contains the (24*i)-th to (24*i+23)-th
 864  *              bit of 2/pi after binary point. The corresponding
 865  *              floating value is
 866  *
 867  *                      ipio2[i] * 2^(-24(i+1)).
 868  *
 869  * NB: This table must have at least (e0-3)/24 + jk terms.
 870  *     For quad precision (e0 <= 16360, jk = 6), this is 686.
 871  */
 872 static s32 ipio2[] = {
 873 0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
 874 0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
 875 0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
 876 0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
 877 0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
 878 0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
 879 0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
 880 0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
 881 0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
 882 0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
 883 0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
 884
 885 #if LDBL_MAX_EXP > 1024
 886 0x47C419, 0xC367CD, 0xDCE809, 0x2A8359, 0xC4768B, 0x961CA6,
 887 0xDDAF44, 0xD15719, 0x053EA5, 0xFF0705, 0x3F7E33, 0xE832C2,
 888 0xDE4F98, 0x327DBB, 0xC33D26, 0xEF6B1E, 0x5EF89F, 0x3A1F35,
 889 0xCAF27F, 0x1D87F1, 0x21907C, 0x7C246A, 0xFA6ED5, 0x772D30,
 890 0x433B15, 0xC614B5, 0x9D19C3, 0xC2C4AD, 0x414D2C, 0x5D000C,
 891 0x467D86, 0x2D71E3, 0x9AC69B, 0x006233, 0x7CD2B4, 0x97A7B4,
 892 0xD55537, 0xF63ED7, 0x1810A3, 0xFC764D, 0x2A9D64, 0xABD770,
 893 0xF87C63, 0x57B07A, 0xE71517, 0x5649C0, 0xD9D63B, 0x3884A7,
 894 0xCB2324, 0x778AD6, 0x23545A, 0xB91F00, 0x1B0AF1, 0xDFCE19,
 895 0xFF319F, 0x6A1E66, 0x615799, 0x47FBAC, 0xD87F7E, 0xB76522,
 896 0x89E832, 0x60BFE6, 0xCDC4EF, 0x09366C, 0xD43F5D, 0xD7DE16,
 897 0xDE3B58, 0x929BDE, 0x2822D2, 0xE88628, 0x4D58E2, 0x32CAC6,
 898 0x16E308, 0xCB7DE0, 0x50C017, 0xA71DF3, 0x5BE018, 0x34132E,
 899 0x621283, 0x014883, 0x5B8EF5, 0x7FB0AD, 0xF2E91E, 0x434A48,
 900 0xD36710, 0xD8DDAA, 0x425FAE, 0xCE616A, 0xA4280A, 0xB499D3,
 901 0xF2A606, 0x7F775C, 0x83C2A3, 0x883C61, 0x78738A, 0x5A8CAF,
 902 0xBDD76F, 0x63A62D, 0xCBBFF4, 0xEF818D, 0x67C126, 0x45CA55,
 903 0x36D9CA, 0xD2A828, 0x8D61C2, 0x77C912, 0x142604, 0x9B4612,
 904 0xC459C4, 0x44C5C8, 0x91B24D, 0xF31700, 0xAD43D4, 0xE54929,
 905 0x10D5FD, 0xFCBE00, 0xCC941E, 0xEECE70, 0xF53E13, 0x80F1EC,
 906 0xC3E7B3, 0x28F8C7, 0x940593, 0x3E71C1, 0xB3092E, 0xF3450B,
 907 0x9C1288, 0x7B20AB, 0x9FB52E, 0xC29247, 0x2F327B, 0x6D550C,
 908 0x90A772, 0x1FE76B, 0x96CB31, 0x4A1679, 0xE27941, 0x89DFF4,
 909 0x9794E8, 0x84E6E2, 0x973199, 0x6BED88, 0x365F5F, 0x0EFDBB,
 910 0xB49A48, 0x6CA467, 0x427271, 0x325D8D, 0xB8159F, 0x09E5BC,
 911 0x25318D, 0x3974F7, 0x1C0530, 0x010C0D, 0x68084B, 0x58EE2C,
 912 0x90AA47, 0x02E774, 0x24D6BD, 0xA67DF7, 0x72486E, 0xEF169F,
 913 0xA6948E, 0xF691B4, 0x5153D1, 0xF20ACF, 0x339820, 0x7E4BF5,
 914 0x6863B2, 0x5F3EDD, 0x035D40, 0x7F8985, 0x295255, 0xC06437,
 915 0x10D86D, 0x324832, 0x754C5B, 0xD4714E, 0x6E5445, 0xC1090B,
 916 0x69F52A, 0xD56614, 0x9D0727, 0x50045D, 0xDB3BB4, 0xC576EA,
 917 0x17F987, 0x7D6B49, 0xBA271D, 0x296996, 0xACCCC6, 0x5414AD,
 918 0x6AE290, 0x89D988, 0x50722C, 0xBEA404, 0x940777, 0x7030F3,
 919 0x27FC00, 0xA871EA, 0x49C266, 0x3DE064, 0x83DD97, 0x973FA3,
 920 0xFD9443, 0x8C860D, 0xDE4131, 0x9D3992, 0x8C70DD, 0xE7B717,
 921 0x3BDF08, 0x2B3715, 0xA0805C, 0x93805A, 0x921110, 0xD8E80F,
 922 0xAF806C, 0x4BFFDB, 0x0F9038, 0x761859, 0x15A562, 0xBBCB61,
 923 0xB989C7, 0xBD4010, 0x04F2D2, 0x277549, 0xF6B6EB, 0xBB22DB,
 924 0xAA140A, 0x2F2689, 0x768364, 0x333B09, 0x1A940E, 0xAA3A51,
 925 0xC2A31D, 0xAEEDAF, 0x12265C, 0x4DC26D, 0x9C7A2D, 0x9756C0,
 926 0x833F03, 0xF6F009, 0x8C402B, 0x99316D, 0x07B439, 0x15200C,
 927 0x5BC3D8, 0xC492F5, 0x4BADC6, 0xA5CA4E, 0xCD37A7, 0x36A9E6,
 928 0x9492AB, 0x6842DD, 0xDE6319, 0xEF8C76, 0x528B68, 0x37DBFC,
 929 0xABA1AE, 0x3115DF, 0xA1AE00, 0xDAFB0C, 0x664D64, 0xB705ED,
 930 0x306529, 0xBF5657, 0x3AFF47, 0xB9F96A, 0xF3BE75, 0xDF9328,
 931 0x3080AB, 0xF68C66, 0x15CB04, 0x0622FA, 0x1DE4D9, 0xA4B33D,
 932 0x8F1B57, 0x09CD36, 0xE9424E, 0xA4BE13, 0xB52333, 0x1AAAF0,
 933 0xA8654F, 0xA5C1D2, 0x0F3F0B, 0xCD785B, 0x76F923, 0x048B7B,
 934 0x721789, 0x53A6C6, 0xE26E6F, 0x00EBEF, 0x584A9B, 0xB7DAC4,
 935 0xBA66AA, 0xCFCF76, 0x1D02D1, 0x2DF1B1, 0xC1998C, 0x77ADC3,
 936 0xDA4886, 0xA05DF7, 0xF480C6, 0x2FF0AC, 0x9AECDD, 0xBC5C3F,
 937 0x6DDED0, 0x1FC790, 0xB6DB2A, 0x3A25A3, 0x9AAF00, 0x9353AD,
 938 0x0457B6, 0xB42D29, 0x7E804B, 0xA707DA, 0x0EAA76, 0xA1597B,
 939 0x2A1216, 0x2DB7DC, 0xFDE5FA, 0xFEDB89, 0xFDBE89, 0x6C76E4,
 940 0xFCA906, 0x70803E, 0x156E85, 0xFF87FD, 0x073E28, 0x336761,
 941 0x86182A, 0xEABD4D, 0xAFE7B3, 0x6E6D8F, 0x396795, 0x5BBF31,
 942 0x48D784, 0x16DF30, 0x432DC7, 0x356125, 0xCE70C9, 0xB8CB30,
 943 0xFD6CBF, 0xA200A4, 0xE46C05, 0xA0DD5A, 0x476F21, 0xD21262,
 944 0x845CB9, 0x496170, 0xE0566B, 0x015299, 0x375550, 0xB7D51E,
 945 0xC4F133, 0x5F6E13, 0xE4305D, 0xA92E85, 0xC3B21D, 0x3632A1,
 946 0xA4B708, 0xD4B1EA, 0x21F716, 0xE4698F, 0x77FF27, 0x80030C,
 947 0x2D408D, 0xA0CD4F, 0x99A520, 0xD3A2B3, 0x0A5D2F, 0x42F9B4,
 948 0xCBDA11, 0xD0BE7D, 0xC1DB9B, 0xBD17AB, 0x81A2CA, 0x5C6A08,
 949 0x17552E, 0x550027, 0xF0147F, 0x8607E1, 0x640B14, 0x8D4196,
 950 0xDEBE87, 0x2AFDDA, 0xB6256B, 0x34897B, 0xFEF305, 0x9EBFB9,
 951 0x4F6A68, 0xA82A4A, 0x5AC44F, 0xBCF82D, 0x985AD7, 0x95C7F4,
 952 0x8D4D0D, 0xA63A20, 0x5F57A4, 0xB13F14, 0x953880, 0x0120CC,
 953 0x86DD71, 0xB6DEC9, 0xF560BF, 0x11654D, 0x6B0701, 0xACB08C,
 954 0xD0C0B2, 0x485551, 0x0EFB1E, 0xC37295, 0x3B06A3, 0x3540C0,
 955 0x7BDC06, 0xCC45E0, 0xFA294E, 0xC8CAD6, 0x41F3E8, 0xDE647C,
 956 0xD8649B, 0x31BED9, 0xC397A4, 0xD45877, 0xC5E369, 0x13DAF0,
 957 0x3C3ABA, 0x461846, 0x5F7555, 0xF5BDD2, 0xC6926E, 0x5D2EAC,
 958 0xED440E, 0x423E1C, 0x87C461, 0xE9FD29, 0xF3D6E7, 0xCA7C22,
 959 0x35916F, 0xC5E008, 0x8DD7FF, 0xE26A6E, 0xC6FDB0, 0xC10893,
 960 0x745D7C, 0xB2AD6B, 0x9D6ECD, 0x7B723E, 0x6A11C6, 0xA9CFF7,
 961 0xDF7329, 0xBAC9B5, 0x5100B7, 0x0DB2E2, 0x24BA74, 0x607DE5,
 962 0x8AD874, 0x2C150D, 0x0C1881, 0x94667E, 0x162901, 0x767A9F,
 963 0xBEFDFD, 0xEF4556, 0x367ED9, 0x13D9EC, 0xB9BA8B, 0xFC97C4,
 964 0x27A831, 0xC36EF1, 0x36C594, 0x56A8D8, 0xB5A8B4, 0x0ECCCF,
 965 0x2D8912, 0x34576F, 0x89562C, 0xE3CE99, 0xB920D6, 0xAA5E6B,
 966 0x9C2A3E, 0xCC5F11, 0x4A0BFD, 0xFBF4E1, 0x6D3B8E, 0x2C86E2,
 967 0x84D4E9, 0xA9B4FC, 0xD1EEEF, 0xC9352E, 0x61392F, 0x442138,
 968 0xC8D91B, 0x0AFC81, 0x6A4AFB, 0xD81C2F, 0x84B453, 0x8C994E,
 969 0xCC2254, 0xDC552A, 0xD6C6C0, 0x96190B, 0xB8701A, 0x649569,
 970 0x605A26, 0xEE523F, 0x0F117F, 0x11B5F4, 0xF5CBFC, 0x2DBC34,
 971 0xEEBC34, 0xCC5DE8, 0x605EDD, 0x9B8E67, 0xEF3392, 0xB817C9,
 972 0x9B5861, 0xBC57E1, 0xC68351, 0x103ED8, 0x4871DD, 0xDD1C2D,
 973 0xA118AF, 0x462C21, 0xD7F359, 0x987AD9, 0xC0549E, 0xFA864F,
 974 0xFC0656, 0xAE79E5, 0x362289, 0x22AD38, 0xDC9367, 0xAAE855,
 975 0x382682, 0x9BE7CA, 0xA40D51, 0xB13399, 0x0ED7A9, 0x480569,
 976 0xF0B265, 0xA7887F, 0x974C88, 0x36D1F9, 0xB39221, 0x4A827B,
 977 0x21CF98, 0xDC9F40, 0x5547DC, 0x3A74E1, 0x42EB67, 0xDF9DFE,
 978 0x5FD45E, 0xA4677B, 0x7AACBA, 0xA2F655, 0x23882B, 0x55BA41,
 979 0x086E59, 0x862A21, 0x834739, 0xE6E389, 0xD49EE5, 0x40FB49,
 980 0xE956FF, 0xCA0F1C, 0x8A59C5, 0x2BFA94, 0xC5C1D3, 0xCFC50F,
 981 0xAE5ADB, 0x86C547, 0x624385, 0x3B8621, 0x94792C, 0x876110,
 982 0x7B4C2A, 0x1A2C80, 0x12BF43, 0x902688, 0x893C78, 0xE4C4A8,
 983 0x7BDBE5, 0xC23AC4, 0xEAF426, 0x8A67F7, 0xBF920D, 0x2BA365,
 984 0xB1933D, 0x0B7CBD, 0xDC51A4, 0x63DD27, 0xDDE169, 0x19949A,
 985 0x9529A8, 0x28CE68, 0xB4ED09, 0x209F44, 0xCA984E, 0x638270,
 986 0x237C7E, 0x32B90F, 0x8EF5A7, 0xE75614, 0x08F121, 0x2A9DB5,
 987 0x4D7E6F, 0x5119A5, 0xABF9B5, 0xD6DF82, 0x61DD96, 0x023616,
 988 0x9F3AC4, 0xA1A283, 0x6DED72, 0x7A8D39, 0xA9B882, 0x5C326B,
 989 0x5B2746, 0xED3400, 0x7700D2, 0x55F4FC, 0x4D5901, 0x8071E0,
 990 #endif
 991 };
 992
 993 static f64 PIo2[] = {
 994   1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
 995   7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
 996   5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
 997   3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
 998   1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
 999   1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
1000   2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
1001   2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
1002 };
1003 static s32 __rem_pio2_large(f64 *x, f64 *y, s32 e0, s32 nx, s32 prec)
1004 {
1005         s32 jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
1006         f64 z,fw,f[20],fq[20],q[20];
1007
1008         /* initialize jk*/
1009         jk = init_jk[prec];
1010         jp = jk;
1011
1012         /* determine jx,jv,q0, note that 3>q0 */
1013         jx = nx-1;
1014         jv = (e0-3)/24;  if(jv<0) jv=0;
1015         q0 = e0-24*(jv+1);
1016
1017         /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
1018         j = jv-jx; m = jx+jk;
1019         for (i=0; i<=m; i++,j++)
1020                 f[i] = j<0 ? 0.0 : (f64)ipio2[j];
1021
1022         /* compute q[0],q[1],...q[jk] */
1023         for (i=0; i<=jk; i++) {
1024                 for (j=0,fw=0.0; j<=jx; j++)
1025                         fw += x[j]*f[jx+i-j];
1026                 q[i] = fw;
1027         }
1028
1029         jz = jk;
1030 recompute:
1031         /* distill q[] into iq[] reversingly */
1032         for (i=0,j=jz,z=q[jz]; j>0; i++,j--) {
1033                 fw    = (f64)(s32)(0x1p-24*z);
1034                 iq[i] = (s32)(z - 0x1p24*fw);
1035                 z     = q[j-1]+fw;
1036         }
1037
1038         /* compute n */
1039         z  = scalbn(z,q0);       /* actual value of z */
1040         z -= 8.0*floor(z*0.125); /* trim off integer >= 8 */
1041         n  = (s32)z;
1042         z -= (f64)n;
1043         ih = 0;
1044         if (q0 > 0) {  /* need iq[jz-1] to determine n */
1045                 i  = iq[jz-1]>>(24-q0); n += i;
1046                 iq[jz-1] -= i<<(24-q0);
1047                 ih = iq[jz-1]>>(23-q0);
1048         }
1049         else if (q0 == 0) ih = iq[jz-1]>>23;
1050         else if (z >= 0.5) ih = 2;
1051
1052         if (ih > 0) {  /* q > 0.5 */
1053                 n += 1; carry = 0;
1054                 for (i=0; i<jz; i++) {  /* compute 1-q */
1055                         j = iq[i];
1056                         if (carry == 0) {
1057                                 if (j != 0) {
1058                                         carry = 1;
1059                                         iq[i] = 0x1000000 - j;
1060                                 }
1061                         } else
1062                                 iq[i] = 0xffffff - j;
1063                 }
1064                 if (q0 > 0) {  /* rare case: chance is 1 in 12 */
1065                         switch(q0) {
1066                         case 1:
1067                                 iq[jz-1] &= 0x7fffff; break;
1068                         case 2:
1069                                 iq[jz-1] &= 0x3fffff; break;
1070                         }
1071                 }
1072                 if (ih == 2) {
1073                         z = 1.0 - z;
1074                         if (carry != 0)
1075                                 z -= scalbn(1.0,q0);
1076                 }
1077         }
1078
1079         /* check if recomputation is needed */
1080         if (z == 0.0) {
1081                 j = 0;
1082                 for (i=jz-1; i>=jk; i--) j |= iq[i];
1083                 if (j == 0) {  /* need recomputation */
1084                         for (k=1; iq[jk-k]==0; k++);  /* k = no. of terms needed */
1085
1086                         for (i=jz+1; i<=jz+k; i++) {  /* add q[jz+1] to q[jz+k] */
1087                                 f[jx+i] = (f64)ipio2[jv+i];
1088                                 for (j=0,fw=0.0; j<=jx; j++)
1089                                         fw += x[j]*f[jx+i-j];
1090                                 q[i] = fw;
1091                         }
1092                         jz += k;
1093                         goto recompute;
1094                 }
1095         }
1096
1097         /* chop off zero terms */
1098         if (z == 0.0) {
1099                 jz -= 1;
1100                 q0 -= 24;
1101                 while (iq[jz] == 0) {
1102                         jz--;
1103                         q0 -= 24;
1104                 }
1105         } else { /* break z into 24-bit if necessary */
1106                 z = scalbn(z,-q0);
1107                 if (z >= 0x1p24) {
1108                         fw = (f64)(s32)(0x1p-24*z);
1109                         iq[jz] = (s32)(z - 0x1p24*fw);
1110                         jz += 1;
1111                         q0 += 24;
1112                         iq[jz] = (s32)fw;
1113                 } else
1114                         iq[jz] = (s32)z;
1115         }
1116
1117         /* convert integer "bit" chunk to floating-point value */
1118         fw = scalbn(1.0,q0);
1119         for (i=jz; i>=0; i--) {
1120                 q[i] = fw*(f64)iq[i];
1121                 fw *= 0x1p-24;
1122         }
1123
1124         /* compute PIo2[0,...,jp]*q[jz,...,0] */
1125         for(i=jz; i>=0; i--) {
1126                 for (fw=0.0,k=0; k<=jp && k<=jz-i; k++)
1127                         fw += PIo2[k]*q[i+k];
1128                 fq[jz-i] = fw;
1129         }
1130
1131         /* compress fq[] into y[] */
1132         switch(prec) {
1133         case 0:
1134                 fw = 0.0;
1135                 for (i=jz; i>=0; i--)
1136                         fw += fq[i];
1137                 y[0] = ih==0 ? fw : -fw;
1138                 break;
1139         case 1:
1140         case 2:
1141                 fw = 0.0;
1142                 for (i=jz; i>=0; i--)
1143                         fw += fq[i];
1144                 // TODO: drop excess precision here once double_t is used
1145                 // XXX: our double_t is double
1146                 //fw = (f64)fw;
1147                 y[0] = ih==0 ? fw : -fw;
1148                 fw = fq[0]-fw;
1149                 for (i=1; i<=jz; i++)
1150                         fw += fq[i];
1151                 y[1] = ih==0 ? fw : -fw;
1152                 break;
1153         case 3:  /* painful */
1154                 for (i=jz; i>0; i--) {
1155                         fw      = fq[i-1]+fq[i];
1156                         fq[i]  += fq[i-1]-fw;
1157                         fq[i-1] = fw;
1158                 }
1159                 for (i=jz; i>1; i--) {
1160                         fw      = fq[i-1]+fq[i];
1161                         fq[i]  += fq[i-1]-fw;
1162                         fq[i-1] = fw;
1163                 }
1164                 for (fw=0.0,i=jz; i>=2; i--)
1165                         fw += fq[i];
1166                 if (ih==0) {
1167                         y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
1168                 } else {
1169                         y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
1170                 }
1171         }
1172         return n&7;
1173 }
1174 /* origin: FreeBSD /usr/src/lib/msun/src/k_cos.c */
1175 /*
1176  * ====================================================
1177  * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1178  *
1179  * Developed at SunSoft, a Sun Microsystems, Inc. business.
1180  * Permission to use, copy, modify, and distribute this
1181  * software is freely granted, provided that this notice
1182  * is preserved.
1183  * ====================================================
1184  */
1185 /*
1186  * __cos( x,  y )
1187  * kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
1188  * Input x is assumed to be bounded by ~pi/4 in magnitude.
1189  * Input y is the tail of x.
1190  *
1191  * Algorithm
1192  *      1. Since cos(-x) = cos(x), we need only to consider positive x.
1193  *      2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0.
1194  *      3. cos(x) is approximated by a polynomial of degree 14 on
1195  *         [0,pi/4]
1196  *                                       4            14
1197  *              cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
1198  *         where the remez error is
1199  *
1200  *      |              2     4     6     8     10    12     14 |     -58
1201  *      |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  )| <= 2
1202  *      |                                                      |
1203  *
1204  *                     4     6     8     10    12     14
1205  *      4. let r = C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  , then
1206  *             cos(x) ~ 1 - x*x/2 + r
1207  *         since cos(x+y) ~ cos(x) - sin(x)*y
1208  *                        ~ cos(x) - x*y,
1209  *         a correction term is necessary in cos(x) and hence
1210  *              cos(x+y) = 1 - (x*x/2 - (r - x*y))
1211  *         For better accuracy, rearrange to
1212  *              cos(x+y) ~ w + (tmp + (r-x*y))
1213  *         where w = 1 - x*x/2 and tmp is a tiny correction term
1214  *         (1 - x*x/2 == w + tmp exactly in infinite precision).
1215  *         The exactness of w + tmp in infinite precision depends on w
1216  *         and tmp having the same precision as x.  If they have extra
1217  *         precision due to compiler bugs, then the extra precision is
1218  *         only good provided it is retained in all terms of the final
1219  *         expression for cos().  Retention happens in all cases tested
1220  *         under FreeBSD, so don't pessimize things by forcibly clipping
1221  *         any extra precision in w.
1222  */
1223 #define C1   4.16666666666666019037e-02 /* 0x3FA55555, 0x5555554C */
1224 #define C2  -1.38888888888741095749e-03 /* 0xBF56C16C, 0x16C15177 */
1225 #define C3   2.48015872894767294178e-05 /* 0x3EFA01A0, 0x19CB1590 */
1226 #define C4  -2.75573143513906633035e-07 /* 0xBE927E4F, 0x809C52AD */
1227 #define C5   2.08757232129817482790e-09 /* 0x3E21EE9E, 0xBDB4B1C4 */
1228 #define C6  -1.13596475577881948265e-11 /* 0xBDA8FAE9, 0xBE8838D4 */
1229 static f64 __cos(f64 x, f64 y)
1230 {
1231         f64 hz,z,r,w;
1232
1233         z  = x*x;
1234         w  = z*z;
1235         r  = z*(C1+z*(C2+z*C3)) + w*w*(C4+z*(C5+z*C6));
1236         hz = 0.5*z;
1237         w  = 1.0-hz;
1238         return w + (((1.0-w)-hz) + (z*r-x*y));
1239 }
1240 #undef C1
1241 #undef C2
1242 #undef C3
1243 #undef C4
1244 #undef C5
1245 #undef C6
1246 /* origin: FreeBSD /usr/src/lib/msun/src/e_rem_pio2.c */
1247 /*
1248  * ====================================================
1249  * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1250  *
1251  * Developed at SunSoft, a Sun Microsystems, Inc. business.
1252  * Permission to use, copy, modify, and distribute this
1253  * software is freely granted, provided that this notice
1254  * is preserved.
1255  * ====================================================
1256  *
1257  * Optimized by Bruce D. Evans.
1258  */
1259 /* __rem_pio2(x,y)
1260  *
1261  * return the remainder of x rem pi/2 in y[0]+y[1]
1262  * use __rem_pio2_large() for large x
1263  */
1264 /*
1265  * invpio2:  53 bits of 2/pi
1266  * pio2_1:   first  33 bit of pi/2
1267  * pio2_1t:  pi/2 - pio2_1
1268  * pio2_2:   second 33 bit of pi/2
1269  * pio2_2t:  pi/2 - (pio2_1+pio2_2)
1270  * pio2_3:   third  33 bit of pi/2
1271  * pio2_3t:  pi/2 - (pio2_1+pio2_2+pio2_3)
1272  */
1273 #define toint   1.5/EPS
1274 #define pio4    0x1.921fb54442d18p-1
1275 #define invpio2 6.36619772367581382433e-01 /* 0x3FE45F30, 0x6DC9C883 */
1276 #define pio2_1  1.57079632673412561417e+00 /* 0x3FF921FB, 0x54400000 */
1277 #define pio2_1t 6.07710050650619224932e-11 /* 0x3DD0B461, 0x1A626331 */
1278 #define pio2_2  6.07710050630396597660e-11 /* 0x3DD0B461, 0x1A600000 */
1279 #define pio2_2t 2.02226624879595063154e-21 /* 0x3BA3198A, 0x2E037073 */
1280 #define pio2_3  2.02226624871116645580e-21 /* 0x3BA3198A, 0x2E000000 */
1281 #define pio2_3t 8.47842766036889956997e-32 /* 0x397B839A, 0x252049C1 */
1282 /* caller must handle the case when reduction is not needed: |x| ~<= pi/4 */
1283 static s32 __rem_pio2(f64 x, f64 *y)
1284 {
1285         union {f64 f; u64 i;} u = {x};
1286         f64 z,w,t,r,fn;
1287         f64 tx[3],ty[2];
1288         u32 ix;
1289         s32 sign, n, ex, ey, i;
1290
1291         sign = u.i>>63;
1292         ix = u.i>>32 & 0x7fffffff;
1293         if (ix <= 0x400f6a7a) {  /* |x| ~<= 5pi/4 */
1294                 if ((ix & 0xfffff) == 0x921fb)  /* |x| ~= pi/2 or 2pi/2 */
1295                         goto medium;  /* cancellation -- use medium case */
1296                 if (ix <= 0x4002d97c) {  /* |x| ~<= 3pi/4 */
1297                         if (!sign) {
1298                                 z = x - pio2_1;  /* one round good to 85 bits */
1299                                 y[0] = z - pio2_1t;
1300                                 y[1] = (z-y[0]) - pio2_1t;
1301                                 return 1;
1302                         } else {
1303                                 z = x + pio2_1;
1304                                 y[0] = z + pio2_1t;
1305                                 y[1] = (z-y[0]) + pio2_1t;
1306                                 return -1;
1307                         }
1308                 } else {
1309                         if (!sign) {
1310                                 z = x - 2*pio2_1;
1311                                 y[0] = z - 2*pio2_1t;
1312                                 y[1] = (z-y[0]) - 2*pio2_1t;
1313                                 return 2;
1314                         } else {
1315                                 z = x + 2*pio2_1;
1316                                 y[0] = z + 2*pio2_1t;
1317                                 y[1] = (z-y[0]) + 2*pio2_1t;
1318                                 return -2;
1319                         }
1320                 }
1321         }
1322         if (ix <= 0x401c463b) {  /* |x| ~<= 9pi/4 */
1323                 if (ix <= 0x4015fdbc) {  /* |x| ~<= 7pi/4 */
1324                         if (ix == 0x4012d97c)  /* |x| ~= 3pi/2 */
1325                                 goto medium;
1326                         if (!sign) {
1327                                 z = x - 3*pio2_1;
1328                                 y[0] = z - 3*pio2_1t;
1329                                 y[1] = (z-y[0]) - 3*pio2_1t;
1330                                 return 3;
1331                         } else {
1332                                 z = x + 3*pio2_1;
1333                                 y[0] = z + 3*pio2_1t;
1334                                 y[1] = (z-y[0]) + 3*pio2_1t;
1335                                 return -3;
1336                         }
1337                 } else {
1338                         if (ix == 0x401921fb)  /* |x| ~= 4pi/2 */
1339                                 goto medium;
1340                         if (!sign) {
1341                                 z = x - 4*pio2_1;
1342                                 y[0] = z - 4*pio2_1t;
1343                                 y[1] = (z-y[0]) - 4*pio2_1t;
1344                                 return 4;
1345                         } else {
1346                                 z = x + 4*pio2_1;
1347                                 y[0] = z + 4*pio2_1t;
1348                                 y[1] = (z-y[0]) + 4*pio2_1t;
1349                                 return -4;
1350                         }
1351                 }
1352         }
1353         if (ix < 0x413921fb) {  /* |x| ~< 2^20*(pi/2), medium size */
1354 medium:
1355                 /* rint(x/(pi/2)) */
1356                 fn = (f64)x*invpio2 + toint - toint;
1357                 n = (s32)fn;
1358                 r = x - fn*pio2_1;
1359                 w = fn*pio2_1t;  /* 1st round, good to 85 bits */
1360                 /* Matters with directed rounding. */
1361                 if (predict_false(r - w < -pio4)) {
1362                         n--;
1363                         fn--;
1364                         r = x - fn*pio2_1;
1365                         w = fn*pio2_1t;
1366                 } else if (predict_false(r - w > pio4)) {
1367                         n++;
1368                         fn++;
1369                         r = x - fn*pio2_1;
1370                         w = fn*pio2_1t;
1371                 }
1372                 y[0] = r - w;
1373                 u.f = y[0];
1374                 ey = u.i>>52 & 0x7ff;
1375                 ex = ix>>20;
1376                 if (ex - ey > 16) { /* 2nd round, good to 118 bits */
1377                         t = r;
1378                         w = fn*pio2_2;
1379                         r = t - w;
1380                         w = fn*pio2_2t - ((t-r)-w);
1381                         y[0] = r - w;
1382                         u.f = y[0];
1383                         ey = u.i>>52 & 0x7ff;
1384                         if (ex - ey > 49) {  /* 3rd round, good to 151 bits, covers all cases */
1385                                 t = r;
1386                                 w = fn*pio2_3;
1387                                 r = t - w;
1388                                 w = fn*pio2_3t - ((t-r)-w);
1389                                 y[0] = r - w;
1390                         }
1391                 }
1392                 y[1] = (r - y[0]) - w;
1393                 return n;
1394         }
1395         /*
1396          * all other (large) arguments
1397          */
1398         if (ix >= 0x7ff00000) {  /* x is inf or NaN */
1399                 y[0] = y[1] = x - x;
1400                 return 0;
1401         }
1402         /* set z = scalbn(|x|,-ilogb(x)+23) */
1403         u.f = x;
1404         u.i &= (u64)-1>>12;
1405         u.i |= (u64)(0x3ff + 23)<<52;
1406         z = u.f;
1407         for (i=0; i < 2; i++) {
1408                 tx[i] = (f64)(s32)z;
1409                 z     = (z-tx[i])*0x1p24;
1410         }
1411         tx[i] = z;
1412         /* skip zero terms, first term is non-zero */
1413         while (tx[i] == 0.0)
1414                 i--;
1415         n = __rem_pio2_large(tx,ty,(s32)(ix>>20)-(0x3ff+23),i+1,1);
1416         if (sign) {
1417                 y[0] = -ty[0];
1418                 y[1] = -ty[1];
1419                 return -n;
1420         }
1421         y[0] = ty[0];
1422         y[1] = ty[1];
1423         return n;
1424 }
1425 #undef toint
1426 #undef pio4
1427 #undef invpio2
1428 #undef pio2_1
1429 #undef pio2_1t
1430 #undef pio2_2
1431 #undef pio2_2t
1432 #undef pio2_3
1433 #undef pio2_3t
1434 /* origin: FreeBSD /usr/src/lib/msun/src/k_sin.c */
1435 /*
1436  * ====================================================
1437  * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1438  *
1439  * Developed at SunSoft, a Sun Microsystems, Inc. business.
1440  * Permission to use, copy, modify, and distribute this
1441  * software is freely granted, provided that this notice
1442  * is preserved.
1443  * ====================================================
1444  */
1445 /* __sin( x, y, iy)
1446  * kernel sin function on ~[-pi/4, pi/4] (except on -0), pi/4 ~ 0.7854
1447  * Input x is assumed to be bounded by ~pi/4 in magnitude.
1448  * Input y is the tail of x.
1449  * Input iy indicates whether y is 0. (if iy=0, y assume to be 0).
1450  *
1451  * Algorithm
1452  *      1. Since sin(-x) = -sin(x), we need only to consider positive x.
1453  *      2. Callers must return sin(-0) = -0 without calling here since our
1454  *         odd polynomial is not evaluated in a way that preserves -0.
1455  *         Callers may do the optimization sin(x) ~ x for tiny x.
1456  *      3. sin(x) is approximated by a polynomial of degree 13 on
1457  *         [0,pi/4]
1458  *                               3            13
1459  *              sin(x) ~ x + S1*x + ... + S6*x
1460  *         where
1461  *
1462  *      |sin(x)         2     4     6     8     10     12  |     -58
1463  *      |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x  +S6*x   )| <= 2
1464  *      |  x                                               |
1465  *
1466  *      4. sin(x+y) = sin(x) + sin'(x')*y
1467  *                  ~ sin(x) + (1-x*x/2)*y
1468  *         For better accuracy, let
1469  *                   3      2      2      2      2
1470  *              r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
1471  *         then                   3    2
1472  *              sin(x) = x + (S1*x + (x *(r-y/2)+y))
1473  */
1474 #define S1 -1.66666666666666324348e-01 /* 0xBFC55555, 0x55555549 */
1475 #define S2 8.33333333332248946124e-03 /* 0x3F811111, 0x1110F8A6 */
1476 #define S3 -1.98412698298579493134e-04 /* 0xBF2A01A0, 0x19C161D5 */
1477 #define S4 2.75573137070700676789e-06 /* 0x3EC71DE3, 0x57B1FE7D */
1478 #define S5 -2.50507602534068634195e-08 /* 0xBE5AE5E6, 0x8A2B9CEB */
1479 #define S6 1.58969099521155010221e-10 /* 0x3DE5D93A, 0x5ACFD57C */
1480 static f64 __sin(f64 x, f64 y, s32 iy)
1481 {
1482         f64 z,r,v,w;
1483
1484         z = x*x;
1485         w = z*z;
1486         r = S2 + z*(S3 + z*S4) + z*w*(S5 + z*S6);
1487         v = z*x;
1488         if (iy == 0)
1489                 return x + v*(S1 + z*r);
1490         else
1491                 return x - ((z*(0.5*y - v*r) - y) - v*S1);
1492 }
1493 #undef S1
1494 #undef S2
1495 #undef S3
1496 #undef S4
1497 #undef S5
1498 #undef S6
1499 /* origin: FreeBSD /usr/src/lib/msun/src/s_cos.c */
1500 /*
1501  * ====================================================
1502  * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1503  *
1504  * Developed at SunPro, a Sun Microsystems, Inc. business.
1505  * Permission to use, copy, modify, and distribute this
1506  * software is freely granted, provided that this notice
1507  * is preserved.
1508  * ====================================================
1509  */
1510 /* cos(x)
1511  * Return cosine function of x.
1512  *
1513  * kernel function:
1514  *      __sin           ... sine function on [-pi/4,pi/4]
1515  *      __cos           ... cosine function on [-pi/4,pi/4]
1516  *      __rem_pio2      ... argument reduction routine
1517  *
1518  * Method.
1519  *      Let S,C and T denote the sin, cos and tan respectively on
1520  *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
1521  *      in [-pi/4 , +pi/4], and let n = k mod 4.
1522  *      We have
1523  *
1524  *          n        sin(x)      cos(x)        tan(x)
1525  *     ----------------------------------------------------------
1526  *          0          S           C             T
1527  *          1          C          -S            -1/T
1528  *          2         -S          -C             T
1529  *          3         -C           S            -1/T
1530  *     ----------------------------------------------------------
1531  *
1532  * Special cases:
1533  *      Let trig be any of sin, cos, or tan.
1534  *      trig(+-INF)  is NaN, with signals;
1535  *      trig(NaN)    is that NaN;
1536  *
1537  * Accuracy:
1538  *      TRIG(x) returns trig(x) nearly rounded
1539  */
1540 static f64 cos(f64 x)
1541 {
1542         f64 y[2];
1543         u32 ix;
1544         u32 n;
1545
1546         ix = asu64(x) >> 32; /* get high word */
1547         ix &= 0x7fffffff;
1548
1549         /* |x| ~< pi/4 */
1550         if (ix <= 0x3fe921fb) {
1551                 if (ix < 0x3e46a09e) {  /* |x| < 2**-27 * sqrt(2) */
1552                         /* raise inexact if x!=0 */
1553                         fp_force_eval(x + 0x1p120f);
1554                         return 1.0;
1555                 }
1556                 return __cos(x, 0);
1557         }
1558
1559         /* cos(Inf or NaN) is NaN */
1560         if (ix >= 0x7ff00000)
1561                 return x-x;
1562
1563         /* argument reduction */
1564         n = __rem_pio2(x, y);
1565         switch (n&3) {
1566         case 0: return  __cos(y[0], y[1]);
1567         case 1: return -__sin(y[0], y[1], 1);
1568         case 2: return -__cos(y[0], y[1]);
1569         default:
1570                 return  __sin(y[0], y[1], 1);
1571         }
1572 }
1573 /* origin: FreeBSD /usr/src/lib/msun/src/s_sin.c */
1574 /*
1575  * ====================================================
1576  * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1577  *
1578  * Developed at SunPro, a Sun Microsystems, Inc. business.
1579  * Permission to use, copy, modify, and distribute this
1580  * software is freely granted, provided that this notice
1581  * is preserved.
1582  * ====================================================
1583  */
1584 /* sin(x)
1585  * Return sine function of x.
1586  *
1587  * kernel function:
1588  *      __sin            ... sine function on [-pi/4,pi/4]
1589  *      __cos            ... cose function on [-pi/4,pi/4]
1590  *      __rem_pio2       ... argument reduction routine
1591  *
1592  * Method.
1593  *      Let S,C and T denote the sin, cos and tan respectively on
1594  *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
1595  *      in [-pi/4 , +pi/4], and let n = k mod 4.
1596  *      We have
1597  *
1598  *          n        sin(x)      cos(x)        tan(x)
1599  *     ----------------------------------------------------------
1600  *          0          S           C             T
1601  *          1          C          -S            -1/T
1602  *          2         -S          -C             T
1603  *          3         -C           S            -1/T
1604  *     ----------------------------------------------------------
1605  *
1606  * Special cases:
1607  *      Let trig be any of sin, cos, or tan.
1608  *      trig(+-INF)  is NaN, with signals;
1609  *      trig(NaN)    is that NaN;
1610  *
1611  * Accuracy:
1612  *      TRIG(x) returns trig(x) nearly rounded
1613  */
1614 static f64 sin(f64 x)
1615 {
1616         f64 y[2];
1617         u32 ix;
1618         u32 n;
1619
1620         /* High word of x. */
1621         ix = asu64(x) >> 32; /* get high word */
1622         ix &= 0x7fffffff;
1623
1624         /* |x| ~< pi/4 */
1625         if (ix <= 0x3fe921fb) {
1626                 if (ix < 0x3e500000) {  /* |x| < 2**-26 */
1627                         /* raise inexact if x != 0 and underflow if subnormal*/
1628                         fp_force_eval(ix < 0x00100000 ? x/0x1p120f : x+0x1p120f);
1629                         return x;
1630                 }
1631                 return __sin(x, 0.0, 0);
1632         }
1633
1634         /* sin(Inf or NaN) is NaN */
1635         if (ix >= 0x7ff00000)
1636                 return x - x;
1637
1638         /* argument reduction needed */
1639         n = __rem_pio2(x, y);
1640         switch (n&3) {
1641         case 0: return  __sin(y[0], y[1], 1);
1642         case 1: return  __cos(y[0], y[1]);
1643         case 2: return -__sin(y[0], y[1], 1);
1644         default:
1645                 return -__cos(y[0], y[1]);
1646         }
1647 }
1648 static f64 fabs(f64 x)
1649 {
1650         union {f64 f; u64 i;} u;
1651
1652         u.f = x;
1653         u.i &= -1UL/2;
1654         return u.f;
1655 }
1656 /* origin: FreeBSD /usr/src/lib/msun/src/s_atan.c */
1657 /*
1658  * ====================================================
1659  * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
1660  *
1661  * Developed at SunPro, a Sun Microsystems, Inc. business.
1662  * Permission to use, copy, modify, and distribute this
1663  * software is freely granted, provided that this notice
1664  * is preserved.
1665  * ====================================================
1666  */
1667 /* atan(x)
1668  * Method
1669  *   1. Reduce x to positive by atan(x) = -atan(-x).
1670  *   2. According to the integer k=4t+0.25 chopped, t=x, the argument
1671  *      is further reduced to one of the following intervals and the
1672  *      arctangent of t is evaluated by the corresponding formula:
1673  *
1674  *      [0,7/16]      atan(x) = t-t^3*(a1+t^2*(a2+...(a10+t^2*a11)...)
1675  *      [7/16,11/16]  atan(x) = atan(1/2) + atan( (t-0.5)/(1+t/2) )
1676  *      [11/16.19/16] atan(x) = atan( 1 ) + atan( (t-1)/(1+t) )
1677  *      [19/16,39/16] atan(x) = atan(3/2) + atan( (t-1.5)/(1+1.5t) )
1678  *      [39/16,INF]   atan(x) = atan(INF) + atan( -1/t )
1679  *
1680  * Constants:
1681  * The hexadecimal values are the intended ones for the following
1682  * constants. The decimal values may be used, provided that the
1683  * compiler will convert from decimal to binary accurately enough
1684  * to produce the hexadecimal values shown.
1685  */
1686 static f64 atanhi[] = {
1687   4.63647609000806093515e-01, /* atan(0.5)hi 0x3FDDAC67, 0x0561BB4F */
1688   7.85398163397448278999e-01, /* atan(1.0)hi 0x3FE921FB, 0x54442D18 */
1689   9.82793723247329054082e-01, /* atan(1.5)hi 0x3FEF730B, 0xD281F69B */
1690   1.57079632679489655800e+00, /* atan(inf)hi 0x3FF921FB, 0x54442D18 */
1691 };
1692
1693 static f64 atanlo[] = {
1694   2.26987774529616870924e-17, /* atan(0.5)lo 0x3C7A2B7F, 0x222F65E2 */
1695   3.06161699786838301793e-17, /* atan(1.0)lo 0x3C81A626, 0x33145C07 */
1696   1.39033110312309984516e-17, /* atan(1.5)lo 0x3C700788, 0x7AF0CBBD */
1697   6.12323399573676603587e-17, /* atan(inf)lo 0x3C91A626, 0x33145C07 */
1698 };
1699
1700 static f64 aT[] = {
1701   3.33333333333329318027e-01, /* 0x3FD55555, 0x5555550D */
1702  -1.99999999998764832476e-01, /* 0xBFC99999, 0x9998EBC4 */
1703   1.42857142725034663711e-01, /* 0x3FC24924, 0x920083FF */
1704  -1.11111104054623557880e-01, /* 0xBFBC71C6, 0xFE231671 */
1705   9.09088713343650656196e-02, /* 0x3FB745CD, 0xC54C206E */
1706  -7.69187620504482999495e-02, /* 0xBFB3B0F2, 0xAF749A6D */
1707   6.66107313738753120669e-02, /* 0x3FB10D66, 0xA0D03D51 */
1708  -5.83357013379057348645e-02, /* 0xBFADDE2D, 0x52DEFD9A */
1709   4.97687799461593236017e-02, /* 0x3FA97B4B, 0x24760DEB */
1710  -3.65315727442169155270e-02, /* 0xBFA2B444, 0x2C6A6C2F */
1711   1.62858201153657823623e-02, /* 0x3F90AD3A, 0xE322DA11 */
1712 };
1713
1714 static f64 atan(f64 x)
1715 {
1716         f64 w,s1,s2,z;
1717         u32 ix,sign;
1718         s32 id;
1719
1720         ix = asu64(x) >> 32; /* get high word */
1721         sign = ix >> 31;
1722         ix &= 0x7fffffff;
1723         if (ix >= 0x44100000) {   /* if |x| >= 2^66 */
1724                 if (isnan(x))
1725                         return x;
1726                 z = atanhi[3] + 0x1p-120f;
1727                 return sign ? -z : z;
1728         }
1729         if (ix < 0x3fdc0000) {    /* |x| < 0.4375 */
1730                 if (ix < 0x3e400000) {  /* |x| < 2^-27 */
1731                         if (ix < 0x00100000)
1732                                 /* raise underflow for subnormal x */
1733                                 fp_force_eval_f32((f32)x);
1734                         return x;
1735                 }
1736                 id = -1;
1737         } else {
1738                 x = fabs(x);
1739                 if (ix < 0x3ff30000) {  /* |x| < 1.1875 */
1740                         if (ix < 0x3fe60000) {  /*  7/16 <= |x| < 11/16 */
1741                                 id = 0;
1742                                 x = (2.0*x-1.0)/(2.0+x);
1743                         } else {                /* 11/16 <= |x| < 19/16 */
1744                                 id = 1;
1745                                 x = (x-1.0)/(x+1.0);
1746                         }
1747                 } else {
1748                         if (ix < 0x40038000) {  /* |x| < 2.4375 */
1749                                 id = 2;
1750                                 x = (x-1.5)/(1.0+1.5*x);
1751                         } else {                /* 2.4375 <= |x| < 2^66 */
1752                                 id = 3;
1753                                 x = -1.0/x;
1754                         }
1755                 }
1756         }
1757         /* end of argument reduction */
1758         z = x*x;
1759         w = z*z;
1760         /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
1761         s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*(aT[6]+w*(aT[8]+w*aT[10])))));
1762         s2 = w*(aT[1]+w*(aT[3]+w*(aT[5]+w*(aT[7]+w*aT[9]))));
1763         if (id < 0)
1764                 return x - x*(s1+s2);
1765         z = atanhi[id] - (x*(s1+s2) - atanlo[id] - x);
1766         return sign ? -z : z;
1767 }
1768 /*
1769  * Shared data between exp, exp2 and pow.
1770  *
1771  * Copyright (c) 2018, Arm Limited.
1772  * SPDX-License-Identifier: MIT
1773  */
1774 /*
1775  * Copyright (c) 2018, Arm Limited.
1776  * SPDX-License-Identifier: MIT
1777  */
1778 #define EXP_TABLE_BITS 7
1779 #define EXP_POLY_ORDER 5
1780 #define EXP_USE_TOINT_NARROW 0
1781 #define EXP2_POLY_ORDER 5
1782 #define N (1 << EXP_TABLE_BITS)
1783 static struct exp_data {
1784         f64 invln2N;
1785         f64 shift;
1786         f64 negln2hiN;
1787         f64 negln2loN;
1788         f64 poly[4]; /* Last four coefficients.  */
1789         f64 exp2_shift;
1790         f64 exp2_poly[EXP2_POLY_ORDER];
1791         u64 tab[2*(1 << EXP_TABLE_BITS)];
1792 } __exp_data = {
1793 // N/ln2
1794 .invln2N = 0x1.71547652b82fep0 * N,
1795 // -ln2/N
1796 .negln2hiN = -0x1.62e42fefa0000p-8,
1797 .negln2loN = -0x1.cf79abc9e3b3ap-47,
1798 // Used for rounding when !TOINT_INTRINSICS
1799 #if EXP_USE_TOINT_NARROW
1800 .shift = 0x1800000000.8p0,
1801 #else
1802 .shift = 0x1.8p52,
1803 #endif
1804 // exp polynomial coefficients.
1805 .poly = {
1806 // abs error: 1.555*2^-66
1807 // ulp error: 0.509 (0.511 without fma)
1808 // if |x| < ln2/256+eps
1809 // abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65
1810 // abs error if |x| < ln2/128: 1.7145*2^-56
1811 0x1.ffffffffffdbdp-2,
1812 0x1.555555555543cp-3,
1813 0x1.55555cf172b91p-5,
1814 0x1.1111167a4d017p-7,
1815 },
1816 .exp2_shift = 0x1.8p52 / N,
1817 // exp2 polynomial coefficients.
1818 .exp2_poly = {
1819 // abs error: 1.2195*2^-65
1820 // ulp error: 0.507 (0.511 without fma)
1821 // if |x| < 1/256
1822 // abs error if |x| < 1/128: 1.9941*2^-56
1823 0x1.62e42fefa39efp-1,
1824 0x1.ebfbdff82c424p-3,
1825 0x1.c6b08d70cf4b5p-5,
1826 0x1.3b2abd24650ccp-7,
1827 0x1.5d7e09b4e3a84p-10,
1828 },
1829 // 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
1830 // tab[2*k] = asuint64(T[k])
1831 // tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
1832 .tab = {
1833 0x0, 0x3ff0000000000000,
1834 0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335,
1835 0xbc7160139cd8dc5d, 0x3fefec9a3e778061,
1836 0xbc905e7a108766d1, 0x3fefe315e86e7f85,
1837 0x3c8cd2523567f613, 0x3fefd9b0d3158574,
1838 0xbc8bce8023f98efa, 0x3fefd06b29ddf6de,
1839 0x3c60f74e61e6c861, 0x3fefc74518759bc8,
1840 0x3c90a3e45b33d399, 0x3fefbe3ecac6f383,
1841 0x3c979aa65d837b6d, 0x3fefb5586cf9890f,
1842 0x3c8eb51a92fdeffc, 0x3fefac922b7247f7,
1843 0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2,
1844 0xbc6a033489906e0b, 0x3fef9b66affed31b,
1845 0xbc9556522a2fbd0e, 0x3fef9301d0125b51,
1846 0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc,
1847 0xbc91c923b9d5f416, 0x3fef829aaea92de0,
1848 0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51,
1849 0xbc801b15eaa59348, 0x3fef72b83c7d517b,
1850 0xbc8f1ff055de323d, 0x3fef6af9388c8dea,
1851 0x3c8b898c3f1353bf, 0x3fef635beb6fcb75,
1852 0xbc96d99c7611eb26, 0x3fef5be084045cd4,
1853 0x3c9aecf73e3a2f60, 0x3fef54873168b9aa,
1854 0xbc8fe782cb86389d, 0x3fef4d5022fcd91d,
1855 0x3c8a6f4144a6c38d, 0x3fef463b88628cd6,
1856 0x3c807a05b0e4047d, 0x3fef3f49917ddc96,
1857 0x3c968efde3a8a894, 0x3fef387a6e756238,
1858 0x3c875e18f274487d, 0x3fef31ce4fb2a63f,
1859 0x3c80472b981fe7f2, 0x3fef2b4565e27cdd,
1860 0xbc96b87b3f71085e, 0x3fef24dfe1f56381,
1861 0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1,
1862 0xbc3d219b1a6fbffa, 0x3fef187fd0dad990,
1863 0x3c8b3782720c0ab4, 0x3fef1285a6e4030b,
1864 0x3c6e149289cecb8f, 0x3fef0cafa93e2f56,
1865 0x3c834d754db0abb6, 0x3fef06fe0a31b715,
1866 0x3c864201e2ac744c, 0x3fef0170fc4cd831,
1867 0x3c8fdd395dd3f84a, 0x3feefc08b26416ff,
1868 0xbc86a3803b8e5b04, 0x3feef6c55f929ff1,
1869 0xbc924aedcc4b5068, 0x3feef1a7373aa9cb,
1870 0xbc9907f81b512d8e, 0x3feeecae6d05d866,
1871 0xbc71d1e83e9436d2, 0x3feee7db34e59ff7,
1872 0xbc991919b3ce1b15, 0x3feee32dc313a8e5,
1873 0x3c859f48a72a4c6d, 0x3feedea64c123422,
1874 0xbc9312607a28698a, 0x3feeda4504ac801c,
1875 0xbc58a78f4817895b, 0x3feed60a21f72e2a,
1876 0xbc7c2c9b67499a1b, 0x3feed1f5d950a897,
1877 0x3c4363ed60c2ac11, 0x3feece086061892d,
1878 0x3c9666093b0664ef, 0x3feeca41ed1d0057,
1879 0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0,
1880 0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de,
1881 0x3c7690cebb7aafb0, 0x3feebfdad5362a27,
1882 0x3c931dbdeb54e077, 0x3feebcb299fddd0d,
1883 0xbc8f94340071a38e, 0x3feeb9b2769d2ca7,
1884 0xbc87deccdc93a349, 0x3feeb6daa2cf6642,
1885 0xbc78dec6bd0f385f, 0x3feeb42b569d4f82,
1886 0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f,
1887 0x3c93350518fdd78e, 0x3feeaf4736b527da,
1888 0x3c7b98b72f8a9b05, 0x3feead12d497c7fd,
1889 0x3c9063e1e21c5409, 0x3feeab07dd485429,
1890 0x3c34c7855019c6ea, 0x3feea9268a5946b7,
1891 0x3c9432e62b64c035, 0x3feea76f15ad2148,
1892 0xbc8ce44a6199769f, 0x3feea5e1b976dc09,
1893 0xbc8c33c53bef4da8, 0x3feea47eb03a5585,
1894 0xbc845378892be9ae, 0x3feea34634ccc320,
1895 0xbc93cedd78565858, 0x3feea23882552225,
1896 0x3c5710aa807e1964, 0x3feea155d44ca973,
1897 0xbc93b3efbf5e2228, 0x3feea09e667f3bcd,
1898 0xbc6a12ad8734b982, 0x3feea012750bdabf,
1899 0xbc6367efb86da9ee, 0x3fee9fb23c651a2f,
1900 0xbc80dc3d54e08851, 0x3fee9f7df9519484,
1901 0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74,
1902 0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174,
1903 0xbc8619321e55e68a, 0x3fee9feb564267c9,
1904 0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f,
1905 0xbc7b32dcb94da51d, 0x3feea11473eb0187,
1906 0x3c94ecfd5467c06b, 0x3feea1ed0130c132,
1907 0x3c65ebe1abd66c55, 0x3feea2f336cf4e62,
1908 0xbc88a1c52fb3cf42, 0x3feea427543e1a12,
1909 0xbc9369b6f13b3734, 0x3feea589994cce13,
1910 0xbc805e843a19ff1e, 0x3feea71a4623c7ad,
1911 0xbc94d450d872576e, 0x3feea8d99b4492ed,
1912 0x3c90ad675b0e8a00, 0x3feeaac7d98a6699,
1913 0x3c8db72fc1f0eab4, 0x3feeace5422aa0db,
1914 0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c,
1915 0x3c7bf68359f35f44, 0x3feeb1ae99157736,
1916 0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6,
1917 0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5,
1918 0xbc6c23f97c90b959, 0x3feeba44cbc8520f,
1919 0xbc92434322f4f9aa, 0x3feebd829fde4e50,
1920 0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba,
1921 0x3c71affc2b91ce27, 0x3feec49182a3f090,
1922 0x3c6dd235e10a73bb, 0x3feec86319e32323,
1923 0xbc87c50422622263, 0x3feecc667b5de565,
1924 0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33,
1925 0xbc91bbd1d3bcbb15, 0x3feed503b23e255d,
1926 0x3c90cc319cee31d2, 0x3feed99e1330b358,
1927 0x3c8469846e735ab3, 0x3feede6b5579fdbf,
1928 0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a,
1929 0x3c8c1a7792cb3387, 0x3feee89f995ad3ad,
1930 0xbc907b8f4ad1d9fa, 0x3feeee07298db666,
1931 0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb,
1932 0xbc90a40e3da6f640, 0x3feef9728de5593a,
1933 0xbc68d6f438ad9334, 0x3feeff76f2fb5e47,
1934 0xbc91eee26b588a35, 0x3fef05b030a1064a,
1935 0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2,
1936 0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09,
1937 0x3c736eae30af0cb3, 0x3fef199bdd85529c,
1938 0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a,
1939 0x3c84e08fd10959ac, 0x3fef27f12e57d14b,
1940 0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5,
1941 0x3c676b2c6c921968, 0x3fef3720dcef9069,
1942 0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa,
1943 0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c,
1944 0xbc900dae3875a949, 0x3fef4f87080d89f2,
1945 0x3c74a385a63d07a7, 0x3fef5818dcfba487,
1946 0xbc82919e2040220f, 0x3fef60e316c98398,
1947 0x3c8e5a50d5c192ac, 0x3fef69e603db3285,
1948 0x3c843a59ac016b4b, 0x3fef7321f301b460,
1949 0xbc82d52107b43e1f, 0x3fef7c97337b9b5f,
1950 0xbc892ab93b470dc9, 0x3fef864614f5a129,
1951 0x3c74b604603a88d3, 0x3fef902ee78b3ff6,
1952 0x3c83c5ec519d7271, 0x3fef9a51fbc74c83,
1953 0xbc8ff7128fd391f0, 0x3fefa4afa2a490da,
1954 0xbc8dae98e223747d, 0x3fefaf482d8e67f1,
1955 0x3c8ec3bc41aa2008, 0x3fefba1bee615a27,
1956 0x3c842b94c3a9eb32, 0x3fefc52b376bba97,
1957 0x3c8a64a931d185ee, 0x3fefd0765b6e4540,
1958 0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14,
1959 0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8,
1960 0x3c5305c14160cc89, 0x3feff3c22b8f71f1,
1961 },
1962 };
1963 /*
1964  * Double-precision e^x function.
1965  *
1966  * Copyright (c) 2018, Arm Limited.
1967  * SPDX-License-Identifier: MIT
1968  */
1969 #define N (1 << EXP_TABLE_BITS)
1970 #define InvLn2N __exp_data.invln2N
1971 #define NegLn2hiN __exp_data.negln2hiN
1972 #define NegLn2loN __exp_data.negln2loN
1973 #define Shift __exp_data.shift
1974 #define T __exp_data.tab
1975 #define C2 __exp_data.poly[5 - EXP_POLY_ORDER]
1976 #define C3 __exp_data.poly[6 - EXP_POLY_ORDER]
1977 #define C4 __exp_data.poly[7 - EXP_POLY_ORDER]
1978 #define C5 __exp_data.poly[8 - EXP_POLY_ORDER]
1979 /* Handle cases that may overflow or underflow when computing the result that
1980    is scale*(1+TMP) without intermediate rounding.  The bit representation of
1981    scale is in SBITS, however it has a computed exponent that may have
1982    overflown into the sign bit so that needs to be adjusted before using it as
1983    a double.  (int32_t)KI is the k used in the argument reduction and exponent
1984    adjustment of scale, positive k here means the result may overflow and
1985    negative k means the result may underflow.  */
1986 static f64 exp_specialcase(f64 tmp, u64 sbits, u64 ki)
1987 {
1988         f64 scale, y;
1989
1990         if ((ki & 0x80000000) == 0) {
1991                 /* k > 0, the exponent of scale might have overflowed by <= 460.  */
1992                 sbits -= 1009ul << 52;
1993                 scale = asf64(sbits);
1994                 y = 0x1p1009 * (scale + scale * tmp);
1995                 return eval_as_f64(y);
1996         }
1997         /* k < 0, need special care in the subnormal range.  */
1998         sbits += 1022ul << 52;
1999         scale = asf64(sbits);
2000         y = scale + scale * tmp;
2001         if (y < 1.0) {
2002                 /* Round y to the right precision before scaling it into the subnormal
2003                  range to avoid double rounding that can cause 0.5+E/2 ulp error where
2004                  E is the worst-case ulp error outside the subnormal range.  So this
2005                  is only useful if the goal is better than 1 ulp worst-case error.  */
2006                 f64 hi, lo;
2007                 lo = scale - y + scale * tmp;
2008                 hi = 1.0 + y;
2009                 lo = 1.0 - hi + y + lo;
2010                 y = eval_as_f64(hi + lo) - 1.0;
2011                 /* Avoid -0.0 with downward rounding.  */
2012                 if (WANT_ROUNDING && y == 0.0)
2013                         y = 0.0;
2014                 /* The underflow exception needs to be signaled explicitly.  */
2015                 fp_force_eval(fp_barrier(0x1p-1022) * 0x1p-1022);
2016         }
2017         y = 0x1p-1022 * y;
2018         return eval_as_f64(y);
2019 }
2020 /* Top 12 bits of a double (sign and exponent bits).  */
2021 static u32 top12(f64 x)
2022 {
2023         return asu64(x) >> 52;
2024 }
2025 static f64 exp(f64 x)
2026 {
2027         u32 abstop;
2028         u64 ki, idx, top, sbits;
2029         f64 kd, z, r, r2, scale, tail, tmp;
2030
2031         abstop = top12(x) & 0x7ff;
2032         if (predict_false(abstop - top12(0x1p-54) >= top12(512.0) - top12(0x1p-54))) {
2033                 if (abstop - top12(0x1p-54) >= 0x80000000)
2034                         /* Avoid spurious underflow for tiny x.  */
2035                         /* Note: 0 is common input.  */
2036                         return WANT_ROUNDING ? 1.0 + x : 1.0;
2037                 if (abstop >= top12(1024.0)) {
2038                         if (asu64(x) == asu64(-INFINITY))
2039                                 return 0.0;
2040                         if (abstop >= top12(INFINITY))
2041                                 return 1.0 + x;
2042                         if (asu64(x) >> 63)
2043                                 return __math_uflow(0);
2044                         else
2045                                 return __math_oflow(0);
2046                 }
2047                 /* Large x is special cased below.  */
2048                 abstop = 0;
2049         }
2050
2051         /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)].  */
2052         /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N].  */
2053         z = InvLn2N * x;
2054 #if TOINT_INTRINSICS
2055         kd = roundtoint(z);
2056         ki = converttoint(z);
2057 #elif EXP_USE_TOINT_NARROW
2058         /* z - kd is in [-0.5-2^-16, 0.5] in all rounding modes.  */
2059         kd = eval_as_double(z + Shift);
2060         ki = asuint64(kd) >> 16;
2061         kd = (double_t)(int32_t)ki;
2062 #else
2063         /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
2064         kd = eval_as_f64(z + Shift);
2065         ki = asu64(kd);
2066         kd -= Shift;
2067 #endif
2068         r = x + kd * NegLn2hiN + kd * NegLn2loN;
2069         /* 2^(k/N) ~= scale * (1 + tail).  */
2070         idx = 2 * (ki % N);
2071         top = ki << (52 - EXP_TABLE_BITS);
2072         tail = asf64(T[idx]);
2073         /* This is only a valid scale when -1023*N < k < 1024*N.  */
2074         sbits = T[idx + 1] + top;
2075         /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1).  */
2076         /* Evaluation is optimized assuming superscalar pipelined execution.  */
2077         r2 = r * r;
2078         /* Without fma the worst case error is 0.25/N ulp larger.  */
2079         /* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp.  */
2080         tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5);
2081         if (predict_false(abstop == 0))
2082                 return exp_specialcase(tmp, sbits, ki);
2083         scale = asf64(sbits);
2084         /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
2085            is no spurious underflow here even without fma.  */
2086         return eval_as_f64(scale + scale * tmp);
2087 }
2088 #undef EXP_TABLE_BITS
2089 #undef EXP_POLY_ORDER
2090 #undef EXP_USE_TOINT_NARROW
2091 #undef EXP2_POLY_ORDER
2092 #undef N
2093 #undef InvLn2N
2094 #undef NegLn2hiN
2095 #undef NegLn2loN
2096 #undef Shift
2097 #undef T
2098 #undef C2
2099 #undef C3
2100 #undef C4
2101 #undef C5
2102 /*----------------------------------------------------------------------------*/
2103 #undef u32
2104 #undef s32
2105 #undef s64
2106 #undef u64
2107 #undef f64
2108 #undef WANT_ROUNDING
2109 #undef asu64
2110 #undef asf64
2111 #undef predict_true
2112 #undef predict_false
2113 #undef NAN
2114 #undef INFINITY
2115 #undef EPS
2116 #undef isnan