exp2l: Work around a NetBSD 10.0/i386 bug.
[gnulib.git] / lib / fma.c
blob40ad36789732a3393d00c728020f2425688d7578
1 /* Fused multiply-add.
2 Copyright (C) 2007, 2009, 2011-2024 Free Software Foundation, Inc.
4 This file is free software: you can redistribute it and/or modify
5 it under the terms of the GNU Lesser General Public License as
6 published by the Free Software Foundation, either version 3 of the
7 License, or (at your option) any later version.
9 This file is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>, 2011. */
19 #if ! defined USE_LONG_DOUBLE
20 # include <config.h>
21 #endif
23 /* Specification. */
24 #include <math.h>
26 #include <limits.h>
27 #include <stdlib.h>
29 #if HAVE_FEGETROUND
30 # include <fenv.h>
31 #endif
33 #include "float+.h"
34 #include "integer_length.h"
36 #ifdef USE_LONG_DOUBLE
37 # define FUNC fmal
38 # define DOUBLE long double
39 # define FREXP frexpl
40 # define LDEXP ldexpl
41 # define MIN_EXP LDBL_MIN_EXP
42 # define MANT_BIT LDBL_MANT_BIT
43 # define L_(literal) literal##L
44 #elif ! defined USE_FLOAT
45 # define FUNC fma
46 # define DOUBLE double
47 # define FREXP frexp
48 # define LDEXP ldexp
49 # define MIN_EXP DBL_MIN_EXP
50 # define MANT_BIT DBL_MANT_BIT
51 # define L_(literal) literal
52 #else /* defined USE_FLOAT */
53 # define FUNC fmaf
54 # define DOUBLE float
55 # define FREXP frexpf
56 # define LDEXP ldexpf
57 # define MIN_EXP FLT_MIN_EXP
58 # define MANT_BIT FLT_MANT_BIT
59 # define L_(literal) literal##f
60 #endif
62 #undef MAX
63 #define MAX(a,b) ((a) > (b) ? (a) : (b))
65 #undef MIN
66 #define MIN(a,b) ((a) < (b) ? (a) : (b))
68 /* MSVC with option -fp:strict refuses to compile constant initializers that
69 contain floating-point operations. Pacify this compiler. */
70 #if defined _MSC_VER && !defined __clang__
71 # pragma fenv_access (off)
72 #endif
74 /* Work around GCC 4.2.1 bug on OpenBSD 5.1/SPARC64. */
75 #if defined __GNUC__ && defined __sparc__
76 # define VOLATILE volatile
77 #else
78 # define VOLATILE
79 #endif
81 /* It is possible to write an implementation of fused multiply-add with
82 floating-point operations alone. See
83 Sylvie Boldo, Guillaume Melquiond:
84 Emulation of FMA and correctly-rounded sums: proved algorithms using
85 rounding to odd.
86 <https://www.lri.fr/~melquion/doc/08-tc.pdf>
87 But is it complicated.
88 Here we take the simpler (and probably slower) approach of doing
89 multi-precision arithmetic. */
91 /* We use the naming conventions of GNU gmp, but vastly simpler (and slower)
92 algorithms. */
94 typedef unsigned int mp_limb_t;
95 #define GMP_LIMB_BITS 32
96 static_assert (sizeof (mp_limb_t) * CHAR_BIT == GMP_LIMB_BITS);
98 typedef unsigned long long mp_twolimb_t;
99 #define GMP_TWOLIMB_BITS 64
100 static_assert (sizeof (mp_twolimb_t) * CHAR_BIT == GMP_TWOLIMB_BITS);
102 /* Number of limbs needed for a single DOUBLE. */
103 #define NLIMBS1 ((MANT_BIT + GMP_LIMB_BITS - 1) / GMP_LIMB_BITS)
105 /* Number of limbs needed for the accumulator. */
106 #define NLIMBS3 (3 * NLIMBS1 + 1)
108 /* Assuming 0.5 <= x < 1.0:
109 Convert the mantissa (x * 2^DBL_MANT_BIT) to a sequence of limbs. */
110 static void
111 decode (DOUBLE x, mp_limb_t limbs[NLIMBS1])
113 /* I'm not sure whether it's safe to cast a 'double' value between
114 2^31 and 2^32 to 'unsigned int', therefore play safe and cast only
115 'double' values between 0 and 2^31 (to 'unsigned int' or 'int',
116 doesn't matter).
117 So, we split the MANT_BIT bits of x into a number of chunks of
118 at most 31 bits. */
119 enum { chunk_count = (MANT_BIT - 1) / 31 + 1 };
120 /* Variables used for storing the bits limb after limb. */
121 mp_limb_t *p = limbs + NLIMBS1 - 1;
122 mp_limb_t accu = 0;
123 unsigned int bits_needed = MANT_BIT - (NLIMBS1 - 1) * GMP_LIMB_BITS;
124 /* The bits bits_needed-1...0 need to be ORed into the accu.
125 1 <= bits_needed <= GMP_LIMB_BITS. */
126 /* Unroll the first 4 loop rounds. */
127 if (chunk_count > 0)
129 /* Here we still have MANT_BIT-0*31 bits to extract from x. */
130 enum { chunk_bits = MIN (31, MANT_BIT - 0 * 31) }; /* > 0, <= 31 */
131 mp_limb_t d;
132 x *= (mp_limb_t) 1 << chunk_bits;
133 d = (int) x; /* 0 <= d < 2^chunk_bits. */
134 x -= d;
135 if (!(x >= L_(0.0) && x < L_(1.0)))
136 abort ();
137 if (bits_needed < chunk_bits)
139 /* store bits_needed bits */
140 accu |= d >> (chunk_bits - bits_needed);
141 *p = accu;
142 if (p == limbs)
143 goto done;
144 p--;
145 /* hold (chunk_bits - bits_needed) bits */
146 accu = d << (GMP_LIMB_BITS - (chunk_bits - bits_needed));
147 bits_needed = GMP_LIMB_BITS - (chunk_bits - bits_needed);
149 else
151 /* store chunk_bits bits */
152 accu |= d << (bits_needed - chunk_bits);
153 bits_needed -= chunk_bits;
154 if (bits_needed == 0)
156 *p = accu;
157 if (p == limbs)
158 goto done;
159 p--;
160 accu = 0;
161 bits_needed = GMP_LIMB_BITS;
165 if (chunk_count > 1)
167 /* Here we still have MANT_BIT-1*31 bits to extract from x. */
168 enum { chunk_bits = MIN (31, MAX (MANT_BIT - 1 * 31, 0)) }; /* > 0, <= 31 */
169 mp_limb_t d;
170 x *= (mp_limb_t) 1 << chunk_bits;
171 d = (int) x; /* 0 <= d < 2^chunk_bits. */
172 x -= d;
173 if (!(x >= L_(0.0) && x < L_(1.0)))
174 abort ();
175 if (bits_needed < chunk_bits)
177 /* store bits_needed bits */
178 accu |= d >> (chunk_bits - bits_needed);
179 *p = accu;
180 if (p == limbs)
181 goto done;
182 p--;
183 /* hold (chunk_bits - bits_needed) bits */
184 accu = d << (GMP_LIMB_BITS - (chunk_bits - bits_needed));
185 bits_needed = GMP_LIMB_BITS - (chunk_bits - bits_needed);
187 else
189 /* store chunk_bits bits */
190 accu |= d << (bits_needed - chunk_bits);
191 bits_needed -= chunk_bits;
192 if (bits_needed == 0)
194 *p = accu;
195 if (p == limbs)
196 goto done;
197 p--;
198 accu = 0;
199 bits_needed = GMP_LIMB_BITS;
203 if (chunk_count > 2)
205 /* Here we still have MANT_BIT-2*31 bits to extract from x. */
206 enum { chunk_bits = MIN (31, MAX (MANT_BIT - 2 * 31, 0)) }; /* > 0, <= 31 */
207 mp_limb_t d;
208 x *= (mp_limb_t) 1 << chunk_bits;
209 d = (int) x; /* 0 <= d < 2^chunk_bits. */
210 x -= d;
211 if (!(x >= L_(0.0) && x < L_(1.0)))
212 abort ();
213 if (bits_needed < chunk_bits)
215 /* store bits_needed bits */
216 accu |= d >> (chunk_bits - bits_needed);
217 *p = accu;
218 if (p == limbs)
219 goto done;
220 p--;
221 /* hold (chunk_bits - bits_needed) bits */
222 accu = d << (GMP_LIMB_BITS - (chunk_bits - bits_needed));
223 bits_needed = GMP_LIMB_BITS - (chunk_bits - bits_needed);
225 else
227 /* store chunk_bits bits */
228 accu |= d << (bits_needed - chunk_bits);
229 bits_needed -= chunk_bits;
230 if (bits_needed == 0)
232 *p = accu;
233 if (p == limbs)
234 goto done;
235 p--;
236 accu = 0;
237 bits_needed = GMP_LIMB_BITS;
241 if (chunk_count > 3)
243 /* Here we still have MANT_BIT-3*31 bits to extract from x. */
244 enum { chunk_bits = MIN (31, MAX (MANT_BIT - 3 * 31, 0)) }; /* > 0, <= 31 */
245 mp_limb_t d;
246 x *= (mp_limb_t) 1 << chunk_bits;
247 d = (int) x; /* 0 <= d < 2^chunk_bits. */
248 x -= d;
249 if (!(x >= L_(0.0) && x < L_(1.0)))
250 abort ();
251 if (bits_needed < chunk_bits)
253 /* store bits_needed bits */
254 accu |= d >> (chunk_bits - bits_needed);
255 *p = accu;
256 if (p == limbs)
257 goto done;
258 p--;
259 /* hold (chunk_bits - bits_needed) bits */
260 accu = d << (GMP_LIMB_BITS - (chunk_bits - bits_needed));
261 bits_needed = GMP_LIMB_BITS - (chunk_bits - bits_needed);
263 else
265 /* store chunk_bits bits */
266 accu |= d << (bits_needed - chunk_bits);
267 bits_needed -= chunk_bits;
268 if (bits_needed == 0)
270 *p = accu;
271 if (p == limbs)
272 goto done;
273 p--;
274 accu = 0;
275 bits_needed = GMP_LIMB_BITS;
279 if (chunk_count > 4)
281 /* Here we still have MANT_BIT-4*31 bits to extract from x. */
282 /* Generic loop. */
283 size_t k;
284 for (k = 4; k < chunk_count; k++)
286 size_t chunk_bits = MIN (31, MANT_BIT - k * 31); /* > 0, <= 31 */
287 mp_limb_t d;
288 x *= (mp_limb_t) 1 << chunk_bits;
289 d = (int) x; /* 0 <= d < 2^chunk_bits. */
290 x -= d;
291 if (!(x >= L_(0.0) && x < L_(1.0)))
292 abort ();
293 if (bits_needed < chunk_bits)
295 /* store bits_needed bits */
296 accu |= d >> (chunk_bits - bits_needed);
297 *p = accu;
298 if (p == limbs)
299 goto done;
300 p--;
301 /* hold (chunk_bits - bits_needed) bits */
302 accu = d << (GMP_LIMB_BITS - (chunk_bits - bits_needed));
303 bits_needed = GMP_LIMB_BITS - (chunk_bits - bits_needed);
305 else
307 /* store chunk_bits bits */
308 accu |= d << (bits_needed - chunk_bits);
309 bits_needed -= chunk_bits;
310 if (bits_needed == 0)
312 *p = accu;
313 if (p == limbs)
314 goto done;
315 p--;
316 accu = 0;
317 bits_needed = GMP_LIMB_BITS;
322 /* We shouldn't get here. */
323 abort ();
325 done: ;
326 #ifndef USE_LONG_DOUBLE /* On FreeBSD 6.1/x86, 'long double' numbers sometimes
327 have excess precision. */
328 if (!(x == L_(0.0)))
329 abort ();
330 #endif
333 /* Multiply two sequences of limbs. */
334 static void
335 multiply (mp_limb_t xlimbs[NLIMBS1], mp_limb_t ylimbs[NLIMBS1],
336 mp_limb_t prod_limbs[2 * NLIMBS1])
338 size_t k, i, j;
339 enum { len1 = NLIMBS1 };
340 enum { len2 = NLIMBS1 };
342 for (k = len2; k > 0; )
343 prod_limbs[--k] = 0;
344 for (i = 0; i < len1; i++)
346 mp_limb_t digit1 = xlimbs[i];
347 mp_twolimb_t carry = 0;
348 for (j = 0; j < len2; j++)
350 mp_limb_t digit2 = ylimbs[j];
351 carry += (mp_twolimb_t) digit1 * (mp_twolimb_t) digit2;
352 carry += prod_limbs[i + j];
353 prod_limbs[i + j] = (mp_limb_t) carry;
354 carry = carry >> GMP_LIMB_BITS;
356 prod_limbs[i + len2] = (mp_limb_t) carry;
360 DOUBLE
361 FUNC (DOUBLE x, DOUBLE y, DOUBLE z)
363 if (isfinite (x) && isfinite (y))
365 if (isfinite (z))
367 /* x, y, z are all finite. */
368 if (x == L_(0.0) || y == L_(0.0))
369 return z;
370 if (z == L_(0.0))
371 return x * y;
372 /* x, y, z are all non-zero.
373 The result is x * y + z. */
375 int e; /* exponent of x * y + z */
376 int sign;
377 mp_limb_t sum[NLIMBS3];
378 size_t sum_len;
381 int xys; /* sign of x * y */
382 int zs; /* sign of z */
383 int xye; /* sum of exponents of x and y */
384 int ze; /* exponent of z */
385 mp_limb_t summand1[NLIMBS3];
386 size_t summand1_len;
387 mp_limb_t summand2[NLIMBS3];
388 size_t summand2_len;
391 mp_limb_t zlimbs[NLIMBS1];
392 mp_limb_t xylimbs[2 * NLIMBS1];
395 DOUBLE xn; /* normalized part of x */
396 DOUBLE yn; /* normalized part of y */
397 DOUBLE zn; /* normalized part of z */
398 int xe; /* exponent of x */
399 int ye; /* exponent of y */
400 mp_limb_t xlimbs[NLIMBS1];
401 mp_limb_t ylimbs[NLIMBS1];
403 xys = 0;
404 xn = x;
405 if (x < 0)
407 xys = 1;
408 xn = - x;
410 yn = y;
411 if (y < 0)
413 xys = 1 - xys;
414 yn = - y;
417 zs = 0;
418 zn = z;
419 if (z < 0)
421 zs = 1;
422 zn = - z;
425 /* xn, yn, zn are all positive.
426 The result is (-1)^xys * xn * yn + (-1)^zs * zn. */
427 xn = FREXP (xn, &xe);
428 yn = FREXP (yn, &ye);
429 zn = FREXP (zn, &ze);
430 xye = xe + ye;
431 /* xn, yn, zn are all < 1.0 and >= 0.5.
432 The result is
433 (-1)^xys * 2^xye * xn * yn + (-1)^zs * 2^ze * zn. */
434 if (xye < ze - MANT_BIT)
436 /* 2^xye * xn * yn < 2^xye <= 2^(ze-MANT_BIT-1) */
437 return z;
439 if (xye - 2 * MANT_BIT > ze)
441 /* 2^ze * zn < 2^ze <= 2^(xye-2*MANT_BIT-1).
442 We cannot simply do
443 return x * y;
444 because it would round differently: A round-to-even
445 in the multiplication can be a round-up or round-down
446 here, due to z. So replace z with a value that doesn't
447 require the use of long bignums but that rounds the
448 same way. */
449 zn = L_(0.5);
450 ze = xye - 2 * MANT_BIT - 1;
452 /* Convert mantissas of xn, yn, zn to limb sequences:
453 xlimbs = 2^MANT_BIT * xn
454 ylimbs = 2^MANT_BIT * yn
455 zlimbs = 2^MANT_BIT * zn */
456 decode (xn, xlimbs);
457 decode (yn, ylimbs);
458 decode (zn, zlimbs);
459 /* Multiply the mantissas of xn and yn:
460 xylimbs = xlimbs * ylimbs */
461 multiply (xlimbs, ylimbs, xylimbs);
463 /* The result is
464 (-1)^xys * 2^(xye-2*MANT_BIT) * xylimbs
465 + (-1)^zs * 2^(ze-MANT_BIT) * zlimbs.
466 Compute
467 e = min (xye-2*MANT_BIT, ze-MANT_BIT)
468 then
469 summand1 = 2^(xye-2*MANT_BIT-e) * xylimbs
470 summand2 = 2^(ze-MANT_BIT-e) * zlimbs */
471 e = MIN (xye - 2 * MANT_BIT, ze - MANT_BIT);
472 if (e == xye - 2 * MANT_BIT)
474 /* Simply copy the limbs of xylimbs. */
475 size_t i;
476 for (i = 0; i < 2 * NLIMBS1; i++)
477 summand1[i] = xylimbs[i];
478 summand1_len = 2 * NLIMBS1;
480 else
482 size_t ediff = xye - 2 * MANT_BIT - e;
483 /* Left shift the limbs of xylimbs by ediff bits. */
484 size_t ldiff = ediff / GMP_LIMB_BITS;
485 size_t shift = ediff % GMP_LIMB_BITS;
486 size_t i;
487 for (i = 0; i < ldiff; i++)
488 summand1[i] = 0;
489 if (shift > 0)
491 mp_limb_t carry = 0;
492 for (i = 0; i < 2 * NLIMBS1; i++)
494 summand1[ldiff + i] = (xylimbs[i] << shift) | carry;
495 carry = xylimbs[i] >> (GMP_LIMB_BITS - shift);
497 summand1[ldiff + 2 * NLIMBS1] = carry;
498 summand1_len = ldiff + 2 * NLIMBS1 + 1;
500 else
502 for (i = 0; i < 2 * NLIMBS1; i++)
503 summand1[ldiff + i] = xylimbs[i];
504 summand1_len = ldiff + 2 * NLIMBS1;
506 /* Estimation of needed array size:
507 ediff = (xye - 2 * MANT_BIT) - (ze - MANT_BIT) <= MANT_BIT + 1
508 therefore
509 summand1_len
510 = (ediff + GMP_LIMB_BITS - 1) / GMP_LIMB_BITS + 2 * NLIMBS1
511 <= (MANT_BIT + GMP_LIMB_BITS) / GMP_LIMB_BITS + 2 * NLIMBS1
512 <= 3 * NLIMBS1 + 1
513 = NLIMBS3 */
514 if (!(summand1_len <= NLIMBS3))
515 abort ();
517 if (e == ze - MANT_BIT)
519 /* Simply copy the limbs of zlimbs. */
520 size_t i;
521 for (i = 0; i < NLIMBS1; i++)
522 summand2[i] = zlimbs[i];
523 summand2_len = NLIMBS1;
525 else
527 size_t ediff = ze - MANT_BIT - e;
528 /* Left shift the limbs of zlimbs by ediff bits. */
529 size_t ldiff = ediff / GMP_LIMB_BITS;
530 size_t shift = ediff % GMP_LIMB_BITS;
531 size_t i;
532 for (i = 0; i < ldiff; i++)
533 summand2[i] = 0;
534 if (shift > 0)
536 mp_limb_t carry = 0;
537 for (i = 0; i < NLIMBS1; i++)
539 summand2[ldiff + i] = (zlimbs[i] << shift) | carry;
540 carry = zlimbs[i] >> (GMP_LIMB_BITS - shift);
542 summand2[ldiff + NLIMBS1] = carry;
543 summand2_len = ldiff + NLIMBS1 + 1;
545 else
547 for (i = 0; i < NLIMBS1; i++)
548 summand2[ldiff + i] = zlimbs[i];
549 summand2_len = ldiff + NLIMBS1;
551 /* Estimation of needed array size:
552 ediff = (ze - MANT_BIT) - (xye - 2 * MANT_BIT) <= 2 * MANT_BIT
553 therefore
554 summand2_len
555 = (ediff + GMP_LIMB_BITS - 1) / GMP_LIMB_BITS + NLIMBS1
556 <= (2 * MANT_BIT + GMP_LIMB_BITS - 1) / GMP_LIMB_BITS + NLIMBS1
557 <= 3 * NLIMBS1 + 1
558 = NLIMBS3 */
559 if (!(summand2_len <= NLIMBS3))
560 abort ();
563 /* The result is
564 (-1)^xys * 2^e * summand1 + (-1)^zs * 2^e * summand2. */
565 if (xys == zs)
567 /* Perform an addition. */
568 size_t i;
569 mp_limb_t carry;
571 sign = xys;
572 carry = 0;
573 for (i = 0; i < MIN (summand1_len, summand2_len); i++)
575 mp_limb_t digit1 = summand1[i];
576 mp_limb_t digit2 = summand2[i];
577 sum[i] = digit1 + digit2 + carry;
578 carry = (carry
579 ? digit1 >= (mp_limb_t)-1 - digit2
580 : digit1 > (mp_limb_t)-1 - digit2);
582 if (summand1_len > summand2_len)
583 for (; i < summand1_len; i++)
585 mp_limb_t digit1 = summand1[i];
586 sum[i] = carry + digit1;
587 carry = carry && digit1 == (mp_limb_t)-1;
589 else
590 for (; i < summand2_len; i++)
592 mp_limb_t digit2 = summand2[i];
593 sum[i] = carry + digit2;
594 carry = carry && digit2 == (mp_limb_t)-1;
596 if (carry)
597 sum[i++] = carry;
598 sum_len = i;
600 else
602 /* Perform a subtraction. */
603 /* Compare summand1 and summand2 by magnitude. */
604 while (summand1[summand1_len - 1] == 0)
605 summand1_len--;
606 while (summand2[summand2_len - 1] == 0)
607 summand2_len--;
608 if (summand1_len > summand2_len)
609 sign = xys;
610 else if (summand1_len < summand2_len)
611 sign = zs;
612 else
614 size_t i = summand1_len;
615 for (;;)
617 --i;
618 if (summand1[i] > summand2[i])
620 sign = xys;
621 break;
623 if (summand1[i] < summand2[i])
625 sign = zs;
626 break;
628 if (i == 0)
629 /* summand1 and summand2 are equal. */
630 return L_(0.0);
633 if (sign == xys)
635 /* Compute summand1 - summand2. */
636 size_t i;
637 mp_limb_t carry;
639 carry = 0;
640 for (i = 0; i < summand2_len; i++)
642 mp_limb_t digit1 = summand1[i];
643 mp_limb_t digit2 = summand2[i];
644 sum[i] = digit1 - digit2 - carry;
645 carry = (carry ? digit1 <= digit2 : digit1 < digit2);
647 for (; i < summand1_len; i++)
649 mp_limb_t digit1 = summand1[i];
650 sum[i] = digit1 - carry;
651 carry = carry && digit1 == 0;
653 if (carry)
654 abort ();
655 sum_len = summand1_len;
657 else
659 /* Compute summand2 - summand1. */
660 size_t i;
661 mp_limb_t carry;
663 carry = 0;
664 for (i = 0; i < summand1_len; i++)
666 mp_limb_t digit1 = summand1[i];
667 mp_limb_t digit2 = summand2[i];
668 sum[i] = digit2 - digit1 - carry;
669 carry = (carry ? digit2 <= digit1 : digit2 < digit1);
671 for (; i < summand2_len; i++)
673 mp_limb_t digit2 = summand2[i];
674 sum[i] = digit2 - carry;
675 carry = carry && digit2 == 0;
677 if (carry)
678 abort ();
679 sum_len = summand2_len;
683 /* The result is
684 (-1)^sign * 2^e * sum. */
685 /* Now perform the rounding to MANT_BIT mantissa bits. */
686 while (sum[sum_len - 1] == 0)
687 sum_len--;
688 /* Here we know that the most significant limb, sum[sum_len - 1], is
689 non-zero. */
691 /* How many bits the sum has. */
692 unsigned int sum_bits =
693 integer_length (sum[sum_len - 1]) + (sum_len - 1) * GMP_LIMB_BITS;
694 /* How many bits to keep when rounding. */
695 unsigned int keep_bits;
696 /* How many bits to round off. */
697 unsigned int roundoff_bits;
698 if (e + (int) sum_bits >= MIN_EXP)
699 /* 2^e * sum >= 2^(MIN_EXP-1).
700 result will be a normalized number. */
701 keep_bits = MANT_BIT;
702 else if (e + (int) sum_bits >= MIN_EXP - MANT_BIT)
703 /* 2^e * sum >= 2^(MIN_EXP-MANT_BIT-1).
704 result will be a denormalized number or rounded to zero. */
705 keep_bits = e + (int) sum_bits - (MIN_EXP + MANT_BIT);
706 else
707 /* 2^e * sum < 2^(MIN_EXP-MANT_BIT-1). Round to zero. */
708 return L_(0.0);
709 /* Note: 0 <= keep_bits <= MANT_BIT. */
710 if (sum_bits <= keep_bits)
712 /* Nothing to do. */
713 roundoff_bits = 0;
714 keep_bits = sum_bits;
716 else
718 int round_up;
719 roundoff_bits = sum_bits - keep_bits; /* > 0, <= sum_bits */
721 #if HAVE_FEGETROUND && defined FE_TOWARDZERO
722 /* Cf. <https://pubs.opengroup.org/onlinepubs/9699919799/functions/fegetround.html> */
723 int rounding_mode = fegetround ();
724 if (rounding_mode == FE_TOWARDZERO)
725 round_up = 0;
726 # if defined FE_DOWNWARD /* not defined on sh4 */
727 else if (rounding_mode == FE_DOWNWARD)
728 round_up = sign;
729 # endif
730 # if defined FE_UPWARD /* not defined on sh4 */
731 else if (rounding_mode == FE_UPWARD)
732 round_up = !sign;
733 # endif
734 #else
735 /* Cf. <https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/float.h.html> */
736 int rounding_mode = FLT_ROUNDS;
737 if (rounding_mode == 0) /* toward zero */
738 round_up = 0;
739 else if (rounding_mode == 3) /* toward negative infinity */
740 round_up = sign;
741 else if (rounding_mode == 2) /* toward positive infinity */
742 round_up = !sign;
743 #endif
744 else
746 /* Round to nearest. */
747 round_up = 0;
748 /* Test bit (roundoff_bits-1). */
749 if ((sum[(roundoff_bits - 1) / GMP_LIMB_BITS]
750 >> ((roundoff_bits - 1) % GMP_LIMB_BITS)) & 1)
752 /* Test bits roundoff_bits-1 .. 0. */
753 bool halfway =
754 ((sum[(roundoff_bits - 1) / GMP_LIMB_BITS]
755 & (((mp_limb_t) 1 << ((roundoff_bits - 1) % GMP_LIMB_BITS)) - 1))
756 == 0);
757 if (halfway)
759 int i;
760 for (i = (roundoff_bits - 1) / GMP_LIMB_BITS - 1; i >= 0; i--)
761 if (sum[i] != 0)
763 halfway = false;
764 break;
767 if (halfway)
768 /* Round to even. Test bit roundoff_bits. */
769 round_up = ((sum[roundoff_bits / GMP_LIMB_BITS]
770 >> (roundoff_bits % GMP_LIMB_BITS)) & 1);
771 else
772 /* Round up. */
773 round_up = 1;
777 /* Perform the rounding. */
779 size_t i = roundoff_bits / GMP_LIMB_BITS;
781 size_t j = i;
782 while (j > 0)
783 sum[--j] = 0;
785 if (round_up)
787 /* Round up. */
788 sum[i] =
789 (sum[i]
790 | (((mp_limb_t) 1 << (roundoff_bits % GMP_LIMB_BITS)) - 1))
791 + 1;
792 if (sum[i] == 0)
794 /* Propagate carry. */
795 while (i < sum_len - 1)
797 ++i;
798 sum[i] += 1;
799 if (sum[i] != 0)
800 break;
803 /* sum[i] is the most significant limb that was
804 incremented. */
805 if (i == sum_len - 1 && (sum[i] & (sum[i] - 1)) == 0)
807 /* Through the carry, one more bit is needed. */
808 if (sum[i] != 0)
809 sum_bits += 1;
810 else
812 /* Instead of requiring one more limb of memory,
813 perform a shift by one bit, and adjust the
814 exponent. */
815 sum[i] = (mp_limb_t) 1 << (GMP_LIMB_BITS - 1);
816 e += 1;
818 /* The bit sequence has the form 1000...000. */
819 keep_bits = 1;
822 else
824 /* Round down. */
825 sum[i] &= ((mp_limb_t) -1 << (roundoff_bits % GMP_LIMB_BITS));
826 if (i == sum_len - 1 && sum[i] == 0)
827 /* The entire sum has become zero. */
828 return L_(0.0);
832 /* The result is
833 (-1)^sign * 2^e * sum
834 and here we know that
835 2^(sum_bits-1) <= sum < 2^sum_bits,
836 and sum is a multiple of 2^(sum_bits-keep_bits), where
837 0 < keep_bits <= MANT_BIT and keep_bits <= sum_bits.
838 (If keep_bits was initially 0, the rounding either returned zero
839 or produced a bit sequence of the form 1000...000, setting
840 keep_bits to 1.) */
842 /* Split the keep_bits bits into chunks of at most 32 bits. */
843 unsigned int chunk_count = (keep_bits - 1) / GMP_LIMB_BITS + 1;
844 /* 1 <= chunk_count <= ceil (sum_bits / GMP_LIMB_BITS) = sum_len. */
845 static const DOUBLE chunk_multiplier = /* 2^-GMP_LIMB_BITS */
846 L_(1.0) / ((DOUBLE) (1 << (GMP_LIMB_BITS / 2))
847 * (DOUBLE) (1 << ((GMP_LIMB_BITS + 1) / 2)));
848 unsigned int shift = sum_bits % GMP_LIMB_BITS;
849 DOUBLE fsum;
850 if (MANT_BIT <= GMP_LIMB_BITS)
852 /* Since keep_bits <= MANT_BIT <= GMP_LIMB_BITS,
853 chunk_count is 1. No need for a loop. */
854 if (shift == 0)
855 fsum = (DOUBLE) sum[sum_len - 1];
856 else
857 fsum = (DOUBLE)
858 ((sum[sum_len - 1] << (GMP_LIMB_BITS - shift))
859 | (sum_len >= 2 ? sum[sum_len - 2] >> shift : 0));
861 else
863 int k;
864 k = chunk_count - 1;
865 if (shift == 0)
867 /* First loop round. */
868 fsum = (DOUBLE) sum[sum_len - k - 1];
869 /* General loop. */
870 while (--k >= 0)
872 fsum *= chunk_multiplier;
873 fsum += (DOUBLE) sum[sum_len - k - 1];
876 else
878 /* First loop round. */
880 VOLATILE mp_limb_t chunk =
881 (sum[sum_len - k - 1] << (GMP_LIMB_BITS - shift))
882 | (sum_len >= k + 2 ? sum[sum_len - k - 2] >> shift : 0);
883 fsum = (DOUBLE) chunk;
885 /* General loop. */
886 while (--k >= 0)
888 fsum *= chunk_multiplier;
890 VOLATILE mp_limb_t chunk =
891 (sum[sum_len - k - 1] << (GMP_LIMB_BITS - shift))
892 | (sum[sum_len - k - 2] >> shift);
893 fsum += (DOUBLE) chunk;
898 fsum = LDEXP (fsum, e + (int) sum_bits - GMP_LIMB_BITS);
899 return (sign ? - fsum : fsum);
904 else
905 return z;
907 else
908 return (x * y) + z;