malloc-h: New module.
[gnulib.git] / lib / fma.c
blob82d1fe5983b4a0622cec3bbbb25987f3e3ffd8b4
1 /* Fused multiply-add.
2 Copyright (C) 2007, 2009, 2011-2020 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>, 2011. */
19 #if ! defined USE_LONG_DOUBLE
20 # include <config.h>
21 #endif
23 /* Specification. */
24 #include <math.h>
26 #include <limits.h>
27 #include <stdbool.h>
28 #include <stdlib.h>
30 #if HAVE_FEGETROUND
31 # include <fenv.h>
32 #endif
34 #include "float+.h"
35 #include "integer_length.h"
36 #include "verify.h"
38 #ifdef USE_LONG_DOUBLE
39 # define FUNC fmal
40 # define DOUBLE long double
41 # define FREXP frexpl
42 # define LDEXP ldexpl
43 # define MIN_EXP LDBL_MIN_EXP
44 # define MANT_BIT LDBL_MANT_BIT
45 # define L_(literal) literal##L
46 #elif ! defined USE_FLOAT
47 # define FUNC fma
48 # define DOUBLE double
49 # define FREXP frexp
50 # define LDEXP ldexp
51 # define MIN_EXP DBL_MIN_EXP
52 # define MANT_BIT DBL_MANT_BIT
53 # define L_(literal) literal
54 #else /* defined USE_FLOAT */
55 # define FUNC fmaf
56 # define DOUBLE float
57 # define FREXP frexpf
58 # define LDEXP ldexpf
59 # define MIN_EXP FLT_MIN_EXP
60 # define MANT_BIT FLT_MANT_BIT
61 # define L_(literal) literal##f
62 #endif
64 #undef MAX
65 #define MAX(a,b) ((a) > (b) ? (a) : (b))
67 #undef MIN
68 #define MIN(a,b) ((a) < (b) ? (a) : (b))
70 /* MSVC with option -fp:strict refuses to compile constant initializers that
71 contain floating-point operations. Pacify this compiler. */
72 #if defined _MSC_VER && !defined __clang__
73 # pragma fenv_access (off)
74 #endif
76 /* Work around GCC 4.2.1 bug on OpenBSD 5.1/SPARC64. */
77 #if defined __GNUC__ && defined __sparc__
78 # define VOLATILE volatile
79 #else
80 # define VOLATILE
81 #endif
83 /* It is possible to write an implementation of fused multiply-add with
84 floating-point operations alone. See
85 Sylvie Boldo, Guillaume Melquiond:
86 Emulation of FMA and correctly-rounded sums: proved algorithms using
87 rounding to odd.
88 <https://www.lri.fr/~melquion/doc/08-tc.pdf>
89 But is it complicated.
90 Here we take the simpler (and probably slower) approach of doing
91 multi-precision arithmetic. */
93 /* We use the naming conventions of GNU gmp, but vastly simpler (and slower)
94 algorithms. */
96 typedef unsigned int mp_limb_t;
97 #define GMP_LIMB_BITS 32
98 verify (sizeof (mp_limb_t) * CHAR_BIT == GMP_LIMB_BITS);
100 typedef unsigned long long mp_twolimb_t;
101 #define GMP_TWOLIMB_BITS 64
102 verify (sizeof (mp_twolimb_t) * CHAR_BIT == GMP_TWOLIMB_BITS);
104 /* Number of limbs needed for a single DOUBLE. */
105 #define NLIMBS1 ((MANT_BIT + GMP_LIMB_BITS - 1) / GMP_LIMB_BITS)
107 /* Number of limbs needed for the accumulator. */
108 #define NLIMBS3 (3 * NLIMBS1 + 1)
110 /* Assuming 0.5 <= x < 1.0:
111 Convert the mantissa (x * 2^DBL_MANT_BIT) to a sequence of limbs. */
112 static void
113 decode (DOUBLE x, mp_limb_t limbs[NLIMBS1])
115 /* I'm not sure whether it's safe to cast a 'double' value between
116 2^31 and 2^32 to 'unsigned int', therefore play safe and cast only
117 'double' values between 0 and 2^31 (to 'unsigned int' or 'int',
118 doesn't matter).
119 So, we split the MANT_BIT bits of x into a number of chunks of
120 at most 31 bits. */
121 enum { chunk_count = (MANT_BIT - 1) / 31 + 1 };
122 /* Variables used for storing the bits limb after limb. */
123 mp_limb_t *p = limbs + NLIMBS1 - 1;
124 mp_limb_t accu = 0;
125 unsigned int bits_needed = MANT_BIT - (NLIMBS1 - 1) * GMP_LIMB_BITS;
126 /* The bits bits_needed-1...0 need to be ORed into the accu.
127 1 <= bits_needed <= GMP_LIMB_BITS. */
128 /* Unroll the first 4 loop rounds. */
129 if (chunk_count > 0)
131 /* Here we still have MANT_BIT-0*31 bits to extract from x. */
132 enum { chunk_bits = MIN (31, MANT_BIT - 0 * 31) }; /* > 0, <= 31 */
133 mp_limb_t d;
134 x *= (mp_limb_t) 1 << chunk_bits;
135 d = (int) x; /* 0 <= d < 2^chunk_bits. */
136 x -= d;
137 if (!(x >= L_(0.0) && x < L_(1.0)))
138 abort ();
139 if (bits_needed < chunk_bits)
141 /* store bits_needed bits */
142 accu |= d >> (chunk_bits - bits_needed);
143 *p = accu;
144 if (p == limbs)
145 goto done;
146 p--;
147 /* hold (chunk_bits - bits_needed) bits */
148 accu = d << (GMP_LIMB_BITS - (chunk_bits - bits_needed));
149 bits_needed = GMP_LIMB_BITS - (chunk_bits - bits_needed);
151 else
153 /* store chunk_bits bits */
154 accu |= d << (bits_needed - chunk_bits);
155 bits_needed -= chunk_bits;
156 if (bits_needed == 0)
158 *p = accu;
159 if (p == limbs)
160 goto done;
161 p--;
162 accu = 0;
163 bits_needed = GMP_LIMB_BITS;
167 if (chunk_count > 1)
169 /* Here we still have MANT_BIT-1*31 bits to extract from x. */
170 enum { chunk_bits = MIN (31, MAX (MANT_BIT - 1 * 31, 0)) }; /* > 0, <= 31 */
171 mp_limb_t d;
172 x *= (mp_limb_t) 1 << chunk_bits;
173 d = (int) x; /* 0 <= d < 2^chunk_bits. */
174 x -= d;
175 if (!(x >= L_(0.0) && x < L_(1.0)))
176 abort ();
177 if (bits_needed < chunk_bits)
179 /* store bits_needed bits */
180 accu |= d >> (chunk_bits - bits_needed);
181 *p = accu;
182 if (p == limbs)
183 goto done;
184 p--;
185 /* hold (chunk_bits - bits_needed) bits */
186 accu = d << (GMP_LIMB_BITS - (chunk_bits - bits_needed));
187 bits_needed = GMP_LIMB_BITS - (chunk_bits - bits_needed);
189 else
191 /* store chunk_bits bits */
192 accu |= d << (bits_needed - chunk_bits);
193 bits_needed -= chunk_bits;
194 if (bits_needed == 0)
196 *p = accu;
197 if (p == limbs)
198 goto done;
199 p--;
200 accu = 0;
201 bits_needed = GMP_LIMB_BITS;
205 if (chunk_count > 2)
207 /* Here we still have MANT_BIT-2*31 bits to extract from x. */
208 enum { chunk_bits = MIN (31, MAX (MANT_BIT - 2 * 31, 0)) }; /* > 0, <= 31 */
209 mp_limb_t d;
210 x *= (mp_limb_t) 1 << chunk_bits;
211 d = (int) x; /* 0 <= d < 2^chunk_bits. */
212 x -= d;
213 if (!(x >= L_(0.0) && x < L_(1.0)))
214 abort ();
215 if (bits_needed < chunk_bits)
217 /* store bits_needed bits */
218 accu |= d >> (chunk_bits - bits_needed);
219 *p = accu;
220 if (p == limbs)
221 goto done;
222 p--;
223 /* hold (chunk_bits - bits_needed) bits */
224 accu = d << (GMP_LIMB_BITS - (chunk_bits - bits_needed));
225 bits_needed = GMP_LIMB_BITS - (chunk_bits - bits_needed);
227 else
229 /* store chunk_bits bits */
230 accu |= d << (bits_needed - chunk_bits);
231 bits_needed -= chunk_bits;
232 if (bits_needed == 0)
234 *p = accu;
235 if (p == limbs)
236 goto done;
237 p--;
238 accu = 0;
239 bits_needed = GMP_LIMB_BITS;
243 if (chunk_count > 3)
245 /* Here we still have MANT_BIT-3*31 bits to extract from x. */
246 enum { chunk_bits = MIN (31, MAX (MANT_BIT - 3 * 31, 0)) }; /* > 0, <= 31 */
247 mp_limb_t d;
248 x *= (mp_limb_t) 1 << chunk_bits;
249 d = (int) x; /* 0 <= d < 2^chunk_bits. */
250 x -= d;
251 if (!(x >= L_(0.0) && x < L_(1.0)))
252 abort ();
253 if (bits_needed < chunk_bits)
255 /* store bits_needed bits */
256 accu |= d >> (chunk_bits - bits_needed);
257 *p = accu;
258 if (p == limbs)
259 goto done;
260 p--;
261 /* hold (chunk_bits - bits_needed) bits */
262 accu = d << (GMP_LIMB_BITS - (chunk_bits - bits_needed));
263 bits_needed = GMP_LIMB_BITS - (chunk_bits - bits_needed);
265 else
267 /* store chunk_bits bits */
268 accu |= d << (bits_needed - chunk_bits);
269 bits_needed -= chunk_bits;
270 if (bits_needed == 0)
272 *p = accu;
273 if (p == limbs)
274 goto done;
275 p--;
276 accu = 0;
277 bits_needed = GMP_LIMB_BITS;
281 if (chunk_count > 4)
283 /* Here we still have MANT_BIT-4*31 bits to extract from x. */
284 /* Generic loop. */
285 size_t k;
286 for (k = 4; k < chunk_count; k++)
288 size_t chunk_bits = MIN (31, MANT_BIT - k * 31); /* > 0, <= 31 */
289 mp_limb_t d;
290 x *= (mp_limb_t) 1 << chunk_bits;
291 d = (int) x; /* 0 <= d < 2^chunk_bits. */
292 x -= d;
293 if (!(x >= L_(0.0) && x < L_(1.0)))
294 abort ();
295 if (bits_needed < chunk_bits)
297 /* store bits_needed bits */
298 accu |= d >> (chunk_bits - bits_needed);
299 *p = accu;
300 if (p == limbs)
301 goto done;
302 p--;
303 /* hold (chunk_bits - bits_needed) bits */
304 accu = d << (GMP_LIMB_BITS - (chunk_bits - bits_needed));
305 bits_needed = GMP_LIMB_BITS - (chunk_bits - bits_needed);
307 else
309 /* store chunk_bits bits */
310 accu |= d << (bits_needed - chunk_bits);
311 bits_needed -= chunk_bits;
312 if (bits_needed == 0)
314 *p = accu;
315 if (p == limbs)
316 goto done;
317 p--;
318 accu = 0;
319 bits_needed = GMP_LIMB_BITS;
324 /* We shouldn't get here. */
325 abort ();
327 done: ;
328 #ifndef USE_LONG_DOUBLE /* On FreeBSD 6.1/x86, 'long double' numbers sometimes
329 have excess precision. */
330 if (!(x == L_(0.0)))
331 abort ();
332 #endif
335 /* Multiply two sequences of limbs. */
336 static void
337 multiply (mp_limb_t xlimbs[NLIMBS1], mp_limb_t ylimbs[NLIMBS1],
338 mp_limb_t prod_limbs[2 * NLIMBS1])
340 size_t k, i, j;
341 enum { len1 = NLIMBS1 };
342 enum { len2 = NLIMBS1 };
344 for (k = len2; k > 0; )
345 prod_limbs[--k] = 0;
346 for (i = 0; i < len1; i++)
348 mp_limb_t digit1 = xlimbs[i];
349 mp_twolimb_t carry = 0;
350 for (j = 0; j < len2; j++)
352 mp_limb_t digit2 = ylimbs[j];
353 carry += (mp_twolimb_t) digit1 * (mp_twolimb_t) digit2;
354 carry += prod_limbs[i + j];
355 prod_limbs[i + j] = (mp_limb_t) carry;
356 carry = carry >> GMP_LIMB_BITS;
358 prod_limbs[i + len2] = (mp_limb_t) carry;
362 DOUBLE
363 FUNC (DOUBLE x, DOUBLE y, DOUBLE z)
365 if (isfinite (x) && isfinite (y))
367 if (isfinite (z))
369 /* x, y, z are all finite. */
370 if (x == L_(0.0) || y == L_(0.0))
371 return z;
372 if (z == L_(0.0))
373 return x * y;
374 /* x, y, z are all non-zero.
375 The result is x * y + z. */
377 int e; /* exponent of x * y + z */
378 int sign;
379 mp_limb_t sum[NLIMBS3];
380 size_t sum_len;
383 int xys; /* sign of x * y */
384 int zs; /* sign of z */
385 int xye; /* sum of exponents of x and y */
386 int ze; /* exponent of z */
387 mp_limb_t summand1[NLIMBS3];
388 size_t summand1_len;
389 mp_limb_t summand2[NLIMBS3];
390 size_t summand2_len;
393 mp_limb_t zlimbs[NLIMBS1];
394 mp_limb_t xylimbs[2 * NLIMBS1];
397 DOUBLE xn; /* normalized part of x */
398 DOUBLE yn; /* normalized part of y */
399 DOUBLE zn; /* normalized part of z */
400 int xe; /* exponent of x */
401 int ye; /* exponent of y */
402 mp_limb_t xlimbs[NLIMBS1];
403 mp_limb_t ylimbs[NLIMBS1];
405 xys = 0;
406 xn = x;
407 if (x < 0)
409 xys = 1;
410 xn = - x;
412 yn = y;
413 if (y < 0)
415 xys = 1 - xys;
416 yn = - y;
419 zs = 0;
420 zn = z;
421 if (z < 0)
423 zs = 1;
424 zn = - z;
427 /* xn, yn, zn are all positive.
428 The result is (-1)^xys * xn * yn + (-1)^zs * zn. */
429 xn = FREXP (xn, &xe);
430 yn = FREXP (yn, &ye);
431 zn = FREXP (zn, &ze);
432 xye = xe + ye;
433 /* xn, yn, zn are all < 1.0 and >= 0.5.
434 The result is
435 (-1)^xys * 2^xye * xn * yn + (-1)^zs * 2^ze * zn. */
436 if (xye < ze - MANT_BIT)
438 /* 2^xye * xn * yn < 2^xye <= 2^(ze-MANT_BIT-1) */
439 return z;
441 if (xye - 2 * MANT_BIT > ze)
443 /* 2^ze * zn < 2^ze <= 2^(xye-2*MANT_BIT-1).
444 We cannot simply do
445 return x * y;
446 because it would round differently: A round-to-even
447 in the multiplication can be a round-up or round-down
448 here, due to z. So replace z with a value that doesn't
449 require the use of long bignums but that rounds the
450 same way. */
451 zn = L_(0.5);
452 ze = xye - 2 * MANT_BIT - 1;
454 /* Convert mantissas of xn, yn, zn to limb sequences:
455 xlimbs = 2^MANT_BIT * xn
456 ylimbs = 2^MANT_BIT * yn
457 zlimbs = 2^MANT_BIT * zn */
458 decode (xn, xlimbs);
459 decode (yn, ylimbs);
460 decode (zn, zlimbs);
461 /* Multiply the mantissas of xn and yn:
462 xylimbs = xlimbs * ylimbs */
463 multiply (xlimbs, ylimbs, xylimbs);
465 /* The result is
466 (-1)^xys * 2^(xye-2*MANT_BIT) * xylimbs
467 + (-1)^zs * 2^(ze-MANT_BIT) * zlimbs.
468 Compute
469 e = min (xye-2*MANT_BIT, ze-MANT_BIT)
470 then
471 summand1 = 2^(xye-2*MANT_BIT-e) * xylimbs
472 summand2 = 2^(ze-MANT_BIT-e) * zlimbs */
473 e = MIN (xye - 2 * MANT_BIT, ze - MANT_BIT);
474 if (e == xye - 2 * MANT_BIT)
476 /* Simply copy the limbs of xylimbs. */
477 size_t i;
478 for (i = 0; i < 2 * NLIMBS1; i++)
479 summand1[i] = xylimbs[i];
480 summand1_len = 2 * NLIMBS1;
482 else
484 size_t ediff = xye - 2 * MANT_BIT - e;
485 /* Left shift the limbs of xylimbs by ediff bits. */
486 size_t ldiff = ediff / GMP_LIMB_BITS;
487 size_t shift = ediff % GMP_LIMB_BITS;
488 size_t i;
489 for (i = 0; i < ldiff; i++)
490 summand1[i] = 0;
491 if (shift > 0)
493 mp_limb_t carry = 0;
494 for (i = 0; i < 2 * NLIMBS1; i++)
496 summand1[ldiff + i] = (xylimbs[i] << shift) | carry;
497 carry = xylimbs[i] >> (GMP_LIMB_BITS - shift);
499 summand1[ldiff + 2 * NLIMBS1] = carry;
500 summand1_len = ldiff + 2 * NLIMBS1 + 1;
502 else
504 for (i = 0; i < 2 * NLIMBS1; i++)
505 summand1[ldiff + i] = xylimbs[i];
506 summand1_len = ldiff + 2 * NLIMBS1;
508 /* Estimation of needed array size:
509 ediff = (xye - 2 * MANT_BIT) - (ze - MANT_BIT) <= MANT_BIT + 1
510 therefore
511 summand1_len
512 = (ediff + GMP_LIMB_BITS - 1) / GMP_LIMB_BITS + 2 * NLIMBS1
513 <= (MANT_BIT + GMP_LIMB_BITS) / GMP_LIMB_BITS + 2 * NLIMBS1
514 <= 3 * NLIMBS1 + 1
515 = NLIMBS3 */
516 if (!(summand1_len <= NLIMBS3))
517 abort ();
519 if (e == ze - MANT_BIT)
521 /* Simply copy the limbs of zlimbs. */
522 size_t i;
523 for (i = 0; i < NLIMBS1; i++)
524 summand2[i] = zlimbs[i];
525 summand2_len = NLIMBS1;
527 else
529 size_t ediff = ze - MANT_BIT - e;
530 /* Left shift the limbs of zlimbs by ediff bits. */
531 size_t ldiff = ediff / GMP_LIMB_BITS;
532 size_t shift = ediff % GMP_LIMB_BITS;
533 size_t i;
534 for (i = 0; i < ldiff; i++)
535 summand2[i] = 0;
536 if (shift > 0)
538 mp_limb_t carry = 0;
539 for (i = 0; i < NLIMBS1; i++)
541 summand2[ldiff + i] = (zlimbs[i] << shift) | carry;
542 carry = zlimbs[i] >> (GMP_LIMB_BITS - shift);
544 summand2[ldiff + NLIMBS1] = carry;
545 summand2_len = ldiff + NLIMBS1 + 1;
547 else
549 for (i = 0; i < NLIMBS1; i++)
550 summand2[ldiff + i] = zlimbs[i];
551 summand2_len = ldiff + NLIMBS1;
553 /* Estimation of needed array size:
554 ediff = (ze - MANT_BIT) - (xye - 2 * MANT_BIT) <= 2 * MANT_BIT
555 therefore
556 summand2_len
557 = (ediff + GMP_LIMB_BITS - 1) / GMP_LIMB_BITS + NLIMBS1
558 <= (2 * MANT_BIT + GMP_LIMB_BITS - 1) / GMP_LIMB_BITS + NLIMBS1
559 <= 3 * NLIMBS1 + 1
560 = NLIMBS3 */
561 if (!(summand2_len <= NLIMBS3))
562 abort ();
565 /* The result is
566 (-1)^xys * 2^e * summand1 + (-1)^zs * 2^e * summand2. */
567 if (xys == zs)
569 /* Perform an addition. */
570 size_t i;
571 mp_limb_t carry;
573 sign = xys;
574 carry = 0;
575 for (i = 0; i < MIN (summand1_len, summand2_len); i++)
577 mp_limb_t digit1 = summand1[i];
578 mp_limb_t digit2 = summand2[i];
579 sum[i] = digit1 + digit2 + carry;
580 carry = (carry
581 ? digit1 >= (mp_limb_t)-1 - digit2
582 : digit1 > (mp_limb_t)-1 - digit2);
584 if (summand1_len > summand2_len)
585 for (; i < summand1_len; i++)
587 mp_limb_t digit1 = summand1[i];
588 sum[i] = carry + digit1;
589 carry = carry && digit1 == (mp_limb_t)-1;
591 else
592 for (; i < summand2_len; i++)
594 mp_limb_t digit2 = summand2[i];
595 sum[i] = carry + digit2;
596 carry = carry && digit2 == (mp_limb_t)-1;
598 if (carry)
599 sum[i++] = carry;
600 sum_len = i;
602 else
604 /* Perform a subtraction. */
605 /* Compare summand1 and summand2 by magnitude. */
606 while (summand1[summand1_len - 1] == 0)
607 summand1_len--;
608 while (summand2[summand2_len - 1] == 0)
609 summand2_len--;
610 if (summand1_len > summand2_len)
611 sign = xys;
612 else if (summand1_len < summand2_len)
613 sign = zs;
614 else
616 size_t i = summand1_len;
617 for (;;)
619 --i;
620 if (summand1[i] > summand2[i])
622 sign = xys;
623 break;
625 if (summand1[i] < summand2[i])
627 sign = zs;
628 break;
630 if (i == 0)
631 /* summand1 and summand2 are equal. */
632 return L_(0.0);
635 if (sign == xys)
637 /* Compute summand1 - summand2. */
638 size_t i;
639 mp_limb_t carry;
641 carry = 0;
642 for (i = 0; i < summand2_len; i++)
644 mp_limb_t digit1 = summand1[i];
645 mp_limb_t digit2 = summand2[i];
646 sum[i] = digit1 - digit2 - carry;
647 carry = (carry ? digit1 <= digit2 : digit1 < digit2);
649 for (; i < summand1_len; i++)
651 mp_limb_t digit1 = summand1[i];
652 sum[i] = digit1 - carry;
653 carry = carry && digit1 == 0;
655 if (carry)
656 abort ();
657 sum_len = summand1_len;
659 else
661 /* Compute summand2 - summand1. */
662 size_t i;
663 mp_limb_t carry;
665 carry = 0;
666 for (i = 0; i < summand1_len; i++)
668 mp_limb_t digit1 = summand1[i];
669 mp_limb_t digit2 = summand2[i];
670 sum[i] = digit2 - digit1 - carry;
671 carry = (carry ? digit2 <= digit1 : digit2 < digit1);
673 for (; i < summand2_len; i++)
675 mp_limb_t digit2 = summand2[i];
676 sum[i] = digit2 - carry;
677 carry = carry && digit2 == 0;
679 if (carry)
680 abort ();
681 sum_len = summand2_len;
685 /* The result is
686 (-1)^sign * 2^e * sum. */
687 /* Now perform the rounding to MANT_BIT mantissa bits. */
688 while (sum[sum_len - 1] == 0)
689 sum_len--;
690 /* Here we know that the most significant limb, sum[sum_len - 1], is
691 non-zero. */
693 /* How many bits the sum has. */
694 unsigned int sum_bits =
695 integer_length (sum[sum_len - 1]) + (sum_len - 1) * GMP_LIMB_BITS;
696 /* How many bits to keep when rounding. */
697 unsigned int keep_bits;
698 /* How many bits to round off. */
699 unsigned int roundoff_bits;
700 if (e + (int) sum_bits >= MIN_EXP)
701 /* 2^e * sum >= 2^(MIN_EXP-1).
702 result will be a normalized number. */
703 keep_bits = MANT_BIT;
704 else if (e + (int) sum_bits >= MIN_EXP - MANT_BIT)
705 /* 2^e * sum >= 2^(MIN_EXP-MANT_BIT-1).
706 result will be a denormalized number or rounded to zero. */
707 keep_bits = e + (int) sum_bits - (MIN_EXP + MANT_BIT);
708 else
709 /* 2^e * sum < 2^(MIN_EXP-MANT_BIT-1). Round to zero. */
710 return L_(0.0);
711 /* Note: 0 <= keep_bits <= MANT_BIT. */
712 if (sum_bits <= keep_bits)
714 /* Nothing to do. */
715 roundoff_bits = 0;
716 keep_bits = sum_bits;
718 else
720 int round_up;
721 roundoff_bits = sum_bits - keep_bits; /* > 0, <= sum_bits */
723 #if HAVE_FEGETROUND && defined FE_TOWARDZERO
724 /* Cf. <https://pubs.opengroup.org/onlinepubs/9699919799/functions/fegetround.html> */
725 int rounding_mode = fegetround ();
726 if (rounding_mode == FE_TOWARDZERO)
727 round_up = 0;
728 else if (rounding_mode == FE_DOWNWARD)
729 round_up = sign;
730 else if (rounding_mode == FE_UPWARD)
731 round_up = !sign;
732 #else
733 /* Cf. <https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/float.h.html> */
734 int rounding_mode = FLT_ROUNDS;
735 if (rounding_mode == 0) /* toward zero */
736 round_up = 0;
737 else if (rounding_mode == 3) /* toward negative infinity */
738 round_up = sign;
739 else if (rounding_mode == 2) /* toward positive infinity */
740 round_up = !sign;
741 #endif
742 else
744 /* Round to nearest. */
745 round_up = 0;
746 /* Test bit (roundoff_bits-1). */
747 if ((sum[(roundoff_bits - 1) / GMP_LIMB_BITS]
748 >> ((roundoff_bits - 1) % GMP_LIMB_BITS)) & 1)
750 /* Test bits roundoff_bits-1 .. 0. */
751 bool halfway =
752 ((sum[(roundoff_bits - 1) / GMP_LIMB_BITS]
753 & (((mp_limb_t) 1 << ((roundoff_bits - 1) % GMP_LIMB_BITS)) - 1))
754 == 0);
755 if (halfway)
757 int i;
758 for (i = (roundoff_bits - 1) / GMP_LIMB_BITS - 1; i >= 0; i--)
759 if (sum[i] != 0)
761 halfway = false;
762 break;
765 if (halfway)
766 /* Round to even. Test bit roundoff_bits. */
767 round_up = ((sum[roundoff_bits / GMP_LIMB_BITS]
768 >> (roundoff_bits % GMP_LIMB_BITS)) & 1);
769 else
770 /* Round up. */
771 round_up = 1;
775 /* Perform the rounding. */
777 size_t i = roundoff_bits / GMP_LIMB_BITS;
779 size_t j = i;
780 while (j > 0)
781 sum[--j] = 0;
783 if (round_up)
785 /* Round up. */
786 sum[i] =
787 (sum[i]
788 | (((mp_limb_t) 1 << (roundoff_bits % GMP_LIMB_BITS)) - 1))
789 + 1;
790 if (sum[i] == 0)
792 /* Propagate carry. */
793 while (i < sum_len - 1)
795 ++i;
796 sum[i] += 1;
797 if (sum[i] != 0)
798 break;
801 /* sum[i] is the most significant limb that was
802 incremented. */
803 if (i == sum_len - 1 && (sum[i] & (sum[i] - 1)) == 0)
805 /* Through the carry, one more bit is needed. */
806 if (sum[i] != 0)
807 sum_bits += 1;
808 else
810 /* Instead of requiring one more limb of memory,
811 perform a shift by one bit, and adjust the
812 exponent. */
813 sum[i] = (mp_limb_t) 1 << (GMP_LIMB_BITS - 1);
814 e += 1;
816 /* The bit sequence has the form 1000...000. */
817 keep_bits = 1;
820 else
822 /* Round down. */
823 sum[i] &= ((mp_limb_t) -1 << (roundoff_bits % GMP_LIMB_BITS));
824 if (i == sum_len - 1 && sum[i] == 0)
825 /* The entire sum has become zero. */
826 return L_(0.0);
830 /* The result is
831 (-1)^sign * 2^e * sum
832 and here we know that
833 2^(sum_bits-1) <= sum < 2^sum_bits,
834 and sum is a multiple of 2^(sum_bits-keep_bits), where
835 0 < keep_bits <= MANT_BIT and keep_bits <= sum_bits.
836 (If keep_bits was initially 0, the rounding either returned zero
837 or produced a bit sequence of the form 1000...000, setting
838 keep_bits to 1.) */
840 /* Split the keep_bits bits into chunks of at most 32 bits. */
841 unsigned int chunk_count = (keep_bits - 1) / GMP_LIMB_BITS + 1;
842 /* 1 <= chunk_count <= ceil (sum_bits / GMP_LIMB_BITS) = sum_len. */
843 static const DOUBLE chunk_multiplier = /* 2^-GMP_LIMB_BITS */
844 L_(1.0) / ((DOUBLE) (1 << (GMP_LIMB_BITS / 2))
845 * (DOUBLE) (1 << ((GMP_LIMB_BITS + 1) / 2)));
846 unsigned int shift = sum_bits % GMP_LIMB_BITS;
847 DOUBLE fsum;
848 if (MANT_BIT <= GMP_LIMB_BITS)
850 /* Since keep_bits <= MANT_BIT <= GMP_LIMB_BITS,
851 chunk_count is 1. No need for a loop. */
852 if (shift == 0)
853 fsum = (DOUBLE) sum[sum_len - 1];
854 else
855 fsum = (DOUBLE)
856 ((sum[sum_len - 1] << (GMP_LIMB_BITS - shift))
857 | (sum_len >= 2 ? sum[sum_len - 2] >> shift : 0));
859 else
861 int k;
862 k = chunk_count - 1;
863 if (shift == 0)
865 /* First loop round. */
866 fsum = (DOUBLE) sum[sum_len - k - 1];
867 /* General loop. */
868 while (--k >= 0)
870 fsum *= chunk_multiplier;
871 fsum += (DOUBLE) sum[sum_len - k - 1];
874 else
876 /* First loop round. */
878 VOLATILE mp_limb_t chunk =
879 (sum[sum_len - k - 1] << (GMP_LIMB_BITS - shift))
880 | (sum_len >= k + 2 ? sum[sum_len - k - 2] >> shift : 0);
881 fsum = (DOUBLE) chunk;
883 /* General loop. */
884 while (--k >= 0)
886 fsum *= chunk_multiplier;
888 VOLATILE mp_limb_t chunk =
889 (sum[sum_len - k - 1] << (GMP_LIMB_BITS - shift))
890 | (sum[sum_len - k - 2] >> shift);
891 fsum += (DOUBLE) chunk;
896 fsum = LDEXP (fsum, e + (int) sum_bits - GMP_LIMB_BITS);
897 return (sign ? - fsum : fsum);
902 else
903 return z;
905 else
906 return (x * y) + z;