autoupdate
[gnulib.git] / lib / fma.c
blob4d0a52b5dcaa8e329c89fa6b5bee4c4b2a5d2b7a
1 /* Fused multiply-add.
2 Copyright (C) 2007, 2009, 2011-2018 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>, 2011. */
19 #if ! defined USE_LONG_DOUBLE
20 # include <config.h>
21 #endif
23 /* Specification. */
24 #include <math.h>
26 #include <stdbool.h>
27 #include <stdlib.h>
29 #if HAVE_FEGETROUND
30 # include <fenv.h>
31 #endif
33 #include "float+.h"
34 #include "integer_length.h"
35 #include "verify.h"
37 #ifdef USE_LONG_DOUBLE
38 # define FUNC fmal
39 # define DOUBLE long double
40 # define FREXP frexpl
41 # define LDEXP ldexpl
42 # define MIN_EXP LDBL_MIN_EXP
43 # define MANT_BIT LDBL_MANT_BIT
44 # define L_(literal) literal##L
45 #elif ! defined USE_FLOAT
46 # define FUNC fma
47 # define DOUBLE double
48 # define FREXP frexp
49 # define LDEXP ldexp
50 # define MIN_EXP DBL_MIN_EXP
51 # define MANT_BIT DBL_MANT_BIT
52 # define L_(literal) literal
53 #else /* defined USE_FLOAT */
54 # define FUNC fmaf
55 # define DOUBLE float
56 # define FREXP frexpf
57 # define LDEXP ldexpf
58 # define MIN_EXP FLT_MIN_EXP
59 # define MANT_BIT FLT_MANT_BIT
60 # define L_(literal) literal##f
61 #endif
63 #undef MAX
64 #define MAX(a,b) ((a) > (b) ? (a) : (b))
66 #undef MIN
67 #define MIN(a,b) ((a) < (b) ? (a) : (b))
69 /* MSVC with option -fp:strict refuses to compile constant initializers that
70 contain floating-point operations. Pacify this compiler. */
71 #ifdef _MSC_VER
72 # pragma fenv_access (off)
73 #endif
75 /* Work around GCC 4.2.1 bug on OpenBSD 5.1/SPARC64. */
76 #if defined __GNUC__ && defined __sparc__
77 # define VOLATILE volatile
78 #else
79 # define VOLATILE
80 #endif
82 /* It is possible to write an implementation of fused multiply-add with
83 floating-point operations alone. See
84 Sylvie Boldo, Guillaume Melquiond:
85 Emulation of FMA and correctly-rounded sums: proved algorithms using
86 rounding to odd.
87 <https://www.lri.fr/~melquion/doc/08-tc.pdf>
88 But is it complicated.
89 Here we take the simpler (and probably slower) approach of doing
90 multi-precision arithmetic. */
92 /* We use the naming conventions of GNU gmp, but vastly simpler (and slower)
93 algorithms. */
95 typedef unsigned int mp_limb_t;
96 #define GMP_LIMB_BITS 32
97 verify (sizeof (mp_limb_t) * CHAR_BIT == GMP_LIMB_BITS);
99 typedef unsigned long long mp_twolimb_t;
100 #define GMP_TWOLIMB_BITS 64
101 verify (sizeof (mp_twolimb_t) * CHAR_BIT == GMP_TWOLIMB_BITS);
103 /* Number of limbs needed for a single DOUBLE. */
104 #define NLIMBS1 ((MANT_BIT + GMP_LIMB_BITS - 1) / GMP_LIMB_BITS)
106 /* Number of limbs needed for the accumulator. */
107 #define NLIMBS3 (3 * NLIMBS1 + 1)
109 /* Assuming 0.5 <= x < 1.0:
110 Convert the mantissa (x * 2^DBL_MANT_BIT) to a sequence of limbs. */
111 static void
112 decode (DOUBLE x, mp_limb_t limbs[NLIMBS1])
114 /* I'm not sure whether it's safe to cast a 'double' value between
115 2^31 and 2^32 to 'unsigned int', therefore play safe and cast only
116 'double' values between 0 and 2^31 (to 'unsigned int' or 'int',
117 doesn't matter).
118 So, we split the MANT_BIT bits of x into a number of chunks of
119 at most 31 bits. */
120 enum { chunk_count = (MANT_BIT - 1) / 31 + 1 };
121 /* Variables used for storing the bits limb after limb. */
122 mp_limb_t *p = limbs + NLIMBS1 - 1;
123 mp_limb_t accu = 0;
124 unsigned int bits_needed = MANT_BIT - (NLIMBS1 - 1) * GMP_LIMB_BITS;
125 /* The bits bits_needed-1...0 need to be ORed into the accu.
126 1 <= bits_needed <= GMP_LIMB_BITS. */
127 /* Unroll the first 4 loop rounds. */
128 if (chunk_count > 0)
130 /* Here we still have MANT_BIT-0*31 bits to extract from x. */
131 enum { chunk_bits = MIN (31, MANT_BIT - 0 * 31) }; /* > 0, <= 31 */
132 mp_limb_t d;
133 x *= (mp_limb_t) 1 << chunk_bits;
134 d = (int) x; /* 0 <= d < 2^chunk_bits. */
135 x -= d;
136 if (!(x >= L_(0.0) && x < L_(1.0)))
137 abort ();
138 if (bits_needed < chunk_bits)
140 /* store bits_needed bits */
141 accu |= d >> (chunk_bits - bits_needed);
142 *p = accu;
143 if (p == limbs)
144 goto done;
145 p--;
146 /* hold (chunk_bits - bits_needed) bits */
147 accu = d << (GMP_LIMB_BITS - (chunk_bits - bits_needed));
148 bits_needed = GMP_LIMB_BITS - (chunk_bits - bits_needed);
150 else
152 /* store chunk_bits bits */
153 accu |= d << (bits_needed - chunk_bits);
154 bits_needed -= chunk_bits;
155 if (bits_needed == 0)
157 *p = accu;
158 if (p == limbs)
159 goto done;
160 p--;
161 accu = 0;
162 bits_needed = GMP_LIMB_BITS;
166 if (chunk_count > 1)
168 /* Here we still have MANT_BIT-1*31 bits to extract from x. */
169 enum { chunk_bits = MIN (31, MAX (MANT_BIT - 1 * 31, 0)) }; /* > 0, <= 31 */
170 mp_limb_t d;
171 x *= (mp_limb_t) 1 << chunk_bits;
172 d = (int) x; /* 0 <= d < 2^chunk_bits. */
173 x -= d;
174 if (!(x >= L_(0.0) && x < L_(1.0)))
175 abort ();
176 if (bits_needed < chunk_bits)
178 /* store bits_needed bits */
179 accu |= d >> (chunk_bits - bits_needed);
180 *p = accu;
181 if (p == limbs)
182 goto done;
183 p--;
184 /* hold (chunk_bits - bits_needed) bits */
185 accu = d << (GMP_LIMB_BITS - (chunk_bits - bits_needed));
186 bits_needed = GMP_LIMB_BITS - (chunk_bits - bits_needed);
188 else
190 /* store chunk_bits bits */
191 accu |= d << (bits_needed - chunk_bits);
192 bits_needed -= chunk_bits;
193 if (bits_needed == 0)
195 *p = accu;
196 if (p == limbs)
197 goto done;
198 p--;
199 accu = 0;
200 bits_needed = GMP_LIMB_BITS;
204 if (chunk_count > 2)
206 /* Here we still have MANT_BIT-2*31 bits to extract from x. */
207 enum { chunk_bits = MIN (31, MAX (MANT_BIT - 2 * 31, 0)) }; /* > 0, <= 31 */
208 mp_limb_t d;
209 x *= (mp_limb_t) 1 << chunk_bits;
210 d = (int) x; /* 0 <= d < 2^chunk_bits. */
211 x -= d;
212 if (!(x >= L_(0.0) && x < L_(1.0)))
213 abort ();
214 if (bits_needed < chunk_bits)
216 /* store bits_needed bits */
217 accu |= d >> (chunk_bits - bits_needed);
218 *p = accu;
219 if (p == limbs)
220 goto done;
221 p--;
222 /* hold (chunk_bits - bits_needed) bits */
223 accu = d << (GMP_LIMB_BITS - (chunk_bits - bits_needed));
224 bits_needed = GMP_LIMB_BITS - (chunk_bits - bits_needed);
226 else
228 /* store chunk_bits bits */
229 accu |= d << (bits_needed - chunk_bits);
230 bits_needed -= chunk_bits;
231 if (bits_needed == 0)
233 *p = accu;
234 if (p == limbs)
235 goto done;
236 p--;
237 accu = 0;
238 bits_needed = GMP_LIMB_BITS;
242 if (chunk_count > 3)
244 /* Here we still have MANT_BIT-3*31 bits to extract from x. */
245 enum { chunk_bits = MIN (31, MAX (MANT_BIT - 3 * 31, 0)) }; /* > 0, <= 31 */
246 mp_limb_t d;
247 x *= (mp_limb_t) 1 << chunk_bits;
248 d = (int) x; /* 0 <= d < 2^chunk_bits. */
249 x -= d;
250 if (!(x >= L_(0.0) && x < L_(1.0)))
251 abort ();
252 if (bits_needed < chunk_bits)
254 /* store bits_needed bits */
255 accu |= d >> (chunk_bits - bits_needed);
256 *p = accu;
257 if (p == limbs)
258 goto done;
259 p--;
260 /* hold (chunk_bits - bits_needed) bits */
261 accu = d << (GMP_LIMB_BITS - (chunk_bits - bits_needed));
262 bits_needed = GMP_LIMB_BITS - (chunk_bits - bits_needed);
264 else
266 /* store chunk_bits bits */
267 accu |= d << (bits_needed - chunk_bits);
268 bits_needed -= chunk_bits;
269 if (bits_needed == 0)
271 *p = accu;
272 if (p == limbs)
273 goto done;
274 p--;
275 accu = 0;
276 bits_needed = GMP_LIMB_BITS;
280 if (chunk_count > 4)
282 /* Here we still have MANT_BIT-4*31 bits to extract from x. */
283 /* Generic loop. */
284 size_t k;
285 for (k = 4; k < chunk_count; k++)
287 size_t chunk_bits = MIN (31, MANT_BIT - k * 31); /* > 0, <= 31 */
288 mp_limb_t d;
289 x *= (mp_limb_t) 1 << chunk_bits;
290 d = (int) x; /* 0 <= d < 2^chunk_bits. */
291 x -= d;
292 if (!(x >= L_(0.0) && x < L_(1.0)))
293 abort ();
294 if (bits_needed < chunk_bits)
296 /* store bits_needed bits */
297 accu |= d >> (chunk_bits - bits_needed);
298 *p = accu;
299 if (p == limbs)
300 goto done;
301 p--;
302 /* hold (chunk_bits - bits_needed) bits */
303 accu = d << (GMP_LIMB_BITS - (chunk_bits - bits_needed));
304 bits_needed = GMP_LIMB_BITS - (chunk_bits - bits_needed);
306 else
308 /* store chunk_bits bits */
309 accu |= d << (bits_needed - chunk_bits);
310 bits_needed -= chunk_bits;
311 if (bits_needed == 0)
313 *p = accu;
314 if (p == limbs)
315 goto done;
316 p--;
317 accu = 0;
318 bits_needed = GMP_LIMB_BITS;
323 /* We shouldn't get here. */
324 abort ();
326 done: ;
327 #ifndef USE_LONG_DOUBLE /* On FreeBSD 6.1/x86, 'long double' numbers sometimes
328 have excess precision. */
329 if (!(x == L_(0.0)))
330 abort ();
331 #endif
334 /* Multiply two sequences of limbs. */
335 static void
336 multiply (mp_limb_t xlimbs[NLIMBS1], mp_limb_t ylimbs[NLIMBS1],
337 mp_limb_t prod_limbs[2 * NLIMBS1])
339 size_t k, i, j;
340 enum { len1 = NLIMBS1 };
341 enum { len2 = NLIMBS1 };
343 for (k = len2; k > 0; )
344 prod_limbs[--k] = 0;
345 for (i = 0; i < len1; i++)
347 mp_limb_t digit1 = xlimbs[i];
348 mp_twolimb_t carry = 0;
349 for (j = 0; j < len2; j++)
351 mp_limb_t digit2 = ylimbs[j];
352 carry += (mp_twolimb_t) digit1 * (mp_twolimb_t) digit2;
353 carry += prod_limbs[i + j];
354 prod_limbs[i + j] = (mp_limb_t) carry;
355 carry = carry >> GMP_LIMB_BITS;
357 prod_limbs[i + len2] = (mp_limb_t) carry;
361 DOUBLE
362 FUNC (DOUBLE x, DOUBLE y, DOUBLE z)
364 if (isfinite (x) && isfinite (y))
366 if (isfinite (z))
368 /* x, y, z are all finite. */
369 if (x == L_(0.0) || y == L_(0.0))
370 return z;
371 if (z == L_(0.0))
372 return x * y;
373 /* x, y, z are all non-zero.
374 The result is x * y + z. */
376 int e; /* exponent of x * y + z */
377 int sign;
378 mp_limb_t sum[NLIMBS3];
379 size_t sum_len;
382 int xys; /* sign of x * y */
383 int zs; /* sign of z */
384 int xye; /* sum of exponents of x and y */
385 int ze; /* exponent of z */
386 mp_limb_t summand1[NLIMBS3];
387 size_t summand1_len;
388 mp_limb_t summand2[NLIMBS3];
389 size_t summand2_len;
392 mp_limb_t zlimbs[NLIMBS1];
393 mp_limb_t xylimbs[2 * NLIMBS1];
396 DOUBLE xn; /* normalized part of x */
397 DOUBLE yn; /* normalized part of y */
398 DOUBLE zn; /* normalized part of z */
399 int xe; /* exponent of x */
400 int ye; /* exponent of y */
401 mp_limb_t xlimbs[NLIMBS1];
402 mp_limb_t ylimbs[NLIMBS1];
404 xys = 0;
405 xn = x;
406 if (x < 0)
408 xys = 1;
409 xn = - x;
411 yn = y;
412 if (y < 0)
414 xys = 1 - xys;
415 yn = - y;
418 zs = 0;
419 zn = z;
420 if (z < 0)
422 zs = 1;
423 zn = - z;
426 /* xn, yn, zn are all positive.
427 The result is (-1)^xys * xn * yn + (-1)^zs * zn. */
428 xn = FREXP (xn, &xe);
429 yn = FREXP (yn, &ye);
430 zn = FREXP (zn, &ze);
431 xye = xe + ye;
432 /* xn, yn, zn are all < 1.0 and >= 0.5.
433 The result is
434 (-1)^xys * 2^xye * xn * yn + (-1)^zs * 2^ze * zn. */
435 if (xye < ze - MANT_BIT)
437 /* 2^xye * xn * yn < 2^xye <= 2^(ze-MANT_BIT-1) */
438 return z;
440 if (xye - 2 * MANT_BIT > ze)
442 /* 2^ze * zn < 2^ze <= 2^(xye-2*MANT_BIT-1).
443 We cannot simply do
444 return x * y;
445 because it would round differently: A round-to-even
446 in the multiplication can be a round-up or round-down
447 here, due to z. So replace z with a value that doesn't
448 require the use of long bignums but that rounds the
449 same way. */
450 zn = L_(0.5);
451 ze = xye - 2 * MANT_BIT - 1;
453 /* Convert mantissas of xn, yn, zn to limb sequences:
454 xlimbs = 2^MANT_BIT * xn
455 ylimbs = 2^MANT_BIT * yn
456 zlimbs = 2^MANT_BIT * zn */
457 decode (xn, xlimbs);
458 decode (yn, ylimbs);
459 decode (zn, zlimbs);
460 /* Multiply the mantissas of xn and yn:
461 xylimbs = xlimbs * ylimbs */
462 multiply (xlimbs, ylimbs, xylimbs);
464 /* The result is
465 (-1)^xys * 2^(xye-2*MANT_BIT) * xylimbs
466 + (-1)^zs * 2^(ze-MANT_BIT) * zlimbs.
467 Compute
468 e = min (xye-2*MANT_BIT, ze-MANT_BIT)
469 then
470 summand1 = 2^(xye-2*MANT_BIT-e) * xylimbs
471 summand2 = 2^(ze-MANT_BIT-e) * zlimbs */
472 e = MIN (xye - 2 * MANT_BIT, ze - MANT_BIT);
473 if (e == xye - 2 * MANT_BIT)
475 /* Simply copy the limbs of xylimbs. */
476 size_t i;
477 for (i = 0; i < 2 * NLIMBS1; i++)
478 summand1[i] = xylimbs[i];
479 summand1_len = 2 * NLIMBS1;
481 else
483 size_t ediff = xye - 2 * MANT_BIT - e;
484 /* Left shift the limbs of xylimbs by ediff bits. */
485 size_t ldiff = ediff / GMP_LIMB_BITS;
486 size_t shift = ediff % GMP_LIMB_BITS;
487 size_t i;
488 for (i = 0; i < ldiff; i++)
489 summand1[i] = 0;
490 if (shift > 0)
492 mp_limb_t carry = 0;
493 for (i = 0; i < 2 * NLIMBS1; i++)
495 summand1[ldiff + i] = (xylimbs[i] << shift) | carry;
496 carry = xylimbs[i] >> (GMP_LIMB_BITS - shift);
498 summand1[ldiff + 2 * NLIMBS1] = carry;
499 summand1_len = ldiff + 2 * NLIMBS1 + 1;
501 else
503 for (i = 0; i < 2 * NLIMBS1; i++)
504 summand1[ldiff + i] = xylimbs[i];
505 summand1_len = ldiff + 2 * NLIMBS1;
507 /* Estimation of needed array size:
508 ediff = (xye - 2 * MANT_BIT) - (ze - MANT_BIT) <= MANT_BIT + 1
509 therefore
510 summand1_len
511 = (ediff + GMP_LIMB_BITS - 1) / GMP_LIMB_BITS + 2 * NLIMBS1
512 <= (MANT_BIT + GMP_LIMB_BITS) / GMP_LIMB_BITS + 2 * NLIMBS1
513 <= 3 * NLIMBS1 + 1
514 = NLIMBS3 */
515 if (!(summand1_len <= NLIMBS3))
516 abort ();
518 if (e == ze - MANT_BIT)
520 /* Simply copy the limbs of zlimbs. */
521 size_t i;
522 for (i = 0; i < NLIMBS1; i++)
523 summand2[i] = zlimbs[i];
524 summand2_len = NLIMBS1;
526 else
528 size_t ediff = ze - MANT_BIT - e;
529 /* Left shift the limbs of zlimbs by ediff bits. */
530 size_t ldiff = ediff / GMP_LIMB_BITS;
531 size_t shift = ediff % GMP_LIMB_BITS;
532 size_t i;
533 for (i = 0; i < ldiff; i++)
534 summand2[i] = 0;
535 if (shift > 0)
537 mp_limb_t carry = 0;
538 for (i = 0; i < NLIMBS1; i++)
540 summand2[ldiff + i] = (zlimbs[i] << shift) | carry;
541 carry = zlimbs[i] >> (GMP_LIMB_BITS - shift);
543 summand2[ldiff + NLIMBS1] = carry;
544 summand2_len = ldiff + NLIMBS1 + 1;
546 else
548 for (i = 0; i < NLIMBS1; i++)
549 summand2[ldiff + i] = zlimbs[i];
550 summand2_len = ldiff + NLIMBS1;
552 /* Estimation of needed array size:
553 ediff = (ze - MANT_BIT) - (xye - 2 * MANT_BIT) <= 2 * MANT_BIT
554 therefore
555 summand2_len
556 = (ediff + GMP_LIMB_BITS - 1) / GMP_LIMB_BITS + NLIMBS1
557 <= (2 * MANT_BIT + GMP_LIMB_BITS - 1) / GMP_LIMB_BITS + NLIMBS1
558 <= 3 * NLIMBS1 + 1
559 = NLIMBS3 */
560 if (!(summand2_len <= NLIMBS3))
561 abort ();
564 /* The result is
565 (-1)^xys * 2^e * summand1 + (-1)^zs * 2^e * summand2. */
566 if (xys == zs)
568 /* Perform an addition. */
569 size_t i;
570 mp_limb_t carry;
572 sign = xys;
573 carry = 0;
574 for (i = 0; i < MIN (summand1_len, summand2_len); i++)
576 mp_limb_t digit1 = summand1[i];
577 mp_limb_t digit2 = summand2[i];
578 sum[i] = digit1 + digit2 + carry;
579 carry = (carry
580 ? digit1 >= (mp_limb_t)-1 - digit2
581 : digit1 > (mp_limb_t)-1 - digit2);
583 if (summand1_len > summand2_len)
584 for (; i < summand1_len; i++)
586 mp_limb_t digit1 = summand1[i];
587 sum[i] = carry + digit1;
588 carry = carry && digit1 == (mp_limb_t)-1;
590 else
591 for (; i < summand2_len; i++)
593 mp_limb_t digit2 = summand2[i];
594 sum[i] = carry + digit2;
595 carry = carry && digit2 == (mp_limb_t)-1;
597 if (carry)
598 sum[i++] = carry;
599 sum_len = i;
601 else
603 /* Perform a subtraction. */
604 /* Compare summand1 and summand2 by magnitude. */
605 while (summand1[summand1_len - 1] == 0)
606 summand1_len--;
607 while (summand2[summand2_len - 1] == 0)
608 summand2_len--;
609 if (summand1_len > summand2_len)
610 sign = xys;
611 else if (summand1_len < summand2_len)
612 sign = zs;
613 else
615 size_t i = summand1_len;
616 for (;;)
618 --i;
619 if (summand1[i] > summand2[i])
621 sign = xys;
622 break;
624 if (summand1[i] < summand2[i])
626 sign = zs;
627 break;
629 if (i == 0)
630 /* summand1 and summand2 are equal. */
631 return L_(0.0);
634 if (sign == xys)
636 /* Compute summand1 - summand2. */
637 size_t i;
638 mp_limb_t carry;
640 carry = 0;
641 for (i = 0; i < summand2_len; i++)
643 mp_limb_t digit1 = summand1[i];
644 mp_limb_t digit2 = summand2[i];
645 sum[i] = digit1 - digit2 - carry;
646 carry = (carry ? digit1 <= digit2 : digit1 < digit2);
648 for (; i < summand1_len; i++)
650 mp_limb_t digit1 = summand1[i];
651 sum[i] = digit1 - carry;
652 carry = carry && digit1 == 0;
654 if (carry)
655 abort ();
656 sum_len = summand1_len;
658 else
660 /* Compute summand2 - summand1. */
661 size_t i;
662 mp_limb_t carry;
664 carry = 0;
665 for (i = 0; i < summand1_len; i++)
667 mp_limb_t digit1 = summand1[i];
668 mp_limb_t digit2 = summand2[i];
669 sum[i] = digit2 - digit1 - carry;
670 carry = (carry ? digit2 <= digit1 : digit2 < digit1);
672 for (; i < summand2_len; i++)
674 mp_limb_t digit2 = summand2[i];
675 sum[i] = digit2 - carry;
676 carry = carry && digit2 == 0;
678 if (carry)
679 abort ();
680 sum_len = summand2_len;
684 /* The result is
685 (-1)^sign * 2^e * sum. */
686 /* Now perform the rounding to MANT_BIT mantissa bits. */
687 while (sum[sum_len - 1] == 0)
688 sum_len--;
689 /* Here we know that the most significant limb, sum[sum_len - 1], is
690 non-zero. */
692 /* How many bits the sum has. */
693 unsigned int sum_bits =
694 integer_length (sum[sum_len - 1]) + (sum_len - 1) * GMP_LIMB_BITS;
695 /* How many bits to keep when rounding. */
696 unsigned int keep_bits;
697 /* How many bits to round off. */
698 unsigned int roundoff_bits;
699 if (e + (int) sum_bits >= MIN_EXP)
700 /* 2^e * sum >= 2^(MIN_EXP-1).
701 result will be a normalized number. */
702 keep_bits = MANT_BIT;
703 else if (e + (int) sum_bits >= MIN_EXP - MANT_BIT)
704 /* 2^e * sum >= 2^(MIN_EXP-MANT_BIT-1).
705 result will be a denormalized number or rounded to zero. */
706 keep_bits = e + (int) sum_bits - (MIN_EXP + MANT_BIT);
707 else
708 /* 2^e * sum < 2^(MIN_EXP-MANT_BIT-1). Round to zero. */
709 return L_(0.0);
710 /* Note: 0 <= keep_bits <= MANT_BIT. */
711 if (sum_bits <= keep_bits)
713 /* Nothing to do. */
714 roundoff_bits = 0;
715 keep_bits = sum_bits;
717 else
719 int round_up;
720 roundoff_bits = sum_bits - keep_bits; /* > 0, <= sum_bits */
722 #if HAVE_FEGETROUND && defined FE_TOWARDZERO
723 /* Cf. <http://pubs.opengroup.org/onlinepubs/9699919799/functions/fegetround.html> */
724 int rounding_mode = fegetround ();
725 if (rounding_mode == FE_TOWARDZERO)
726 round_up = 0;
727 else if (rounding_mode == FE_DOWNWARD)
728 round_up = sign;
729 else if (rounding_mode == FE_UPWARD)
730 round_up = !sign;
731 #else
732 /* Cf. <http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/float.h.html> */
733 int rounding_mode = FLT_ROUNDS;
734 if (rounding_mode == 0) /* toward zero */
735 round_up = 0;
736 else if (rounding_mode == 3) /* toward negative infinity */
737 round_up = sign;
738 else if (rounding_mode == 2) /* toward positive infinity */
739 round_up = !sign;
740 #endif
741 else
743 /* Round to nearest. */
744 round_up = 0;
745 /* Test bit (roundoff_bits-1). */
746 if ((sum[(roundoff_bits - 1) / GMP_LIMB_BITS]
747 >> ((roundoff_bits - 1) % GMP_LIMB_BITS)) & 1)
749 /* Test bits roundoff_bits-1 .. 0. */
750 bool halfway =
751 ((sum[(roundoff_bits - 1) / GMP_LIMB_BITS]
752 & (((mp_limb_t) 1 << ((roundoff_bits - 1) % GMP_LIMB_BITS)) - 1))
753 == 0);
754 if (halfway)
756 int i;
757 for (i = (roundoff_bits - 1) / GMP_LIMB_BITS - 1; i >= 0; i--)
758 if (sum[i] != 0)
760 halfway = false;
761 break;
764 if (halfway)
765 /* Round to even. Test bit roundoff_bits. */
766 round_up = ((sum[roundoff_bits / GMP_LIMB_BITS]
767 >> (roundoff_bits % GMP_LIMB_BITS)) & 1);
768 else
769 /* Round up. */
770 round_up = 1;
774 /* Perform the rounding. */
776 size_t i = roundoff_bits / GMP_LIMB_BITS;
778 size_t j = i;
779 while (j > 0)
780 sum[--j] = 0;
782 if (round_up)
784 /* Round up. */
785 sum[i] =
786 (sum[i]
787 | (((mp_limb_t) 1 << (roundoff_bits % GMP_LIMB_BITS)) - 1))
788 + 1;
789 if (sum[i] == 0)
791 /* Propagate carry. */
792 while (i < sum_len - 1)
794 ++i;
795 sum[i] += 1;
796 if (sum[i] != 0)
797 break;
800 /* sum[i] is the most significant limb that was
801 incremented. */
802 if (i == sum_len - 1 && (sum[i] & (sum[i] - 1)) == 0)
804 /* Through the carry, one more bit is needed. */
805 if (sum[i] != 0)
806 sum_bits += 1;
807 else
809 /* Instead of requiring one more limb of memory,
810 perform a shift by one bit, and adjust the
811 exponent. */
812 sum[i] = (mp_limb_t) 1 << (GMP_LIMB_BITS - 1);
813 e += 1;
815 /* The bit sequence has the form 1000...000. */
816 keep_bits = 1;
819 else
821 /* Round down. */
822 sum[i] &= ((mp_limb_t) -1 << (roundoff_bits % GMP_LIMB_BITS));
823 if (i == sum_len - 1 && sum[i] == 0)
824 /* The entire sum has become zero. */
825 return L_(0.0);
829 /* The result is
830 (-1)^sign * 2^e * sum
831 and here we know that
832 2^(sum_bits-1) <= sum < 2^sum_bits,
833 and sum is a multiple of 2^(sum_bits-keep_bits), where
834 0 < keep_bits <= MANT_BIT and keep_bits <= sum_bits.
835 (If keep_bits was initially 0, the rounding either returned zero
836 or produced a bit sequence of the form 1000...000, setting
837 keep_bits to 1.) */
839 /* Split the keep_bits bits into chunks of at most 32 bits. */
840 unsigned int chunk_count = (keep_bits - 1) / GMP_LIMB_BITS + 1;
841 /* 1 <= chunk_count <= ceil (sum_bits / GMP_LIMB_BITS) = sum_len. */
842 static const DOUBLE chunk_multiplier = /* 2^-GMP_LIMB_BITS */
843 L_(1.0) / ((DOUBLE) (1 << (GMP_LIMB_BITS / 2))
844 * (DOUBLE) (1 << ((GMP_LIMB_BITS + 1) / 2)));
845 unsigned int shift = sum_bits % GMP_LIMB_BITS;
846 DOUBLE fsum;
847 if (MANT_BIT <= GMP_LIMB_BITS)
849 /* Since keep_bits <= MANT_BIT <= GMP_LIMB_BITS,
850 chunk_count is 1. No need for a loop. */
851 if (shift == 0)
852 fsum = (DOUBLE) sum[sum_len - 1];
853 else
854 fsum = (DOUBLE)
855 ((sum[sum_len - 1] << (GMP_LIMB_BITS - shift))
856 | (sum_len >= 2 ? sum[sum_len - 2] >> shift : 0));
858 else
860 int k;
861 k = chunk_count - 1;
862 if (shift == 0)
864 /* First loop round. */
865 fsum = (DOUBLE) sum[sum_len - k - 1];
866 /* General loop. */
867 while (--k >= 0)
869 fsum *= chunk_multiplier;
870 fsum += (DOUBLE) sum[sum_len - k - 1];
873 else
875 /* First loop round. */
877 VOLATILE mp_limb_t chunk =
878 (sum[sum_len - k - 1] << (GMP_LIMB_BITS - shift))
879 | (sum_len >= k + 2 ? sum[sum_len - k - 2] >> shift : 0);
880 fsum = (DOUBLE) chunk;
882 /* General loop. */
883 while (--k >= 0)
885 fsum *= chunk_multiplier;
887 VOLATILE mp_limb_t chunk =
888 (sum[sum_len - k - 1] << (GMP_LIMB_BITS - shift))
889 | (sum[sum_len - k - 2] >> shift);
890 fsum += (DOUBLE) chunk;
895 fsum = LDEXP (fsum, e + (int) sum_bits - GMP_LIMB_BITS);
896 return (sign ? - fsum : fsum);
901 else
902 return z;
904 else
905 return (x * y) + z;