4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
18 static void partsN(return_nan)(FloatPartsN *a, float_status *s)
21 case float_class_snan:
22 float_raise(float_flag_invalid | float_flag_invalid_snan, s);
23 if (s->default_nan_mode) {
24 parts_default_nan(a, s);
26 parts_silence_nan(a, s);
29 case float_class_qnan:
30 if (s->default_nan_mode) {
31 parts_default_nan(a, s);
35 g_assert_not_reached();
39 static FloatPartsN *partsN(pick_nan)(FloatPartsN *a, FloatPartsN *b,
42 if (is_snan(a->cls) || is_snan(b->cls)) {
43 float_raise(float_flag_invalid | float_flag_invalid_snan, s);
46 if (s->default_nan_mode) {
47 parts_default_nan(a, s);
49 int cmp = frac_cmp(a, b);
51 cmp = a->sign < b->sign;
54 if (pickNaN(a->cls, b->cls, cmp > 0, s)) {
57 if (is_snan(a->cls)) {
58 parts_silence_nan(a, s);
64 static FloatPartsN *partsN(pick_nan_muladd)(FloatPartsN *a, FloatPartsN *b,
65 FloatPartsN *c, float_status *s,
66 int ab_mask, int abc_mask)
70 if (unlikely(abc_mask & float_cmask_snan)) {
71 float_raise(float_flag_invalid | float_flag_invalid_snan, s);
74 which = pickNaNMulAdd(a->cls, b->cls, c->cls,
75 ab_mask == float_cmask_infzero, s);
77 if (s->default_nan_mode || which == 3) {
79 * Note that this check is after pickNaNMulAdd so that function
80 * has an opportunity to set the Invalid flag for infzero.
82 parts_default_nan(a, s);
96 g_assert_not_reached();
98 if (is_snan(a->cls)) {
99 parts_silence_nan(a, s);
105 * Canonicalize the FloatParts structure. Determine the class,
106 * unbias the exponent, and normalize the fraction.
108 static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
111 if (unlikely(p->exp == 0)) {
112 if (likely(frac_eqz(p))) {
113 p->cls = float_class_zero;
114 } else if (status->flush_inputs_to_zero) {
115 float_raise(float_flag_input_denormal, status);
116 p->cls = float_class_zero;
119 int shift = frac_normalize(p);
120 p->cls = float_class_normal;
121 p->exp = fmt->frac_shift - fmt->exp_bias
122 - shift + !fmt->m68k_denormal;
124 } else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) {
125 p->cls = float_class_normal;
126 p->exp -= fmt->exp_bias;
127 frac_shl(p, fmt->frac_shift);
128 p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
129 } else if (likely(frac_eqz(p))) {
130 p->cls = float_class_inf;
132 frac_shl(p, fmt->frac_shift);
133 p->cls = (parts_is_snan_frac(p->frac_hi, status)
134 ? float_class_snan : float_class_qnan);
139 * Round and uncanonicalize a floating-point number by parts. There
140 * are FRAC_SHIFT bits that may require rounding at the bottom of the
141 * fraction; these bits will be removed. The exponent will be biased
142 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
144 static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
147 const int exp_max = fmt->exp_max;
148 const int frac_shift = fmt->frac_shift;
149 const uint64_t round_mask = fmt->round_mask;
150 const uint64_t frac_lsb = round_mask + 1;
151 const uint64_t frac_lsbm1 = round_mask ^ (round_mask >> 1);
152 const uint64_t roundeven_mask = round_mask | frac_lsb;
154 bool overflow_norm = false;
157 switch (s->float_rounding_mode) {
158 case float_round_nearest_even:
159 if (N > 64 && frac_lsb == 0) {
160 inc = ((p->frac_hi & 1) || (p->frac_lo & round_mask) != frac_lsbm1
163 inc = ((p->frac_lo & roundeven_mask) != frac_lsbm1
167 case float_round_ties_away:
170 case float_round_to_zero:
171 overflow_norm = true;
175 inc = p->sign ? 0 : round_mask;
176 overflow_norm = p->sign;
178 case float_round_down:
179 inc = p->sign ? round_mask : 0;
180 overflow_norm = !p->sign;
182 case float_round_to_odd:
183 overflow_norm = true;
185 case float_round_to_odd_inf:
186 if (N > 64 && frac_lsb == 0) {
187 inc = p->frac_hi & 1 ? 0 : round_mask;
189 inc = p->frac_lo & frac_lsb ? 0 : round_mask;
193 g_assert_not_reached();
196 exp = p->exp + fmt->exp_bias;
197 if (likely(exp > 0)) {
198 if (p->frac_lo & round_mask) {
199 flags |= float_flag_inexact;
200 if (frac_addi(p, p, inc)) {
202 p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
205 p->frac_lo &= ~round_mask;
208 if (fmt->arm_althp) {
209 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
210 if (unlikely(exp > exp_max)) {
211 /* Overflow. Return the maximum normal. */
212 flags = float_flag_invalid;
215 p->frac_lo &= ~round_mask;
217 } else if (unlikely(exp >= exp_max)) {
218 flags |= float_flag_overflow;
219 if (s->rebias_overflow) {
220 exp -= fmt->exp_re_bias;
221 } else if (overflow_norm) {
222 flags |= float_flag_inexact;
225 p->frac_lo &= ~round_mask;
227 flags |= float_flag_inexact;
228 p->cls = float_class_inf;
233 frac_shr(p, frac_shift);
234 } else if (unlikely(s->rebias_underflow)) {
235 flags |= float_flag_underflow;
236 exp += fmt->exp_re_bias;
237 if (p->frac_lo & round_mask) {
238 flags |= float_flag_inexact;
239 if (frac_addi(p, p, inc)) {
241 p->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
244 p->frac_lo &= ~round_mask;
246 frac_shr(p, frac_shift);
247 } else if (s->flush_to_zero) {
248 flags |= float_flag_output_denormal;
249 p->cls = float_class_zero;
253 bool is_tiny = s->tininess_before_rounding || exp < 0;
257 is_tiny = !frac_addi(&discard, p, inc);
260 frac_shrjam(p, !fmt->m68k_denormal - exp);
262 if (p->frac_lo & round_mask) {
263 /* Need to recompute round-to-even/round-to-odd. */
264 switch (s->float_rounding_mode) {
265 case float_round_nearest_even:
266 if (N > 64 && frac_lsb == 0) {
267 inc = ((p->frac_hi & 1) ||
268 (p->frac_lo & round_mask) != frac_lsbm1
271 inc = ((p->frac_lo & roundeven_mask) != frac_lsbm1
275 case float_round_to_odd:
276 case float_round_to_odd_inf:
277 if (N > 64 && frac_lsb == 0) {
278 inc = p->frac_hi & 1 ? 0 : round_mask;
280 inc = p->frac_lo & frac_lsb ? 0 : round_mask;
286 flags |= float_flag_inexact;
287 frac_addi(p, p, inc);
288 p->frac_lo &= ~round_mask;
291 exp = (p->frac_hi & DECOMPOSED_IMPLICIT_BIT) && !fmt->m68k_denormal;
292 frac_shr(p, frac_shift);
294 if (is_tiny && (flags & float_flag_inexact)) {
295 flags |= float_flag_underflow;
297 if (exp == 0 && frac_eqz(p)) {
298 p->cls = float_class_zero;
302 float_raise(flags, s);
305 static void partsN(uncanon)(FloatPartsN *p, float_status *s,
308 if (likely(p->cls == float_class_normal)) {
309 parts_uncanon_normal(p, s, fmt);
312 case float_class_zero:
316 case float_class_inf:
317 g_assert(!fmt->arm_althp);
318 p->exp = fmt->exp_max;
321 case float_class_qnan:
322 case float_class_snan:
323 g_assert(!fmt->arm_althp);
324 p->exp = fmt->exp_max;
325 frac_shr(p, fmt->frac_shift);
330 g_assert_not_reached();
335 * Returns the result of adding or subtracting the values of the
336 * floating-point values `a' and `b'. The operation is performed
337 * according to the IEC/IEEE Standard for Binary Floating-Point
340 static FloatPartsN *partsN(addsub)(FloatPartsN *a, FloatPartsN *b,
341 float_status *s, bool subtract)
343 bool b_sign = b->sign ^ subtract;
344 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
346 if (a->sign != b_sign) {
348 if (likely(ab_mask == float_cmask_normal)) {
349 if (parts_sub_normal(a, b)) {
352 /* Subtract was exact, fall through to set sign. */
353 ab_mask = float_cmask_zero;
356 if (ab_mask == float_cmask_zero) {
357 a->sign = s->float_rounding_mode == float_round_down;
361 if (unlikely(ab_mask & float_cmask_anynan)) {
365 if (ab_mask & float_cmask_inf) {
366 if (a->cls != float_class_inf) {
370 if (b->cls != float_class_inf) {
375 float_raise(float_flag_invalid | float_flag_invalid_isi, s);
376 parts_default_nan(a, s);
381 if (likely(ab_mask == float_cmask_normal)) {
382 parts_add_normal(a, b);
386 if (ab_mask == float_cmask_zero) {
390 if (unlikely(ab_mask & float_cmask_anynan)) {
394 if (ab_mask & float_cmask_inf) {
395 a->cls = float_class_inf;
400 if (b->cls == float_class_zero) {
401 g_assert(a->cls == float_class_normal);
405 g_assert(a->cls == float_class_zero);
406 g_assert(b->cls == float_class_normal);
412 return parts_pick_nan(a, b, s);
416 * Returns the result of multiplying the floating-point values `a' and
417 * `b'. The operation is performed according to the IEC/IEEE Standard
418 * for Binary Floating-Point Arithmetic.
420 static FloatPartsN *partsN(mul)(FloatPartsN *a, FloatPartsN *b,
423 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
424 bool sign = a->sign ^ b->sign;
426 if (likely(ab_mask == float_cmask_normal)) {
429 frac_mulw(&tmp, a, b);
430 frac_truncjam(a, &tmp);
432 a->exp += b->exp + 1;
433 if (!(a->frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
442 /* Inf * Zero == NaN */
443 if (unlikely(ab_mask == float_cmask_infzero)) {
444 float_raise(float_flag_invalid | float_flag_invalid_imz, s);
445 parts_default_nan(a, s);
449 if (unlikely(ab_mask & float_cmask_anynan)) {
450 return parts_pick_nan(a, b, s);
453 /* Multiply by 0 or Inf */
454 if (ab_mask & float_cmask_inf) {
455 a->cls = float_class_inf;
460 g_assert(ab_mask & float_cmask_zero);
461 a->cls = float_class_zero;
467 * Returns the result of multiplying the floating-point values `a' and
468 * `b' then adding 'c', with no intermediate rounding step after the
469 * multiplication. The operation is performed according to the
470 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
471 * The flags argument allows the caller to select negation of the
472 * addend, the intermediate product, or the final result. (The
473 * difference between this and having the caller do a separate
474 * negation is that negating externally will flip the sign bit on NaNs.)
476 * Requires A and C extracted into a double-sized structure to provide the
477 * extra space for the widening multiply.
479 static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b,
480 FloatPartsN *c, int flags, float_status *s)
482 int ab_mask, abc_mask;
483 FloatPartsW p_widen, c_widen;
485 ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
486 abc_mask = float_cmask(c->cls) | ab_mask;
489 * It is implementation-defined whether the cases of (0,inf,qnan)
490 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
491 * they return if they do), so we have to hand this information
492 * off to the target-specific pick-a-NaN routine.
494 if (unlikely(abc_mask & float_cmask_anynan)) {
495 return parts_pick_nan_muladd(a, b, c, s, ab_mask, abc_mask);
498 if (flags & float_muladd_negate_c) {
502 /* Compute the sign of the product into A. */
504 if (flags & float_muladd_negate_product) {
508 if (unlikely(ab_mask != float_cmask_normal)) {
509 if (unlikely(ab_mask == float_cmask_infzero)) {
510 float_raise(float_flag_invalid | float_flag_invalid_imz, s);
514 if (ab_mask & float_cmask_inf) {
515 if (c->cls == float_class_inf && a->sign != c->sign) {
516 float_raise(float_flag_invalid | float_flag_invalid_isi, s);
522 g_assert(ab_mask & float_cmask_zero);
523 if (c->cls == float_class_normal) {
527 if (c->cls == float_class_zero) {
528 if (a->sign != c->sign) {
529 goto return_sub_zero;
533 g_assert(c->cls == float_class_inf);
536 if (unlikely(c->cls == float_class_inf)) {
541 /* Perform the multiplication step. */
542 p_widen.sign = a->sign;
543 p_widen.exp = a->exp + b->exp + 1;
544 frac_mulw(&p_widen, a, b);
545 if (!(p_widen.frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
546 frac_add(&p_widen, &p_widen, &p_widen);
550 /* Perform the addition step. */
551 if (c->cls != float_class_zero) {
552 /* Zero-extend C to less significant bits. */
553 frac_widen(&c_widen, c);
554 c_widen.exp = c->exp;
556 if (a->sign == c->sign) {
557 parts_add_normal(&p_widen, &c_widen);
558 } else if (!parts_sub_normal(&p_widen, &c_widen)) {
559 goto return_sub_zero;
563 /* Narrow with sticky bit, for proper rounding later. */
564 frac_truncjam(a, &p_widen);
565 a->sign = p_widen.sign;
566 a->exp = p_widen.exp;
569 if (flags & float_muladd_halve_result) {
573 if (flags & float_muladd_negate_result) {
579 a->sign = s->float_rounding_mode == float_round_down;
581 a->cls = float_class_zero;
585 a->cls = float_class_inf;
589 parts_default_nan(a, s);
594 * Returns the result of dividing the floating-point value `a' by the
595 * corresponding value `b'. The operation is performed according to
596 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
598 static FloatPartsN *partsN(div)(FloatPartsN *a, FloatPartsN *b,
601 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
602 bool sign = a->sign ^ b->sign;
604 if (likely(ab_mask == float_cmask_normal)) {
606 a->exp -= b->exp + frac_div(a, b);
610 /* 0/0 or Inf/Inf => NaN */
611 if (unlikely(ab_mask == float_cmask_zero)) {
612 float_raise(float_flag_invalid | float_flag_invalid_zdz, s);
615 if (unlikely(ab_mask == float_cmask_inf)) {
616 float_raise(float_flag_invalid | float_flag_invalid_idi, s);
620 /* All the NaN cases */
621 if (unlikely(ab_mask & float_cmask_anynan)) {
622 return parts_pick_nan(a, b, s);
628 if (a->cls == float_class_inf) {
633 if (a->cls == float_class_zero) {
638 if (b->cls == float_class_inf) {
639 a->cls = float_class_zero;
644 g_assert(b->cls == float_class_zero);
645 float_raise(float_flag_divbyzero, s);
646 a->cls = float_class_inf;
650 parts_default_nan(a, s);
655 * Floating point remainder, per IEC/IEEE, or modulus.
657 static FloatPartsN *partsN(modrem)(FloatPartsN *a, FloatPartsN *b,
658 uint64_t *mod_quot, float_status *s)
660 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
662 if (likely(ab_mask == float_cmask_normal)) {
663 frac_modrem(a, b, mod_quot);
671 /* All the NaN cases */
672 if (unlikely(ab_mask & float_cmask_anynan)) {
673 return parts_pick_nan(a, b, s);
677 if (a->cls == float_class_inf || b->cls == float_class_zero) {
678 float_raise(float_flag_invalid, s);
679 parts_default_nan(a, s);
684 g_assert(b->cls == float_class_inf || a->cls == float_class_zero);
691 * The base algorithm is lifted from
692 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrtf.c
693 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrt.c
694 * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrtl.c
695 * and is thus MIT licenced.
697 static void partsN(sqrt)(FloatPartsN *a, float_status *status,
700 const uint32_t three32 = 3u << 30;
701 const uint64_t three64 = 3ull << 62;
702 uint32_t d32, m32, r32, s32, u32; /* 32-bit computation */
703 uint64_t d64, m64, r64, s64, u64; /* 64-bit computation */
704 uint64_t dh, dl, rh, rl, sh, sl, uh, ul; /* 128-bit computation */
705 uint64_t d0h, d0l, d1h, d1l, d2h, d2l;
710 if (unlikely(a->cls != float_class_normal)) {
712 case float_class_snan:
713 case float_class_qnan:
714 parts_return_nan(a, status);
716 case float_class_zero:
718 case float_class_inf:
719 if (unlikely(a->sign)) {
724 g_assert_not_reached();
728 if (unlikely(a->sign)) {
733 * Argument reduction.
734 * x = 4^e frac; with integer e, and frac in [1, 4)
735 * m = frac fixed point at bit 62, since we're in base 4.
736 * If base-2 exponent is odd, exchange that for multiply by 2,
737 * which results in no shift.
739 exp_odd = a->exp & 1;
740 index = extract64(a->frac_hi, 57, 6) | (!exp_odd << 6);
746 * Approximate r ~= 1/sqrt(m) and s ~= sqrt(m) when m in [1, 4).
749 * 7-bit lookup table (1-bit exponent and 6-bit significand).
751 * The relative error (e = r0*sqrt(m)-1) of a linear estimate
752 * (r0 = a*m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best;
753 * a table lookup is faster and needs one less iteration.
754 * The 7-bit table gives |e| < 0x1.fdp-9.
756 * A Newton-Raphson iteration for r is
762 * Fixed point representations:
763 * m, s, d, u, three are all 2.30; r is 0.32
768 r32 = rsqrt_tab[index] << 16;
769 /* |r*sqrt(m) - 1| < 0x1.FDp-9 */
771 s32 = ((uint64_t)m32 * r32) >> 32;
772 d32 = ((uint64_t)s32 * r32) >> 32;
776 /* float64 or smaller */
778 r32 = ((uint64_t)r32 * u32) >> 31;
779 /* |r*sqrt(m) - 1| < 0x1.7Bp-16 */
781 s32 = ((uint64_t)m32 * r32) >> 32;
782 d32 = ((uint64_t)s32 * r32) >> 32;
785 if (fmt->frac_size <= 23) {
786 /* float32 or smaller */
788 s32 = ((uint64_t)s32 * u32) >> 32; /* 3.29 */
789 s32 = (s32 - 1) >> 6; /* 9.23 */
790 /* s < sqrt(m) < s + 0x1.08p-23 */
792 /* compute nearest rounded result to 2.23 bits */
793 uint32_t d0 = (m32 << 16) - s32 * s32;
794 uint32_t d1 = s32 - d0;
795 uint32_t d2 = d1 + s32 + 1;
797 a->frac_hi = (uint64_t)s32 << (64 - 25);
799 /* increment or decrement for inexact */
801 a->frac_hi += ((int32_t)(d1 ^ d2) < 0 ? -1 : 1);
808 r64 = (uint64_t)r32 * u32 * 2;
809 /* |r*sqrt(m) - 1| < 0x1.37-p29; convert to 64-bit arithmetic */
810 mul64To128(m64, r64, &s64, &discard);
811 mul64To128(s64, r64, &d64, &discard);
814 mul64To128(s64, u64, &s64, &discard); /* 3.61 */
815 s64 = (s64 - 2) >> 9; /* 12.52 */
817 /* Compute nearest rounded result */
818 uint64_t d0 = (m64 << 42) - s64 * s64;
819 uint64_t d1 = s64 - d0;
820 uint64_t d2 = d1 + s64 + 1;
822 a->frac_hi = s64 << (64 - 54);
824 /* increment or decrement for inexact */
826 a->frac_hi += ((int64_t)(d1 ^ d2) < 0 ? -1 : 1);
831 r64 = (uint64_t)r32 * u32 * 2;
832 /* |r*sqrt(m) - 1| < 0x1.7Bp-16; convert to 64-bit arithmetic */
834 mul64To128(m64, r64, &s64, &discard);
835 mul64To128(s64, r64, &d64, &discard);
837 mul64To128(u64, r64, &r64, &discard);
839 /* |r*sqrt(m) - 1| < 0x1.a5p-31 */
841 mul64To128(m64, r64, &s64, &discard);
842 mul64To128(s64, r64, &d64, &discard);
844 mul64To128(u64, r64, &rh, &rl);
845 add128(rh, rl, rh, rl, &rh, &rl);
846 /* |r*sqrt(m) - 1| < 0x1.c001p-59; change to 128-bit arithmetic */
848 mul128To256(a->frac_hi, a->frac_lo, rh, rl, &sh, &sl, &discard, &discard);
849 mul128To256(sh, sl, rh, rl, &dh, &dl, &discard, &discard);
850 sub128(three64, 0, dh, dl, &uh, &ul);
851 mul128To256(uh, ul, sh, sl, &sh, &sl, &discard, &discard); /* 3.125 */
852 /* -0x1p-116 < s - sqrt(m) < 0x3.8001p-125 */
854 sub128(sh, sl, 0, 4, &sh, &sl);
855 shift128Right(sh, sl, 13, &sh, &sl); /* 16.112 */
856 /* s < sqrt(m) < s + 1ulp */
858 /* Compute nearest rounded result */
859 mul64To128(sl, sl, &d0h, &d0l);
861 sub128(a->frac_lo << 34, 0, d0h, d0l, &d0h, &d0l);
862 sub128(sh, sl, d0h, d0l, &d1h, &d1l);
863 add128(sh, sl, 0, 1, &d2h, &d2l);
864 add128(d2h, d2l, d1h, d1l, &d2h, &d2l);
865 add128(sh, sl, 0, d1h >> 63, &sh, &sl);
866 shift128Left(sh, sl, 128 - 114, &sh, &sl);
868 /* increment or decrement for inexact */
870 if ((int64_t)(d1h ^ d2h) < 0) {
871 sub128(sh, sl, 0, 1, &sh, &sl);
873 add128(sh, sl, 0, 1, &sh, &sl);
880 /* Convert back from base 4 to base 2. */
882 if (!(a->frac_hi & DECOMPOSED_IMPLICIT_BIT)) {
890 float_raise(float_flag_invalid | float_flag_invalid_sqrt, status);
891 parts_default_nan(a, status);
895 * Rounds the floating-point value `a' to an integer, and returns the
896 * result as a floating-point value. The operation is performed
897 * according to the IEC/IEEE Standard for Binary Floating-Point
900 * parts_round_to_int_normal is an internal helper function for
901 * normal numbers only, returning true for inexact but not directly
902 * raising float_flag_inexact.
904 static bool partsN(round_to_int_normal)(FloatPartsN *a, FloatRoundMode rmode,
905 int scale, int frac_size)
907 uint64_t frac_lsb, frac_lsbm1, rnd_even_mask, rnd_mask, inc;
910 scale = MIN(MAX(scale, -0x10000), 0x10000);
918 case float_round_nearest_even:
922 /* Shift left one, discarding DECOMPOSED_IMPLICIT_BIT */
923 frac_add(&tmp, a, a);
924 /* Anything remaining means frac > 0.5. */
925 one = !frac_eqz(&tmp);
928 case float_round_ties_away:
931 case float_round_to_zero:
937 case float_round_down:
940 case float_round_to_odd:
944 g_assert_not_reached();
950 a->frac_hi = DECOMPOSED_IMPLICIT_BIT;
952 a->cls = float_class_zero;
957 if (a->exp >= frac_size) {
962 if (N > 64 && a->exp < N - 64) {
964 * Rounding is not in the low word -- shift lsb to bit 2,
965 * which leaves room for sticky and rounding bit.
967 shift_adj = (N - 1) - (a->exp + 2);
968 frac_shrjam(a, shift_adj);
972 frac_lsb = DECOMPOSED_IMPLICIT_BIT >> (a->exp & 63);
975 frac_lsbm1 = frac_lsb >> 1;
976 rnd_mask = frac_lsb - 1;
977 rnd_even_mask = rnd_mask | frac_lsb;
979 if (!(a->frac_lo & rnd_mask)) {
980 /* Fractional bits already clear, undo the shift above. */
981 frac_shl(a, shift_adj);
986 case float_round_nearest_even:
987 inc = ((a->frac_lo & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
989 case float_round_ties_away:
992 case float_round_to_zero:
996 inc = a->sign ? 0 : rnd_mask;
998 case float_round_down:
999 inc = a->sign ? rnd_mask : 0;
1001 case float_round_to_odd:
1002 inc = a->frac_lo & frac_lsb ? 0 : rnd_mask;
1005 g_assert_not_reached();
1008 if (shift_adj == 0) {
1009 if (frac_addi(a, a, inc)) {
1011 a->frac_hi |= DECOMPOSED_IMPLICIT_BIT;
1014 a->frac_lo &= ~rnd_mask;
1016 frac_addi(a, a, inc);
1017 a->frac_lo &= ~rnd_mask;
1018 /* Be careful shifting back, not to overflow */
1019 frac_shl(a, shift_adj - 1);
1020 if (a->frac_hi & DECOMPOSED_IMPLICIT_BIT) {
1029 static void partsN(round_to_int)(FloatPartsN *a, FloatRoundMode rmode,
1030 int scale, float_status *s,
1031 const FloatFmt *fmt)
1034 case float_class_qnan:
1035 case float_class_snan:
1036 parts_return_nan(a, s);
1038 case float_class_zero:
1039 case float_class_inf:
1041 case float_class_normal:
1042 if (parts_round_to_int_normal(a, rmode, scale, fmt->frac_size)) {
1043 float_raise(float_flag_inexact, s);
1047 g_assert_not_reached();
1052 * Returns the result of converting the floating-point value `a' to
1053 * the two's complement integer format. The conversion is performed
1054 * according to the IEC/IEEE Standard for Binary Floating-Point
1055 * Arithmetic---which means in particular that the conversion is
1056 * rounded according to the current rounding mode. If `a' is a NaN,
1057 * the largest positive integer is returned. Otherwise, if the
1058 * conversion overflows, the largest integer with the same sign as `a'
1061 static int64_t partsN(float_to_sint)(FloatPartsN *p, FloatRoundMode rmode,
1062 int scale, int64_t min, int64_t max,
1069 case float_class_snan:
1070 flags |= float_flag_invalid_snan;
1072 case float_class_qnan:
1073 flags |= float_flag_invalid;
1077 case float_class_inf:
1078 flags = float_flag_invalid | float_flag_invalid_cvti;
1079 r = p->sign ? min : max;
1082 case float_class_zero:
1085 case float_class_normal:
1086 /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
1087 if (parts_round_to_int_normal(p, rmode, scale, N - 2)) {
1088 flags = float_flag_inexact;
1091 if (p->exp <= DECOMPOSED_BINARY_POINT) {
1092 r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp);
1097 if (r <= -(uint64_t)min) {
1100 flags = float_flag_invalid | float_flag_invalid_cvti;
1103 } else if (r > max) {
1104 flags = float_flag_invalid | float_flag_invalid_cvti;
1110 g_assert_not_reached();
1113 float_raise(flags, s);
1118 * Returns the result of converting the floating-point value `a' to
1119 * the unsigned integer format. The conversion is performed according
1120 * to the IEC/IEEE Standard for Binary Floating-Point
1121 * Arithmetic---which means in particular that the conversion is
1122 * rounded according to the current rounding mode. If `a' is a NaN,
1123 * the largest unsigned integer is returned. Otherwise, if the
1124 * conversion overflows, the largest unsigned integer is returned. If
1125 * the 'a' is negative, the result is rounded and zero is returned;
1126 * values that do not round to zero will raise the inexact exception
1129 static uint64_t partsN(float_to_uint)(FloatPartsN *p, FloatRoundMode rmode,
1130 int scale, uint64_t max, float_status *s)
1136 case float_class_snan:
1137 flags |= float_flag_invalid_snan;
1139 case float_class_qnan:
1140 flags |= float_flag_invalid;
1144 case float_class_inf:
1145 flags = float_flag_invalid | float_flag_invalid_cvti;
1146 r = p->sign ? 0 : max;
1149 case float_class_zero:
1152 case float_class_normal:
1153 /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
1154 if (parts_round_to_int_normal(p, rmode, scale, N - 2)) {
1155 flags = float_flag_inexact;
1156 if (p->cls == float_class_zero) {
1163 flags = float_flag_invalid | float_flag_invalid_cvti;
1165 } else if (p->exp > DECOMPOSED_BINARY_POINT) {
1166 flags = float_flag_invalid | float_flag_invalid_cvti;
1169 r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp);
1171 flags = float_flag_invalid | float_flag_invalid_cvti;
1178 g_assert_not_reached();
1181 float_raise(flags, s);
1186 * Like partsN(float_to_sint), except do not saturate the result.
1187 * Instead, return the rounded unbounded precision two's compliment result,
1188 * modulo 2**(bitsm1 + 1).
1190 static int64_t partsN(float_to_sint_modulo)(FloatPartsN *p,
1191 FloatRoundMode rmode,
1192 int bitsm1, float_status *s)
1196 bool overflow = false;
1199 case float_class_snan:
1200 flags |= float_flag_invalid_snan;
1202 case float_class_qnan:
1203 flags |= float_flag_invalid;
1207 case float_class_inf:
1212 case float_class_zero:
1215 case float_class_normal:
1216 /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
1217 if (parts_round_to_int_normal(p, rmode, 0, N - 2)) {
1218 flags = float_flag_inexact;
1221 if (p->exp <= DECOMPOSED_BINARY_POINT) {
1223 * Because we rounded to integral, and exp < 64,
1224 * we know frac_low is zero.
1226 r = p->frac_hi >> (DECOMPOSED_BINARY_POINT - p->exp);
1227 if (p->exp < bitsm1) {
1228 /* Result in range. */
1229 } else if (p->exp == bitsm1) {
1230 /* The only in-range value is INT_MIN. */
1231 overflow = !p->sign || p->frac_hi != DECOMPOSED_IMPLICIT_BIT;
1236 /* Overflow, but there might still be bits to return. */
1237 int shl = p->exp - DECOMPOSED_BINARY_POINT;
1253 g_assert_not_reached();
1257 flags = float_flag_invalid | float_flag_invalid_cvti;
1259 float_raise(flags, s);
1264 * Integer to float conversions
1266 * Returns the result of converting the two's complement integer `a'
1267 * to the floating-point format. The conversion is performed according
1268 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1270 static void partsN(sint_to_float)(FloatPartsN *p, int64_t a,
1271 int scale, float_status *s)
1276 memset(p, 0, sizeof(*p));
1279 p->cls = float_class_zero;
1283 p->cls = float_class_normal;
1289 scale = MIN(MAX(scale, -0x10000), 0x10000);
1291 p->exp = DECOMPOSED_BINARY_POINT - shift + scale;
1292 p->frac_hi = f << shift;
1296 * Unsigned Integer to float conversions
1298 * Returns the result of converting the unsigned integer `a' to the
1299 * floating-point format. The conversion is performed according to the
1300 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1302 static void partsN(uint_to_float)(FloatPartsN *p, uint64_t a,
1303 int scale, float_status *status)
1305 memset(p, 0, sizeof(*p));
1308 p->cls = float_class_zero;
1310 int shift = clz64(a);
1311 scale = MIN(MAX(scale, -0x10000), 0x10000);
1312 p->cls = float_class_normal;
1313 p->exp = DECOMPOSED_BINARY_POINT - shift + scale;
1314 p->frac_hi = a << shift;
1321 static FloatPartsN *partsN(minmax)(FloatPartsN *a, FloatPartsN *b,
1322 float_status *s, int flags)
1324 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
1325 int a_exp, b_exp, cmp;
1327 if (unlikely(ab_mask & float_cmask_anynan)) {
1329 * For minNum/maxNum (IEEE 754-2008)
1330 * or minimumNumber/maximumNumber (IEEE 754-2019),
1331 * if one operand is a QNaN, and the other
1332 * operand is numerical, then return numerical argument.
1334 if ((flags & (minmax_isnum | minmax_isnumber))
1335 && !(ab_mask & float_cmask_snan)
1336 && (ab_mask & ~float_cmask_qnan)) {
1337 return is_nan(a->cls) ? b : a;
1341 * In IEEE 754-2019, minNum, maxNum, minNumMag and maxNumMag
1342 * are removed and replaced with minimum, minimumNumber, maximum
1343 * and maximumNumber.
1344 * minimumNumber/maximumNumber behavior for SNaN is changed to:
1345 * If both operands are NaNs, a QNaN is returned.
1346 * If either operand is a SNaN,
1347 * an invalid operation exception is signaled,
1348 * but unless both operands are NaNs,
1349 * the SNaN is otherwise ignored and not converted to a QNaN.
1351 if ((flags & minmax_isnumber)
1352 && (ab_mask & float_cmask_snan)
1353 && (ab_mask & ~float_cmask_anynan)) {
1354 float_raise(float_flag_invalid, s);
1355 return is_nan(a->cls) ? b : a;
1358 return parts_pick_nan(a, b, s);
1364 if (unlikely(ab_mask != float_cmask_normal)) {
1366 case float_class_normal:
1368 case float_class_inf:
1371 case float_class_zero:
1375 g_assert_not_reached();
1379 case float_class_normal:
1381 case float_class_inf:
1384 case float_class_zero:
1388 g_assert_not_reached();
1393 /* Compare magnitudes. */
1394 cmp = a_exp - b_exp;
1396 cmp = frac_cmp(a, b);
1400 * Take the sign into account.
1401 * For ismag, only do this if the magnitudes are equal.
1403 if (!(flags & minmax_ismag) || cmp == 0) {
1404 if (a->sign != b->sign) {
1405 /* For differing signs, the negative operand is less. */
1406 cmp = a->sign ? -1 : 1;
1407 } else if (a->sign) {
1408 /* For two negative operands, invert the magnitude comparison. */
1413 if (flags & minmax_ismin) {
1416 return cmp < 0 ? b : a;
1420 * Floating point compare
1422 static FloatRelation partsN(compare)(FloatPartsN *a, FloatPartsN *b,
1423 float_status *s, bool is_quiet)
1425 int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
1427 if (likely(ab_mask == float_cmask_normal)) {
1430 if (a->sign != b->sign) {
1433 if (a->exp == b->exp) {
1434 cmp = frac_cmp(a, b);
1435 } else if (a->exp < b->exp) {
1436 cmp = float_relation_less;
1438 cmp = float_relation_greater;
1446 if (unlikely(ab_mask & float_cmask_anynan)) {
1447 if (ab_mask & float_cmask_snan) {
1448 float_raise(float_flag_invalid | float_flag_invalid_snan, s);
1449 } else if (!is_quiet) {
1450 float_raise(float_flag_invalid, s);
1452 return float_relation_unordered;
1455 if (ab_mask & float_cmask_zero) {
1456 if (ab_mask == float_cmask_zero) {
1457 return float_relation_equal;
1458 } else if (a->cls == float_class_zero) {
1465 if (ab_mask == float_cmask_inf) {
1466 if (a->sign == b->sign) {
1467 return float_relation_equal;
1469 } else if (b->cls == float_class_inf) {
1472 g_assert(a->cls == float_class_inf);
1476 return a->sign ? float_relation_less : float_relation_greater;
1478 return b->sign ? float_relation_greater : float_relation_less;
1482 * Multiply A by 2 raised to the power N.
1484 static void partsN(scalbn)(FloatPartsN *a, int n, float_status *s)
1487 case float_class_snan:
1488 case float_class_qnan:
1489 parts_return_nan(a, s);
1491 case float_class_zero:
1492 case float_class_inf:
1494 case float_class_normal:
1495 a->exp += MIN(MAX(n, -0x10000), 0x10000);
1498 g_assert_not_reached();
1505 static void partsN(log2)(FloatPartsN *a, float_status *s, const FloatFmt *fmt)
1507 uint64_t a0, a1, r, t, ign;
1509 int i, n, a_exp, f_exp;
1511 if (unlikely(a->cls != float_class_normal)) {
1513 case float_class_snan:
1514 case float_class_qnan:
1515 parts_return_nan(a, s);
1517 case float_class_zero:
1518 float_raise(float_flag_divbyzero, s);
1519 /* log2(0) = -inf */
1520 a->cls = float_class_inf;
1523 case float_class_inf:
1524 if (unlikely(a->sign)) {
1531 g_assert_not_reached();
1533 if (unlikely(a->sign)) {
1537 /* TODO: This algorithm looses bits too quickly for float128. */
1544 t = DECOMPOSED_IMPLICIT_BIT;
1548 n = fmt->frac_size + 2;
1549 if (unlikely(a_exp == -1)) {
1551 * When a_exp == -1, we're computing the log2 of a value [0.5,1.0).
1552 * When the value is very close to 1.0, there are lots of 1's in
1553 * the msb parts of the fraction. At the end, when we subtract
1554 * this value from -1.0, we can see a catastrophic loss of precision,
1555 * as 0x800..000 - 0x7ff..ffx becomes 0x000..00y, leaving only the
1556 * bits of y in the final result. To minimize this, compute as many
1558 * ??? This case needs another algorithm to avoid this.
1560 n = fmt->frac_size * 2 + 2;
1561 /* Don't compute a value overlapping the sticky bit */
1565 for (i = 0; i < n; i++) {
1567 mul128To256(a0, a1, a0, a1, &a0, &a1, &ign, &ign);
1568 } else if (a0 & 0xffffffffull) {
1569 mul64To128(a0, a0, &a0, &a1);
1570 } else if (a0 & ~DECOMPOSED_IMPLICIT_BIT) {
1577 if (a0 & DECOMPOSED_IMPLICIT_BIT) {
1578 if (unlikely(a_exp == 0 && r == 0)) {
1580 * When a_exp == 0, we're computing the log2 of a value
1581 * [1.0,2.0). When the value is very close to 1.0, there
1582 * are lots of 0's in the msb parts of the fraction.
1583 * We need to compute more digits to produce a correct
1584 * result -- restart at the top of the fraction.
1585 * ??? This is likely to lose precision quickly, as for
1586 * float128; we may need another method.
1589 t = r = DECOMPOSED_IMPLICIT_BIT;
1595 add128(a0, a1, a0, a1, &a0, &a1);
1600 /* Set sticky for inexact. */
1601 r |= (a1 || a0 & ~DECOMPOSED_IMPLICIT_BIT);
1604 parts_sint_to_float(a, a_exp, 0, s);
1609 memset(&f, 0, sizeof(f));
1610 f.cls = float_class_normal;
1612 f.exp = f_exp - frac_normalize(&f);
1615 parts_sub_normal(a, &f);
1616 } else if (a_exp > 0) {
1617 parts_add_normal(a, &f);
1624 float_raise(float_flag_invalid, s);
1625 parts_default_nan(a, s);