iotests: Skip test for ENOMEM error
[qemu/ar7.git] / fpu / softfloat.c
blobe124df9f7e795ee35641f4201e45fa31152f0573
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a)
114 return float16_val(a) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int extractFloat16Exp(float16 a)
123 return (float16_val(a) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag extractFloat16Sign(float16 a)
132 return float16_val(a)>>15;
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
139 static inline uint32_t extractFloat32Frac(float32 a)
141 return float32_val(a) & 0x007FFFFF;
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
148 static inline int extractFloat32Exp(float32 a)
150 return (float32_val(a) >> 23) & 0xFF;
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
157 static inline flag extractFloat32Sign(float32 a)
159 return float32_val(a) >> 31;
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
166 static inline uint64_t extractFloat64Frac(float64 a)
168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 static inline int extractFloat64Exp(float64 a)
177 return (float64_val(a) >> 52) & 0x7FF;
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
184 static inline flag extractFloat64Sign(float64 a)
186 return float64_val(a) >> 63;
190 * Classify a floating point number. Everything above float_class_qnan
191 * is a NaN so cls >= float_class_qnan is any NaN.
194 typedef enum __attribute__ ((__packed__)) {
195 float_class_unclassified,
196 float_class_zero,
197 float_class_normal,
198 float_class_inf,
199 float_class_qnan, /* all NaNs from here */
200 float_class_snan,
201 float_class_dnan,
202 float_class_msnan, /* maybe silenced */
203 } FloatClass;
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
216 typedef struct {
217 uint64_t frac;
218 int32_t exp;
219 FloatClass cls;
220 bool sign;
221 } FloatParts;
223 #define DECOMPOSED_BINARY_POINT (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
227 /* Structure holding all of the relevant parameters for a format.
228 * exp_size: the size of the exponent field
229 * exp_bias: the offset applied to the exponent field
230 * exp_max: the maximum normalised exponent
231 * frac_size: the size of the fraction field
232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233 * The following are computed based the size of fraction
234 * frac_lsb: least significant bit of fraction
235 * fram_lsbm1: the bit bellow the least significant bit (for rounding)
236 * round_mask/roundeven_mask: masks used for rounding
238 typedef struct {
239 int exp_size;
240 int exp_bias;
241 int exp_max;
242 int frac_size;
243 int frac_shift;
244 uint64_t frac_lsb;
245 uint64_t frac_lsbm1;
246 uint64_t round_mask;
247 uint64_t roundeven_mask;
248 } FloatFmt;
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F) \
252 .exp_size = E, \
253 .exp_bias = ((1 << E) - 1) >> 1, \
254 .exp_max = (1 << E) - 1, \
255 .frac_size = F, \
256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
262 static const FloatFmt float16_params = {
263 FLOAT_PARAMS(5, 10)
266 static const FloatFmt float32_params = {
267 FLOAT_PARAMS(8, 23)
270 static const FloatFmt float64_params = {
271 FLOAT_PARAMS(11, 52)
274 /* Unpack a float to parts, but do not canonicalize. */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
277 const int sign_pos = fmt.frac_size + fmt.exp_size;
279 return (FloatParts) {
280 .cls = float_class_unclassified,
281 .sign = extract64(raw, sign_pos, 1),
282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283 .frac = extract64(raw, 0, fmt.frac_size),
287 static inline FloatParts float16_unpack_raw(float16 f)
289 return unpack_raw(float16_params, f);
292 static inline FloatParts float32_unpack_raw(float32 f)
294 return unpack_raw(float32_params, f);
297 static inline FloatParts float64_unpack_raw(float64 f)
299 return unpack_raw(float64_params, f);
302 /* Pack a float from parts, but do not canonicalize. */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
305 const int sign_pos = fmt.frac_size + fmt.exp_size;
306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307 return deposit64(ret, sign_pos, 1, p.sign);
310 static inline float16 float16_pack_raw(FloatParts p)
312 return make_float16(pack_raw(float16_params, p));
315 static inline float32 float32_pack_raw(FloatParts p)
317 return make_float32(pack_raw(float32_params, p));
320 static inline float64 float64_pack_raw(FloatParts p)
322 return make_float64(pack_raw(float64_params, p));
325 /* Canonicalize EXP and FRAC, setting CLS. */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327 float_status *status)
329 if (part.exp == parm->exp_max) {
330 if (part.frac == 0) {
331 part.cls = float_class_inf;
332 } else {
333 #ifdef NO_SIGNALING_NANS
334 part.cls = float_class_qnan;
335 #else
336 int64_t msb = part.frac << (parm->frac_shift + 2);
337 if ((msb < 0) == status->snan_bit_is_one) {
338 part.cls = float_class_snan;
339 } else {
340 part.cls = float_class_qnan;
342 #endif
344 } else if (part.exp == 0) {
345 if (likely(part.frac == 0)) {
346 part.cls = float_class_zero;
347 } else if (status->flush_inputs_to_zero) {
348 float_raise(float_flag_input_denormal, status);
349 part.cls = float_class_zero;
350 part.frac = 0;
351 } else {
352 int shift = clz64(part.frac) - 1;
353 part.cls = float_class_normal;
354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355 part.frac <<= shift;
357 } else {
358 part.cls = float_class_normal;
359 part.exp -= parm->exp_bias;
360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
362 return part;
365 /* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372 const FloatFmt *parm)
374 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375 const uint64_t round_mask = parm->round_mask;
376 const uint64_t roundeven_mask = parm->roundeven_mask;
377 const int exp_max = parm->exp_max;
378 const int frac_shift = parm->frac_shift;
379 uint64_t frac, inc;
380 int exp, flags = 0;
381 bool overflow_norm;
383 frac = p.frac;
384 exp = p.exp;
386 switch (p.cls) {
387 case float_class_normal:
388 switch (s->float_rounding_mode) {
389 case float_round_nearest_even:
390 overflow_norm = false;
391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392 break;
393 case float_round_ties_away:
394 overflow_norm = false;
395 inc = frac_lsbm1;
396 break;
397 case float_round_to_zero:
398 overflow_norm = true;
399 inc = 0;
400 break;
401 case float_round_up:
402 inc = p.sign ? 0 : round_mask;
403 overflow_norm = p.sign;
404 break;
405 case float_round_down:
406 inc = p.sign ? round_mask : 0;
407 overflow_norm = !p.sign;
408 break;
409 default:
410 g_assert_not_reached();
413 exp += parm->exp_bias;
414 if (likely(exp > 0)) {
415 if (frac & round_mask) {
416 flags |= float_flag_inexact;
417 frac += inc;
418 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419 frac >>= 1;
420 exp++;
423 frac >>= frac_shift;
425 if (unlikely(exp >= exp_max)) {
426 flags |= float_flag_overflow | float_flag_inexact;
427 if (overflow_norm) {
428 exp = exp_max - 1;
429 frac = -1;
430 } else {
431 p.cls = float_class_inf;
432 goto do_inf;
435 } else if (s->flush_to_zero) {
436 flags |= float_flag_output_denormal;
437 p.cls = float_class_zero;
438 goto do_zero;
439 } else {
440 bool is_tiny = (s->float_detect_tininess
441 == float_tininess_before_rounding)
442 || (exp < 0)
443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
445 shift64RightJamming(frac, 1 - exp, &frac);
446 if (frac & round_mask) {
447 /* Need to recompute round-to-even. */
448 if (s->float_rounding_mode == float_round_nearest_even) {
449 inc = ((frac & roundeven_mask) != frac_lsbm1
450 ? frac_lsbm1 : 0);
452 flags |= float_flag_inexact;
453 frac += inc;
456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457 frac >>= frac_shift;
459 if (is_tiny && (flags & float_flag_inexact)) {
460 flags |= float_flag_underflow;
462 if (exp == 0 && frac == 0) {
463 p.cls = float_class_zero;
466 break;
468 case float_class_zero:
469 do_zero:
470 exp = 0;
471 frac = 0;
472 break;
474 case float_class_inf:
475 do_inf:
476 exp = exp_max;
477 frac = 0;
478 break;
480 case float_class_qnan:
481 case float_class_snan:
482 exp = exp_max;
483 break;
485 default:
486 g_assert_not_reached();
489 float_raise(flags, s);
490 p.exp = exp;
491 p.frac = frac;
492 return p;
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
497 return canonicalize(float16_unpack_raw(f), &float16_params, s);
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
502 switch (p.cls) {
503 case float_class_dnan:
504 return float16_default_nan(s);
505 case float_class_msnan:
506 return float16_maybe_silence_nan(float16_pack_raw(p), s);
507 default:
508 p = round_canonical(p, s, &float16_params);
509 return float16_pack_raw(p);
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
515 return canonicalize(float32_unpack_raw(f), &float32_params, s);
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
520 switch (p.cls) {
521 case float_class_dnan:
522 return float32_default_nan(s);
523 case float_class_msnan:
524 return float32_maybe_silence_nan(float32_pack_raw(p), s);
525 default:
526 p = round_canonical(p, s, &float32_params);
527 return float32_pack_raw(p);
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
533 return canonicalize(float64_unpack_raw(f), &float64_params, s);
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
538 switch (p.cls) {
539 case float_class_dnan:
540 return float64_default_nan(s);
541 case float_class_msnan:
542 return float64_maybe_silence_nan(float64_pack_raw(p), s);
543 default:
544 p = round_canonical(p, s, &float64_params);
545 return float64_pack_raw(p);
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c)
552 return unlikely(c >= float_class_qnan);
554 static bool is_snan(FloatClass c)
556 return c == float_class_snan;
558 static bool is_qnan(FloatClass c)
560 return c == float_class_qnan;
563 static FloatParts return_nan(FloatParts a, float_status *s)
565 switch (a.cls) {
566 case float_class_snan:
567 s->float_exception_flags |= float_flag_invalid;
568 a.cls = float_class_msnan;
569 /* fall through */
570 case float_class_qnan:
571 if (s->default_nan_mode) {
572 a.cls = float_class_dnan;
574 break;
576 default:
577 g_assert_not_reached();
579 return a;
582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
584 if (is_snan(a.cls) || is_snan(b.cls)) {
585 s->float_exception_flags |= float_flag_invalid;
588 if (s->default_nan_mode) {
589 a.cls = float_class_dnan;
590 } else {
591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592 is_qnan(b.cls), is_snan(b.cls),
593 a.frac > b.frac ||
594 (a.frac == b.frac && a.sign < b.sign))) {
595 a = b;
597 a.cls = float_class_msnan;
599 return a;
602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603 bool inf_zero, float_status *s)
605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
606 s->float_exception_flags |= float_flag_invalid;
609 if (s->default_nan_mode) {
610 a.cls = float_class_dnan;
611 } else {
612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
613 is_qnan(b.cls), is_snan(b.cls),
614 is_qnan(c.cls), is_snan(c.cls),
615 inf_zero, s)) {
616 case 0:
617 break;
618 case 1:
619 a = b;
620 break;
621 case 2:
622 a = c;
623 break;
624 case 3:
625 a.cls = float_class_dnan;
626 return a;
627 default:
628 g_assert_not_reached();
631 a.cls = float_class_msnan;
633 return a;
637 * Returns the result of adding or subtracting the values of the
638 * floating-point values `a' and `b'. The operation is performed
639 * according to the IEC/IEEE Standard for Binary Floating-Point
640 * Arithmetic.
643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
644 float_status *s)
646 bool a_sign = a.sign;
647 bool b_sign = b.sign ^ subtract;
649 if (a_sign != b_sign) {
650 /* Subtraction */
652 if (a.cls == float_class_normal && b.cls == float_class_normal) {
653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655 a.frac = a.frac - b.frac;
656 } else {
657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658 a.frac = b.frac - a.frac;
659 a.exp = b.exp;
660 a_sign ^= 1;
663 if (a.frac == 0) {
664 a.cls = float_class_zero;
665 a.sign = s->float_rounding_mode == float_round_down;
666 } else {
667 int shift = clz64(a.frac) - 1;
668 a.frac = a.frac << shift;
669 a.exp = a.exp - shift;
670 a.sign = a_sign;
672 return a;
674 if (is_nan(a.cls) || is_nan(b.cls)) {
675 return pick_nan(a, b, s);
677 if (a.cls == float_class_inf) {
678 if (b.cls == float_class_inf) {
679 float_raise(float_flag_invalid, s);
680 a.cls = float_class_dnan;
682 return a;
684 if (a.cls == float_class_zero && b.cls == float_class_zero) {
685 a.sign = s->float_rounding_mode == float_round_down;
686 return a;
688 if (a.cls == float_class_zero || b.cls == float_class_inf) {
689 b.sign = a_sign ^ 1;
690 return b;
692 if (b.cls == float_class_zero) {
693 return a;
695 } else {
696 /* Addition */
697 if (a.cls == float_class_normal && b.cls == float_class_normal) {
698 if (a.exp > b.exp) {
699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700 } else if (a.exp < b.exp) {
701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
702 a.exp = b.exp;
704 a.frac += b.frac;
705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
706 a.frac >>= 1;
707 a.exp += 1;
709 return a;
711 if (is_nan(a.cls) || is_nan(b.cls)) {
712 return pick_nan(a, b, s);
714 if (a.cls == float_class_inf || b.cls == float_class_zero) {
715 return a;
717 if (b.cls == float_class_inf || a.cls == float_class_zero) {
718 b.sign = b_sign;
719 return b;
722 g_assert_not_reached();
726 * Returns the result of adding or subtracting the floating-point
727 * values `a' and `b'. The operation is performed according to the
728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
732 float_status *status)
734 FloatParts pa = float16_unpack_canonical(a, status);
735 FloatParts pb = float16_unpack_canonical(b, status);
736 FloatParts pr = addsub_floats(pa, pb, false, status);
738 return float16_round_pack_canonical(pr, status);
741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742 float_status *status)
744 FloatParts pa = float32_unpack_canonical(a, status);
745 FloatParts pb = float32_unpack_canonical(b, status);
746 FloatParts pr = addsub_floats(pa, pb, false, status);
748 return float32_round_pack_canonical(pr, status);
751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752 float_status *status)
754 FloatParts pa = float64_unpack_canonical(a, status);
755 FloatParts pb = float64_unpack_canonical(b, status);
756 FloatParts pr = addsub_floats(pa, pb, false, status);
758 return float64_round_pack_canonical(pr, status);
761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762 float_status *status)
764 FloatParts pa = float16_unpack_canonical(a, status);
765 FloatParts pb = float16_unpack_canonical(b, status);
766 FloatParts pr = addsub_floats(pa, pb, true, status);
768 return float16_round_pack_canonical(pr, status);
771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772 float_status *status)
774 FloatParts pa = float32_unpack_canonical(a, status);
775 FloatParts pb = float32_unpack_canonical(b, status);
776 FloatParts pr = addsub_floats(pa, pb, true, status);
778 return float32_round_pack_canonical(pr, status);
781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782 float_status *status)
784 FloatParts pa = float64_unpack_canonical(a, status);
785 FloatParts pb = float64_unpack_canonical(b, status);
786 FloatParts pr = addsub_floats(pa, pb, true, status);
788 return float64_round_pack_canonical(pr, status);
792 * Returns the result of multiplying the floating-point values `a' and
793 * `b'. The operation is performed according to the IEC/IEEE Standard
794 * for Binary Floating-Point Arithmetic.
797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
799 bool sign = a.sign ^ b.sign;
801 if (a.cls == float_class_normal && b.cls == float_class_normal) {
802 uint64_t hi, lo;
803 int exp = a.exp + b.exp;
805 mul64To128(a.frac, b.frac, &hi, &lo);
806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807 if (lo & DECOMPOSED_OVERFLOW_BIT) {
808 shift64RightJamming(lo, 1, &lo);
809 exp += 1;
812 /* Re-use a */
813 a.exp = exp;
814 a.sign = sign;
815 a.frac = lo;
816 return a;
818 /* handle all the NaN cases */
819 if (is_nan(a.cls) || is_nan(b.cls)) {
820 return pick_nan(a, b, s);
822 /* Inf * Zero == NaN */
823 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824 (a.cls == float_class_zero && b.cls == float_class_inf)) {
825 s->float_exception_flags |= float_flag_invalid;
826 a.cls = float_class_dnan;
827 a.sign = sign;
828 return a;
830 /* Multiply by 0 or Inf */
831 if (a.cls == float_class_inf || a.cls == float_class_zero) {
832 a.sign = sign;
833 return a;
835 if (b.cls == float_class_inf || b.cls == float_class_zero) {
836 b.sign = sign;
837 return b;
839 g_assert_not_reached();
842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
843 float_status *status)
845 FloatParts pa = float16_unpack_canonical(a, status);
846 FloatParts pb = float16_unpack_canonical(b, status);
847 FloatParts pr = mul_floats(pa, pb, status);
849 return float16_round_pack_canonical(pr, status);
852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
853 float_status *status)
855 FloatParts pa = float32_unpack_canonical(a, status);
856 FloatParts pb = float32_unpack_canonical(b, status);
857 FloatParts pr = mul_floats(pa, pb, status);
859 return float32_round_pack_canonical(pr, status);
862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
863 float_status *status)
865 FloatParts pa = float64_unpack_canonical(a, status);
866 FloatParts pb = float64_unpack_canonical(b, status);
867 FloatParts pr = mul_floats(pa, pb, status);
869 return float64_round_pack_canonical(pr, status);
873 * Returns the result of multiplying the floating-point values `a' and
874 * `b' then adding 'c', with no intermediate rounding step after the
875 * multiplication. The operation is performed according to the
876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877 * The flags argument allows the caller to select negation of the
878 * addend, the intermediate product, or the final result. (The
879 * difference between this and having the caller do a separate
880 * negation is that negating externally will flip the sign bit on
881 * NaNs.)
884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
885 int flags, float_status *s)
887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
888 ((1 << float_class_inf) | (1 << float_class_zero));
889 bool p_sign;
890 bool sign_flip = flags & float_muladd_negate_result;
891 FloatClass p_class;
892 uint64_t hi, lo;
893 int p_exp;
895 /* It is implementation-defined whether the cases of (0,inf,qnan)
896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897 * they return if they do), so we have to hand this information
898 * off to the target-specific pick-a-NaN routine.
900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
901 return pick_nan_muladd(a, b, c, inf_zero, s);
904 if (inf_zero) {
905 s->float_exception_flags |= float_flag_invalid;
906 a.cls = float_class_dnan;
907 return a;
910 if (flags & float_muladd_negate_c) {
911 c.sign ^= 1;
914 p_sign = a.sign ^ b.sign;
916 if (flags & float_muladd_negate_product) {
917 p_sign ^= 1;
920 if (a.cls == float_class_inf || b.cls == float_class_inf) {
921 p_class = float_class_inf;
922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
923 p_class = float_class_zero;
924 } else {
925 p_class = float_class_normal;
928 if (c.cls == float_class_inf) {
929 if (p_class == float_class_inf && p_sign != c.sign) {
930 s->float_exception_flags |= float_flag_invalid;
931 a.cls = float_class_dnan;
932 } else {
933 a.cls = float_class_inf;
934 a.sign = c.sign ^ sign_flip;
936 return a;
939 if (p_class == float_class_inf) {
940 a.cls = float_class_inf;
941 a.sign = p_sign ^ sign_flip;
942 return a;
945 if (p_class == float_class_zero) {
946 if (c.cls == float_class_zero) {
947 if (p_sign != c.sign) {
948 p_sign = s->float_rounding_mode == float_round_down;
950 c.sign = p_sign;
951 } else if (flags & float_muladd_halve_result) {
952 c.exp -= 1;
954 c.sign ^= sign_flip;
955 return c;
958 /* a & b should be normals now... */
959 assert(a.cls == float_class_normal &&
960 b.cls == float_class_normal);
962 p_exp = a.exp + b.exp;
964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
965 * result.
967 mul64To128(a.frac, b.frac, &hi, &lo);
968 /* binary point now at bit 124 */
970 /* check for overflow */
971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
972 shift128RightJamming(hi, lo, 1, &hi, &lo);
973 p_exp += 1;
976 /* + add/sub */
977 if (c.cls == float_class_zero) {
978 /* move binary point back to 62 */
979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
980 } else {
981 int exp_diff = p_exp - c.exp;
982 if (p_sign == c.sign) {
983 /* Addition */
984 if (exp_diff <= 0) {
985 shift128RightJamming(hi, lo,
986 DECOMPOSED_BINARY_POINT - exp_diff,
987 &hi, &lo);
988 lo += c.frac;
989 p_exp = c.exp;
990 } else {
991 uint64_t c_hi, c_lo;
992 /* shift c to the same binary point as the product (124) */
993 c_hi = c.frac >> 2;
994 c_lo = 0;
995 shift128RightJamming(c_hi, c_lo,
996 exp_diff,
997 &c_hi, &c_lo);
998 add128(hi, lo, c_hi, c_lo, &hi, &lo);
999 /* move binary point back to 62 */
1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1003 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004 shift64RightJamming(lo, 1, &lo);
1005 p_exp += 1;
1008 } else {
1009 /* Subtraction */
1010 uint64_t c_hi, c_lo;
1011 /* make C binary point match product at bit 124 */
1012 c_hi = c.frac >> 2;
1013 c_lo = 0;
1015 if (exp_diff <= 0) {
1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017 if (exp_diff == 0
1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021 } else {
1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023 p_sign ^= 1;
1024 p_exp = c.exp;
1026 } else {
1027 shift128RightJamming(c_hi, c_lo,
1028 exp_diff,
1029 &c_hi, &c_lo);
1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1033 if (hi == 0 && lo == 0) {
1034 a.cls = float_class_zero;
1035 a.sign = s->float_rounding_mode == float_round_down;
1036 a.sign ^= sign_flip;
1037 return a;
1038 } else {
1039 int shift;
1040 if (hi != 0) {
1041 shift = clz64(hi);
1042 } else {
1043 shift = clz64(lo) + 64;
1045 /* Normalizing to a binary point of 124 is the
1046 correct adjust for the exponent. However since we're
1047 shifting, we might as well put the binary point back
1048 at 62 where we really want it. Therefore shift as
1049 if we're leaving 1 bit at the top of the word, but
1050 adjust the exponent as if we're leaving 3 bits. */
1051 shift -= 1;
1052 if (shift >= 64) {
1053 lo = lo << (shift - 64);
1054 } else {
1055 hi = (hi << shift) | (lo >> (64 - shift));
1056 lo = hi | ((lo << shift) != 0);
1058 p_exp -= shift - 2;
1063 if (flags & float_muladd_halve_result) {
1064 p_exp -= 1;
1067 /* finally prepare our result */
1068 a.cls = float_class_normal;
1069 a.sign = p_sign ^ sign_flip;
1070 a.exp = p_exp;
1071 a.frac = lo;
1073 return a;
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077 int flags, float_status *status)
1079 FloatParts pa = float16_unpack_canonical(a, status);
1080 FloatParts pb = float16_unpack_canonical(b, status);
1081 FloatParts pc = float16_unpack_canonical(c, status);
1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1084 return float16_round_pack_canonical(pr, status);
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088 int flags, float_status *status)
1090 FloatParts pa = float32_unpack_canonical(a, status);
1091 FloatParts pb = float32_unpack_canonical(b, status);
1092 FloatParts pc = float32_unpack_canonical(c, status);
1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1095 return float32_round_pack_canonical(pr, status);
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099 int flags, float_status *status)
1101 FloatParts pa = float64_unpack_canonical(a, status);
1102 FloatParts pb = float64_unpack_canonical(b, status);
1103 FloatParts pc = float64_unpack_canonical(c, status);
1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1106 return float64_round_pack_canonical(pr, status);
1110 * Returns the result of dividing the floating-point value `a' by the
1111 * corresponding value `b'. The operation is performed according to
1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1117 bool sign = a.sign ^ b.sign;
1119 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120 uint64_t temp_lo, temp_hi;
1121 int exp = a.exp - b.exp;
1122 if (a.frac < b.frac) {
1123 exp -= 1;
1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125 &temp_hi, &temp_lo);
1126 } else {
1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128 &temp_hi, &temp_lo);
1130 /* LSB of quot is set if inexact which roundandpack will use
1131 * to set flags. Yet again we re-use a for the result */
1132 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133 a.sign = sign;
1134 a.exp = exp;
1135 return a;
1137 /* handle all the NaN cases */
1138 if (is_nan(a.cls) || is_nan(b.cls)) {
1139 return pick_nan(a, b, s);
1141 /* 0/0 or Inf/Inf */
1142 if (a.cls == b.cls
1144 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145 s->float_exception_flags |= float_flag_invalid;
1146 a.cls = float_class_dnan;
1147 return a;
1149 /* Div 0 => Inf */
1150 if (b.cls == float_class_zero) {
1151 s->float_exception_flags |= float_flag_divbyzero;
1152 a.cls = float_class_inf;
1153 a.sign = sign;
1154 return a;
1156 /* Inf / x or 0 / x */
1157 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158 a.sign = sign;
1159 return a;
1161 /* Div by Inf */
1162 if (b.cls == float_class_inf) {
1163 a.cls = float_class_zero;
1164 a.sign = sign;
1165 return a;
1167 g_assert_not_reached();
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1172 FloatParts pa = float16_unpack_canonical(a, status);
1173 FloatParts pb = float16_unpack_canonical(b, status);
1174 FloatParts pr = div_floats(pa, pb, status);
1176 return float16_round_pack_canonical(pr, status);
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1181 FloatParts pa = float32_unpack_canonical(a, status);
1182 FloatParts pb = float32_unpack_canonical(b, status);
1183 FloatParts pr = div_floats(pa, pb, status);
1185 return float32_round_pack_canonical(pr, status);
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1190 FloatParts pa = float64_unpack_canonical(a, status);
1191 FloatParts pb = float64_unpack_canonical(b, status);
1192 FloatParts pr = div_floats(pa, pb, status);
1194 return float64_round_pack_canonical(pr, status);
1198 * Rounds the floating-point value `a' to an integer, and returns the
1199 * result as a floating-point value. The operation is performed
1200 * according to the IEC/IEEE Standard for Binary Floating-Point
1201 * Arithmetic.
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1206 if (is_nan(a.cls)) {
1207 return return_nan(a, s);
1210 switch (a.cls) {
1211 case float_class_zero:
1212 case float_class_inf:
1213 case float_class_qnan:
1214 /* already "integral" */
1215 break;
1216 case float_class_normal:
1217 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218 /* already integral */
1219 break;
1221 if (a.exp < 0) {
1222 bool one;
1223 /* all fractional */
1224 s->float_exception_flags |= float_flag_inexact;
1225 switch (rounding_mode) {
1226 case float_round_nearest_even:
1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228 break;
1229 case float_round_ties_away:
1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231 break;
1232 case float_round_to_zero:
1233 one = false;
1234 break;
1235 case float_round_up:
1236 one = !a.sign;
1237 break;
1238 case float_round_down:
1239 one = a.sign;
1240 break;
1241 default:
1242 g_assert_not_reached();
1245 if (one) {
1246 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247 a.exp = 0;
1248 } else {
1249 a.cls = float_class_zero;
1251 } else {
1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253 uint64_t frac_lsbm1 = frac_lsb >> 1;
1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255 uint64_t rnd_mask = rnd_even_mask >> 1;
1256 uint64_t inc;
1258 switch (rounding_mode) {
1259 case float_round_nearest_even:
1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261 break;
1262 case float_round_ties_away:
1263 inc = frac_lsbm1;
1264 break;
1265 case float_round_to_zero:
1266 inc = 0;
1267 break;
1268 case float_round_up:
1269 inc = a.sign ? 0 : rnd_mask;
1270 break;
1271 case float_round_down:
1272 inc = a.sign ? rnd_mask : 0;
1273 break;
1274 default:
1275 g_assert_not_reached();
1278 if (a.frac & rnd_mask) {
1279 s->float_exception_flags |= float_flag_inexact;
1280 a.frac += inc;
1281 a.frac &= ~rnd_mask;
1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283 a.frac >>= 1;
1284 a.exp++;
1288 break;
1289 default:
1290 g_assert_not_reached();
1292 return a;
1295 float16 float16_round_to_int(float16 a, float_status *s)
1297 FloatParts pa = float16_unpack_canonical(a, s);
1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299 return float16_round_pack_canonical(pr, s);
1302 float32 float32_round_to_int(float32 a, float_status *s)
1304 FloatParts pa = float32_unpack_canonical(a, s);
1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306 return float32_round_pack_canonical(pr, s);
1309 float64 float64_round_to_int(float64 a, float_status *s)
1311 FloatParts pa = float64_unpack_canonical(a, s);
1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313 return float64_round_pack_canonical(pr, s);
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1318 FloatParts pa = float64_unpack_canonical(a, s);
1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320 return float64_round_pack_canonical(pr, s);
1324 * Returns the result of converting the floating-point value `a' to
1325 * the two's complement integer format. The conversion is performed
1326 * according to the IEC/IEEE Standard for Binary Floating-Point
1327 * Arithmetic---which means in particular that the conversion is
1328 * rounded according to the current rounding mode. If `a' is a NaN,
1329 * the largest positive integer is returned. Otherwise, if the
1330 * conversion overflows, the largest integer with the same sign as `a'
1331 * is returned.
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335 int64_t min, int64_t max,
1336 float_status *s)
1338 uint64_t r;
1339 int orig_flags = get_float_exception_flags(s);
1340 FloatParts p = round_to_int(in, rmode, s);
1342 switch (p.cls) {
1343 case float_class_snan:
1344 case float_class_qnan:
1345 return max;
1346 case float_class_inf:
1347 return p.sign ? min : max;
1348 case float_class_zero:
1349 return 0;
1350 case float_class_normal:
1351 if (p.exp < DECOMPOSED_BINARY_POINT) {
1352 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1353 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1354 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1355 } else {
1356 r = UINT64_MAX;
1358 if (p.sign) {
1359 if (r < -(uint64_t) min) {
1360 return -r;
1361 } else {
1362 s->float_exception_flags = orig_flags | float_flag_invalid;
1363 return min;
1365 } else {
1366 if (r < max) {
1367 return r;
1368 } else {
1369 s->float_exception_flags = orig_flags | float_flag_invalid;
1370 return max;
1373 default:
1374 g_assert_not_reached();
1378 #define FLOAT_TO_INT(fsz, isz) \
1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1380 float_status *s) \
1382 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1383 return round_to_int_and_pack(p, s->float_rounding_mode, \
1384 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1385 s); \
1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1389 (float ## fsz a, float_status *s) \
1391 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1392 return round_to_int_and_pack(p, float_round_to_zero, \
1393 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1394 s); \
1397 FLOAT_TO_INT(16, 16)
1398 FLOAT_TO_INT(16, 32)
1399 FLOAT_TO_INT(16, 64)
1401 FLOAT_TO_INT(32, 16)
1402 FLOAT_TO_INT(32, 32)
1403 FLOAT_TO_INT(32, 64)
1405 FLOAT_TO_INT(64, 16)
1406 FLOAT_TO_INT(64, 32)
1407 FLOAT_TO_INT(64, 64)
1409 #undef FLOAT_TO_INT
1412 * Returns the result of converting the floating-point value `a' to
1413 * the unsigned integer format. The conversion is performed according
1414 * to the IEC/IEEE Standard for Binary Floating-Point
1415 * Arithmetic---which means in particular that the conversion is
1416 * rounded according to the current rounding mode. If `a' is a NaN,
1417 * the largest unsigned integer is returned. Otherwise, if the
1418 * conversion overflows, the largest unsigned integer is returned. If
1419 * the 'a' is negative, the result is rounded and zero is returned;
1420 * values that do not round to zero will raise the inexact exception
1421 * flag.
1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1425 float_status *s)
1427 int orig_flags = get_float_exception_flags(s);
1428 FloatParts p = round_to_int(in, rmode, s);
1430 switch (p.cls) {
1431 case float_class_snan:
1432 case float_class_qnan:
1433 s->float_exception_flags = orig_flags | float_flag_invalid;
1434 return max;
1435 case float_class_inf:
1436 return p.sign ? 0 : max;
1437 case float_class_zero:
1438 return 0;
1439 case float_class_normal:
1441 uint64_t r;
1442 if (p.sign) {
1443 s->float_exception_flags = orig_flags | float_flag_invalid;
1444 return 0;
1447 if (p.exp < DECOMPOSED_BINARY_POINT) {
1448 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1449 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1450 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1451 } else {
1452 s->float_exception_flags = orig_flags | float_flag_invalid;
1453 return max;
1456 /* For uint64 this will never trip, but if p.exp is too large
1457 * to shift a decomposed fraction we shall have exited via the
1458 * 3rd leg above.
1460 if (r > max) {
1461 s->float_exception_flags = orig_flags | float_flag_invalid;
1462 return max;
1463 } else {
1464 return r;
1467 default:
1468 g_assert_not_reached();
1472 #define FLOAT_TO_UINT(fsz, isz) \
1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1474 float_status *s) \
1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1477 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1478 UINT ## isz ## _MAX, s); \
1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1482 (float ## fsz a, float_status *s) \
1484 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1485 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1486 UINT ## isz ## _MAX, s); \
1489 FLOAT_TO_UINT(16, 16)
1490 FLOAT_TO_UINT(16, 32)
1491 FLOAT_TO_UINT(16, 64)
1493 FLOAT_TO_UINT(32, 16)
1494 FLOAT_TO_UINT(32, 32)
1495 FLOAT_TO_UINT(32, 64)
1497 FLOAT_TO_UINT(64, 16)
1498 FLOAT_TO_UINT(64, 32)
1499 FLOAT_TO_UINT(64, 64)
1501 #undef FLOAT_TO_UINT
1504 * Integer to float conversions
1506 * Returns the result of converting the two's complement integer `a'
1507 * to the floating-point format. The conversion is performed according
1508 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1511 static FloatParts int_to_float(int64_t a, float_status *status)
1513 FloatParts r;
1514 if (a == 0) {
1515 r.cls = float_class_zero;
1516 r.sign = false;
1517 } else if (a == (1ULL << 63)) {
1518 r.cls = float_class_normal;
1519 r.sign = true;
1520 r.frac = DECOMPOSED_IMPLICIT_BIT;
1521 r.exp = 63;
1522 } else {
1523 uint64_t f;
1524 if (a < 0) {
1525 f = -a;
1526 r.sign = true;
1527 } else {
1528 f = a;
1529 r.sign = false;
1531 int shift = clz64(f) - 1;
1532 r.cls = float_class_normal;
1533 r.exp = (DECOMPOSED_BINARY_POINT - shift);
1534 r.frac = f << shift;
1537 return r;
1540 float16 int64_to_float16(int64_t a, float_status *status)
1542 FloatParts pa = int_to_float(a, status);
1543 return float16_round_pack_canonical(pa, status);
1546 float16 int32_to_float16(int32_t a, float_status *status)
1548 return int64_to_float16(a, status);
1551 float16 int16_to_float16(int16_t a, float_status *status)
1553 return int64_to_float16(a, status);
1556 float32 int64_to_float32(int64_t a, float_status *status)
1558 FloatParts pa = int_to_float(a, status);
1559 return float32_round_pack_canonical(pa, status);
1562 float32 int32_to_float32(int32_t a, float_status *status)
1564 return int64_to_float32(a, status);
1567 float32 int16_to_float32(int16_t a, float_status *status)
1569 return int64_to_float32(a, status);
1572 float64 int64_to_float64(int64_t a, float_status *status)
1574 FloatParts pa = int_to_float(a, status);
1575 return float64_round_pack_canonical(pa, status);
1578 float64 int32_to_float64(int32_t a, float_status *status)
1580 return int64_to_float64(a, status);
1583 float64 int16_to_float64(int16_t a, float_status *status)
1585 return int64_to_float64(a, status);
1590 * Unsigned Integer to float conversions
1592 * Returns the result of converting the unsigned integer `a' to the
1593 * floating-point format. The conversion is performed according to the
1594 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1597 static FloatParts uint_to_float(uint64_t a, float_status *status)
1599 FloatParts r = { .sign = false};
1601 if (a == 0) {
1602 r.cls = float_class_zero;
1603 } else {
1604 int spare_bits = clz64(a) - 1;
1605 r.cls = float_class_normal;
1606 r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1607 if (spare_bits < 0) {
1608 shift64RightJamming(a, -spare_bits, &a);
1609 r.frac = a;
1610 } else {
1611 r.frac = a << spare_bits;
1615 return r;
1618 float16 uint64_to_float16(uint64_t a, float_status *status)
1620 FloatParts pa = uint_to_float(a, status);
1621 return float16_round_pack_canonical(pa, status);
1624 float16 uint32_to_float16(uint32_t a, float_status *status)
1626 return uint64_to_float16(a, status);
1629 float16 uint16_to_float16(uint16_t a, float_status *status)
1631 return uint64_to_float16(a, status);
1634 float32 uint64_to_float32(uint64_t a, float_status *status)
1636 FloatParts pa = uint_to_float(a, status);
1637 return float32_round_pack_canonical(pa, status);
1640 float32 uint32_to_float32(uint32_t a, float_status *status)
1642 return uint64_to_float32(a, status);
1645 float32 uint16_to_float32(uint16_t a, float_status *status)
1647 return uint64_to_float32(a, status);
1650 float64 uint64_to_float64(uint64_t a, float_status *status)
1652 FloatParts pa = uint_to_float(a, status);
1653 return float64_round_pack_canonical(pa, status);
1656 float64 uint32_to_float64(uint32_t a, float_status *status)
1658 return uint64_to_float64(a, status);
1661 float64 uint16_to_float64(uint16_t a, float_status *status)
1663 return uint64_to_float64(a, status);
1666 /* Float Min/Max */
1667 /* min() and max() functions. These can't be implemented as
1668 * 'compare and pick one input' because that would mishandle
1669 * NaNs and +0 vs -0.
1671 * minnum() and maxnum() functions. These are similar to the min()
1672 * and max() functions but if one of the arguments is a QNaN and
1673 * the other is numerical then the numerical argument is returned.
1674 * SNaNs will get quietened before being returned.
1675 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1676 * and maxNum() operations. min() and max() are the typical min/max
1677 * semantics provided by many CPUs which predate that specification.
1679 * minnummag() and maxnummag() functions correspond to minNumMag()
1680 * and minNumMag() from the IEEE-754 2008.
1682 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1683 bool ieee, bool ismag, float_status *s)
1685 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1686 if (ieee) {
1687 /* Takes two floating-point values `a' and `b', one of
1688 * which is a NaN, and returns the appropriate NaN
1689 * result. If either `a' or `b' is a signaling NaN,
1690 * the invalid exception is raised.
1692 if (is_snan(a.cls) || is_snan(b.cls)) {
1693 return pick_nan(a, b, s);
1694 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1695 return b;
1696 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1697 return a;
1700 return pick_nan(a, b, s);
1701 } else {
1702 int a_exp, b_exp;
1703 bool a_sign, b_sign;
1705 switch (a.cls) {
1706 case float_class_normal:
1707 a_exp = a.exp;
1708 break;
1709 case float_class_inf:
1710 a_exp = INT_MAX;
1711 break;
1712 case float_class_zero:
1713 a_exp = INT_MIN;
1714 break;
1715 default:
1716 g_assert_not_reached();
1717 break;
1719 switch (b.cls) {
1720 case float_class_normal:
1721 b_exp = b.exp;
1722 break;
1723 case float_class_inf:
1724 b_exp = INT_MAX;
1725 break;
1726 case float_class_zero:
1727 b_exp = INT_MIN;
1728 break;
1729 default:
1730 g_assert_not_reached();
1731 break;
1734 a_sign = a.sign;
1735 b_sign = b.sign;
1736 if (ismag) {
1737 a_sign = b_sign = 0;
1740 if (a_sign == b_sign) {
1741 bool a_less = a_exp < b_exp;
1742 if (a_exp == b_exp) {
1743 a_less = a.frac < b.frac;
1745 return a_sign ^ a_less ^ ismin ? b : a;
1746 } else {
1747 return a_sign ^ ismin ? b : a;
1752 #define MINMAX(sz, name, ismin, isiee, ismag) \
1753 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1754 float_status *s) \
1756 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1757 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1758 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1760 return float ## sz ## _round_pack_canonical(pr, s); \
1763 MINMAX(16, min, true, false, false)
1764 MINMAX(16, minnum, true, true, false)
1765 MINMAX(16, minnummag, true, true, true)
1766 MINMAX(16, max, false, false, false)
1767 MINMAX(16, maxnum, false, true, false)
1768 MINMAX(16, maxnummag, false, true, true)
1770 MINMAX(32, min, true, false, false)
1771 MINMAX(32, minnum, true, true, false)
1772 MINMAX(32, minnummag, true, true, true)
1773 MINMAX(32, max, false, false, false)
1774 MINMAX(32, maxnum, false, true, false)
1775 MINMAX(32, maxnummag, false, true, true)
1777 MINMAX(64, min, true, false, false)
1778 MINMAX(64, minnum, true, true, false)
1779 MINMAX(64, minnummag, true, true, true)
1780 MINMAX(64, max, false, false, false)
1781 MINMAX(64, maxnum, false, true, false)
1782 MINMAX(64, maxnummag, false, true, true)
1784 #undef MINMAX
1786 /* Floating point compare */
1787 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1788 float_status *s)
1790 if (is_nan(a.cls) || is_nan(b.cls)) {
1791 if (!is_quiet ||
1792 a.cls == float_class_snan ||
1793 b.cls == float_class_snan) {
1794 s->float_exception_flags |= float_flag_invalid;
1796 return float_relation_unordered;
1799 if (a.cls == float_class_zero) {
1800 if (b.cls == float_class_zero) {
1801 return float_relation_equal;
1803 return b.sign ? float_relation_greater : float_relation_less;
1804 } else if (b.cls == float_class_zero) {
1805 return a.sign ? float_relation_less : float_relation_greater;
1808 /* The only really important thing about infinity is its sign. If
1809 * both are infinities the sign marks the smallest of the two.
1811 if (a.cls == float_class_inf) {
1812 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1813 return float_relation_equal;
1815 return a.sign ? float_relation_less : float_relation_greater;
1816 } else if (b.cls == float_class_inf) {
1817 return b.sign ? float_relation_greater : float_relation_less;
1820 if (a.sign != b.sign) {
1821 return a.sign ? float_relation_less : float_relation_greater;
1824 if (a.exp == b.exp) {
1825 if (a.frac == b.frac) {
1826 return float_relation_equal;
1828 if (a.sign) {
1829 return a.frac > b.frac ?
1830 float_relation_less : float_relation_greater;
1831 } else {
1832 return a.frac > b.frac ?
1833 float_relation_greater : float_relation_less;
1835 } else {
1836 if (a.sign) {
1837 return a.exp > b.exp ? float_relation_less : float_relation_greater;
1838 } else {
1839 return a.exp > b.exp ? float_relation_greater : float_relation_less;
1844 #define COMPARE(sz) \
1845 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1846 float_status *s) \
1848 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1849 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1850 return compare_floats(pa, pb, false, s); \
1852 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1853 float_status *s) \
1855 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1856 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1857 return compare_floats(pa, pb, true, s); \
1860 COMPARE(16)
1861 COMPARE(32)
1862 COMPARE(64)
1864 #undef COMPARE
1866 /* Multiply A by 2 raised to the power N. */
1867 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1869 if (unlikely(is_nan(a.cls))) {
1870 return return_nan(a, s);
1872 if (a.cls == float_class_normal) {
1873 a.exp += n;
1875 return a;
1878 float16 float16_scalbn(float16 a, int n, float_status *status)
1880 FloatParts pa = float16_unpack_canonical(a, status);
1881 FloatParts pr = scalbn_decomposed(pa, n, status);
1882 return float16_round_pack_canonical(pr, status);
1885 float32 float32_scalbn(float32 a, int n, float_status *status)
1887 FloatParts pa = float32_unpack_canonical(a, status);
1888 FloatParts pr = scalbn_decomposed(pa, n, status);
1889 return float32_round_pack_canonical(pr, status);
1892 float64 float64_scalbn(float64 a, int n, float_status *status)
1894 FloatParts pa = float64_unpack_canonical(a, status);
1895 FloatParts pr = scalbn_decomposed(pa, n, status);
1896 return float64_round_pack_canonical(pr, status);
1900 * Square Root
1902 * The old softfloat code did an approximation step before zeroing in
1903 * on the final result. However for simpleness we just compute the
1904 * square root by iterating down from the implicit bit to enough extra
1905 * bits to ensure we get a correctly rounded result.
1907 * This does mean however the calculation is slower than before,
1908 * especially for 64 bit floats.
1911 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1913 uint64_t a_frac, r_frac, s_frac;
1914 int bit, last_bit;
1916 if (is_nan(a.cls)) {
1917 return return_nan(a, s);
1919 if (a.cls == float_class_zero) {
1920 return a; /* sqrt(+-0) = +-0 */
1922 if (a.sign) {
1923 s->float_exception_flags |= float_flag_invalid;
1924 a.cls = float_class_dnan;
1925 return a;
1927 if (a.cls == float_class_inf) {
1928 return a; /* sqrt(+inf) = +inf */
1931 assert(a.cls == float_class_normal);
1933 /* We need two overflow bits at the top. Adding room for that is a
1934 * right shift. If the exponent is odd, we can discard the low bit
1935 * by multiplying the fraction by 2; that's a left shift. Combine
1936 * those and we shift right if the exponent is even.
1938 a_frac = a.frac;
1939 if (!(a.exp & 1)) {
1940 a_frac >>= 1;
1942 a.exp >>= 1;
1944 /* Bit-by-bit computation of sqrt. */
1945 r_frac = 0;
1946 s_frac = 0;
1948 /* Iterate from implicit bit down to the 3 extra bits to compute a
1949 * properly rounded result. Remember we've inserted one more bit
1950 * at the top, so these positions are one less.
1952 bit = DECOMPOSED_BINARY_POINT - 1;
1953 last_bit = MAX(p->frac_shift - 4, 0);
1954 do {
1955 uint64_t q = 1ULL << bit;
1956 uint64_t t_frac = s_frac + q;
1957 if (t_frac <= a_frac) {
1958 s_frac = t_frac + q;
1959 a_frac -= t_frac;
1960 r_frac += q;
1962 a_frac <<= 1;
1963 } while (--bit >= last_bit);
1965 /* Undo the right shift done above. If there is any remaining
1966 * fraction, the result is inexact. Set the sticky bit.
1968 a.frac = (r_frac << 1) + (a_frac != 0);
1970 return a;
1973 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1975 FloatParts pa = float16_unpack_canonical(a, status);
1976 FloatParts pr = sqrt_float(pa, status, &float16_params);
1977 return float16_round_pack_canonical(pr, status);
1980 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1982 FloatParts pa = float32_unpack_canonical(a, status);
1983 FloatParts pr = sqrt_float(pa, status, &float32_params);
1984 return float32_round_pack_canonical(pr, status);
1987 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
1989 FloatParts pa = float64_unpack_canonical(a, status);
1990 FloatParts pr = sqrt_float(pa, status, &float64_params);
1991 return float64_round_pack_canonical(pr, status);
1995 /*----------------------------------------------------------------------------
1996 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1997 | and 7, and returns the properly rounded 32-bit integer corresponding to the
1998 | input. If `zSign' is 1, the input is negated before being converted to an
1999 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2000 | is simply rounded to an integer, with the inexact exception raised if the
2001 | input cannot be represented exactly as an integer. However, if the fixed-
2002 | point input is too large, the invalid exception is raised and the largest
2003 | positive or negative integer is returned.
2004 *----------------------------------------------------------------------------*/
2006 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2008 int8_t roundingMode;
2009 flag roundNearestEven;
2010 int8_t roundIncrement, roundBits;
2011 int32_t z;
2013 roundingMode = status->float_rounding_mode;
2014 roundNearestEven = ( roundingMode == float_round_nearest_even );
2015 switch (roundingMode) {
2016 case float_round_nearest_even:
2017 case float_round_ties_away:
2018 roundIncrement = 0x40;
2019 break;
2020 case float_round_to_zero:
2021 roundIncrement = 0;
2022 break;
2023 case float_round_up:
2024 roundIncrement = zSign ? 0 : 0x7f;
2025 break;
2026 case float_round_down:
2027 roundIncrement = zSign ? 0x7f : 0;
2028 break;
2029 default:
2030 abort();
2032 roundBits = absZ & 0x7F;
2033 absZ = ( absZ + roundIncrement )>>7;
2034 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2035 z = absZ;
2036 if ( zSign ) z = - z;
2037 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2038 float_raise(float_flag_invalid, status);
2039 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2041 if (roundBits) {
2042 status->float_exception_flags |= float_flag_inexact;
2044 return z;
2048 /*----------------------------------------------------------------------------
2049 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2050 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2051 | and returns the properly rounded 64-bit integer corresponding to the input.
2052 | If `zSign' is 1, the input is negated before being converted to an integer.
2053 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2054 | the inexact exception raised if the input cannot be represented exactly as
2055 | an integer. However, if the fixed-point input is too large, the invalid
2056 | exception is raised and the largest positive or negative integer is
2057 | returned.
2058 *----------------------------------------------------------------------------*/
2060 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2061 float_status *status)
2063 int8_t roundingMode;
2064 flag roundNearestEven, increment;
2065 int64_t z;
2067 roundingMode = status->float_rounding_mode;
2068 roundNearestEven = ( roundingMode == float_round_nearest_even );
2069 switch (roundingMode) {
2070 case float_round_nearest_even:
2071 case float_round_ties_away:
2072 increment = ((int64_t) absZ1 < 0);
2073 break;
2074 case float_round_to_zero:
2075 increment = 0;
2076 break;
2077 case float_round_up:
2078 increment = !zSign && absZ1;
2079 break;
2080 case float_round_down:
2081 increment = zSign && absZ1;
2082 break;
2083 default:
2084 abort();
2086 if ( increment ) {
2087 ++absZ0;
2088 if ( absZ0 == 0 ) goto overflow;
2089 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2091 z = absZ0;
2092 if ( zSign ) z = - z;
2093 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2094 overflow:
2095 float_raise(float_flag_invalid, status);
2096 return
2097 zSign ? (int64_t) LIT64( 0x8000000000000000 )
2098 : LIT64( 0x7FFFFFFFFFFFFFFF );
2100 if (absZ1) {
2101 status->float_exception_flags |= float_flag_inexact;
2103 return z;
2107 /*----------------------------------------------------------------------------
2108 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2109 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2110 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2111 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2112 | with the inexact exception raised if the input cannot be represented exactly
2113 | as an integer. However, if the fixed-point input is too large, the invalid
2114 | exception is raised and the largest unsigned integer is returned.
2115 *----------------------------------------------------------------------------*/
2117 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2118 uint64_t absZ1, float_status *status)
2120 int8_t roundingMode;
2121 flag roundNearestEven, increment;
2123 roundingMode = status->float_rounding_mode;
2124 roundNearestEven = (roundingMode == float_round_nearest_even);
2125 switch (roundingMode) {
2126 case float_round_nearest_even:
2127 case float_round_ties_away:
2128 increment = ((int64_t)absZ1 < 0);
2129 break;
2130 case float_round_to_zero:
2131 increment = 0;
2132 break;
2133 case float_round_up:
2134 increment = !zSign && absZ1;
2135 break;
2136 case float_round_down:
2137 increment = zSign && absZ1;
2138 break;
2139 default:
2140 abort();
2142 if (increment) {
2143 ++absZ0;
2144 if (absZ0 == 0) {
2145 float_raise(float_flag_invalid, status);
2146 return LIT64(0xFFFFFFFFFFFFFFFF);
2148 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2151 if (zSign && absZ0) {
2152 float_raise(float_flag_invalid, status);
2153 return 0;
2156 if (absZ1) {
2157 status->float_exception_flags |= float_flag_inexact;
2159 return absZ0;
2162 /*----------------------------------------------------------------------------
2163 | If `a' is denormal and we are in flush-to-zero mode then set the
2164 | input-denormal exception and return zero. Otherwise just return the value.
2165 *----------------------------------------------------------------------------*/
2166 float32 float32_squash_input_denormal(float32 a, float_status *status)
2168 if (status->flush_inputs_to_zero) {
2169 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2170 float_raise(float_flag_input_denormal, status);
2171 return make_float32(float32_val(a) & 0x80000000);
2174 return a;
2177 /*----------------------------------------------------------------------------
2178 | Normalizes the subnormal single-precision floating-point value represented
2179 | by the denormalized significand `aSig'. The normalized exponent and
2180 | significand are stored at the locations pointed to by `zExpPtr' and
2181 | `zSigPtr', respectively.
2182 *----------------------------------------------------------------------------*/
2184 static void
2185 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2187 int8_t shiftCount;
2189 shiftCount = countLeadingZeros32( aSig ) - 8;
2190 *zSigPtr = aSig<<shiftCount;
2191 *zExpPtr = 1 - shiftCount;
2195 /*----------------------------------------------------------------------------
2196 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2197 | and significand `zSig', and returns the proper single-precision floating-
2198 | point value corresponding to the abstract input. Ordinarily, the abstract
2199 | value is simply rounded and packed into the single-precision format, with
2200 | the inexact exception raised if the abstract input cannot be represented
2201 | exactly. However, if the abstract value is too large, the overflow and
2202 | inexact exceptions are raised and an infinity or maximal finite value is
2203 | returned. If the abstract value is too small, the input value is rounded to
2204 | a subnormal number, and the underflow and inexact exceptions are raised if
2205 | the abstract input cannot be represented exactly as a subnormal single-
2206 | precision floating-point number.
2207 | The input significand `zSig' has its binary point between bits 30
2208 | and 29, which is 7 bits to the left of the usual location. This shifted
2209 | significand must be normalized or smaller. If `zSig' is not normalized,
2210 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2211 | and it must not require rounding. In the usual case that `zSig' is
2212 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2213 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2214 | Binary Floating-Point Arithmetic.
2215 *----------------------------------------------------------------------------*/
2217 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2218 float_status *status)
2220 int8_t roundingMode;
2221 flag roundNearestEven;
2222 int8_t roundIncrement, roundBits;
2223 flag isTiny;
2225 roundingMode = status->float_rounding_mode;
2226 roundNearestEven = ( roundingMode == float_round_nearest_even );
2227 switch (roundingMode) {
2228 case float_round_nearest_even:
2229 case float_round_ties_away:
2230 roundIncrement = 0x40;
2231 break;
2232 case float_round_to_zero:
2233 roundIncrement = 0;
2234 break;
2235 case float_round_up:
2236 roundIncrement = zSign ? 0 : 0x7f;
2237 break;
2238 case float_round_down:
2239 roundIncrement = zSign ? 0x7f : 0;
2240 break;
2241 default:
2242 abort();
2243 break;
2245 roundBits = zSig & 0x7F;
2246 if ( 0xFD <= (uint16_t) zExp ) {
2247 if ( ( 0xFD < zExp )
2248 || ( ( zExp == 0xFD )
2249 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2251 float_raise(float_flag_overflow | float_flag_inexact, status);
2252 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2254 if ( zExp < 0 ) {
2255 if (status->flush_to_zero) {
2256 float_raise(float_flag_output_denormal, status);
2257 return packFloat32(zSign, 0, 0);
2259 isTiny =
2260 (status->float_detect_tininess
2261 == float_tininess_before_rounding)
2262 || ( zExp < -1 )
2263 || ( zSig + roundIncrement < 0x80000000 );
2264 shift32RightJamming( zSig, - zExp, &zSig );
2265 zExp = 0;
2266 roundBits = zSig & 0x7F;
2267 if (isTiny && roundBits) {
2268 float_raise(float_flag_underflow, status);
2272 if (roundBits) {
2273 status->float_exception_flags |= float_flag_inexact;
2275 zSig = ( zSig + roundIncrement )>>7;
2276 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2277 if ( zSig == 0 ) zExp = 0;
2278 return packFloat32( zSign, zExp, zSig );
2282 /*----------------------------------------------------------------------------
2283 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2284 | and significand `zSig', and returns the proper single-precision floating-
2285 | point value corresponding to the abstract input. This routine is just like
2286 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2287 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2288 | floating-point exponent.
2289 *----------------------------------------------------------------------------*/
2291 static float32
2292 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2293 float_status *status)
2295 int8_t shiftCount;
2297 shiftCount = countLeadingZeros32( zSig ) - 1;
2298 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2299 status);
2303 /*----------------------------------------------------------------------------
2304 | If `a' is denormal and we are in flush-to-zero mode then set the
2305 | input-denormal exception and return zero. Otherwise just return the value.
2306 *----------------------------------------------------------------------------*/
2307 float64 float64_squash_input_denormal(float64 a, float_status *status)
2309 if (status->flush_inputs_to_zero) {
2310 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2311 float_raise(float_flag_input_denormal, status);
2312 return make_float64(float64_val(a) & (1ULL << 63));
2315 return a;
2318 /*----------------------------------------------------------------------------
2319 | Normalizes the subnormal double-precision floating-point value represented
2320 | by the denormalized significand `aSig'. The normalized exponent and
2321 | significand are stored at the locations pointed to by `zExpPtr' and
2322 | `zSigPtr', respectively.
2323 *----------------------------------------------------------------------------*/
2325 static void
2326 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2328 int8_t shiftCount;
2330 shiftCount = countLeadingZeros64( aSig ) - 11;
2331 *zSigPtr = aSig<<shiftCount;
2332 *zExpPtr = 1 - shiftCount;
2336 /*----------------------------------------------------------------------------
2337 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2338 | double-precision floating-point value, returning the result. After being
2339 | shifted into the proper positions, the three fields are simply added
2340 | together to form the result. This means that any integer portion of `zSig'
2341 | will be added into the exponent. Since a properly normalized significand
2342 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2343 | than the desired result exponent whenever `zSig' is a complete, normalized
2344 | significand.
2345 *----------------------------------------------------------------------------*/
2347 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2350 return make_float64(
2351 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2355 /*----------------------------------------------------------------------------
2356 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2357 | and significand `zSig', and returns the proper double-precision floating-
2358 | point value corresponding to the abstract input. Ordinarily, the abstract
2359 | value is simply rounded and packed into the double-precision format, with
2360 | the inexact exception raised if the abstract input cannot be represented
2361 | exactly. However, if the abstract value is too large, the overflow and
2362 | inexact exceptions are raised and an infinity or maximal finite value is
2363 | returned. If the abstract value is too small, the input value is rounded to
2364 | a subnormal number, and the underflow and inexact exceptions are raised if
2365 | the abstract input cannot be represented exactly as a subnormal double-
2366 | precision floating-point number.
2367 | The input significand `zSig' has its binary point between bits 62
2368 | and 61, which is 10 bits to the left of the usual location. This shifted
2369 | significand must be normalized or smaller. If `zSig' is not normalized,
2370 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2371 | and it must not require rounding. In the usual case that `zSig' is
2372 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2373 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2374 | Binary Floating-Point Arithmetic.
2375 *----------------------------------------------------------------------------*/
2377 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2378 float_status *status)
2380 int8_t roundingMode;
2381 flag roundNearestEven;
2382 int roundIncrement, roundBits;
2383 flag isTiny;
2385 roundingMode = status->float_rounding_mode;
2386 roundNearestEven = ( roundingMode == float_round_nearest_even );
2387 switch (roundingMode) {
2388 case float_round_nearest_even:
2389 case float_round_ties_away:
2390 roundIncrement = 0x200;
2391 break;
2392 case float_round_to_zero:
2393 roundIncrement = 0;
2394 break;
2395 case float_round_up:
2396 roundIncrement = zSign ? 0 : 0x3ff;
2397 break;
2398 case float_round_down:
2399 roundIncrement = zSign ? 0x3ff : 0;
2400 break;
2401 case float_round_to_odd:
2402 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2403 break;
2404 default:
2405 abort();
2407 roundBits = zSig & 0x3FF;
2408 if ( 0x7FD <= (uint16_t) zExp ) {
2409 if ( ( 0x7FD < zExp )
2410 || ( ( zExp == 0x7FD )
2411 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2413 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2414 roundIncrement != 0;
2415 float_raise(float_flag_overflow | float_flag_inexact, status);
2416 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2418 if ( zExp < 0 ) {
2419 if (status->flush_to_zero) {
2420 float_raise(float_flag_output_denormal, status);
2421 return packFloat64(zSign, 0, 0);
2423 isTiny =
2424 (status->float_detect_tininess
2425 == float_tininess_before_rounding)
2426 || ( zExp < -1 )
2427 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2428 shift64RightJamming( zSig, - zExp, &zSig );
2429 zExp = 0;
2430 roundBits = zSig & 0x3FF;
2431 if (isTiny && roundBits) {
2432 float_raise(float_flag_underflow, status);
2434 if (roundingMode == float_round_to_odd) {
2436 * For round-to-odd case, the roundIncrement depends on
2437 * zSig which just changed.
2439 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2443 if (roundBits) {
2444 status->float_exception_flags |= float_flag_inexact;
2446 zSig = ( zSig + roundIncrement )>>10;
2447 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2448 if ( zSig == 0 ) zExp = 0;
2449 return packFloat64( zSign, zExp, zSig );
2453 /*----------------------------------------------------------------------------
2454 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2455 | and significand `zSig', and returns the proper double-precision floating-
2456 | point value corresponding to the abstract input. This routine is just like
2457 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2458 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2459 | floating-point exponent.
2460 *----------------------------------------------------------------------------*/
2462 static float64
2463 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2464 float_status *status)
2466 int8_t shiftCount;
2468 shiftCount = countLeadingZeros64( zSig ) - 1;
2469 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2470 status);
2474 /*----------------------------------------------------------------------------
2475 | Normalizes the subnormal extended double-precision floating-point value
2476 | represented by the denormalized significand `aSig'. The normalized exponent
2477 | and significand are stored at the locations pointed to by `zExpPtr' and
2478 | `zSigPtr', respectively.
2479 *----------------------------------------------------------------------------*/
2481 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2482 uint64_t *zSigPtr)
2484 int8_t shiftCount;
2486 shiftCount = countLeadingZeros64( aSig );
2487 *zSigPtr = aSig<<shiftCount;
2488 *zExpPtr = 1 - shiftCount;
2491 /*----------------------------------------------------------------------------
2492 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2493 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2494 | and returns the proper extended double-precision floating-point value
2495 | corresponding to the abstract input. Ordinarily, the abstract value is
2496 | rounded and packed into the extended double-precision format, with the
2497 | inexact exception raised if the abstract input cannot be represented
2498 | exactly. However, if the abstract value is too large, the overflow and
2499 | inexact exceptions are raised and an infinity or maximal finite value is
2500 | returned. If the abstract value is too small, the input value is rounded to
2501 | a subnormal number, and the underflow and inexact exceptions are raised if
2502 | the abstract input cannot be represented exactly as a subnormal extended
2503 | double-precision floating-point number.
2504 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2505 | number of bits as single or double precision, respectively. Otherwise, the
2506 | result is rounded to the full precision of the extended double-precision
2507 | format.
2508 | The input significand must be normalized or smaller. If the input
2509 | significand is not normalized, `zExp' must be 0; in that case, the result
2510 | returned is a subnormal number, and it must not require rounding. The
2511 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2512 | Floating-Point Arithmetic.
2513 *----------------------------------------------------------------------------*/
2515 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2516 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2517 float_status *status)
2519 int8_t roundingMode;
2520 flag roundNearestEven, increment, isTiny;
2521 int64_t roundIncrement, roundMask, roundBits;
2523 roundingMode = status->float_rounding_mode;
2524 roundNearestEven = ( roundingMode == float_round_nearest_even );
2525 if ( roundingPrecision == 80 ) goto precision80;
2526 if ( roundingPrecision == 64 ) {
2527 roundIncrement = LIT64( 0x0000000000000400 );
2528 roundMask = LIT64( 0x00000000000007FF );
2530 else if ( roundingPrecision == 32 ) {
2531 roundIncrement = LIT64( 0x0000008000000000 );
2532 roundMask = LIT64( 0x000000FFFFFFFFFF );
2534 else {
2535 goto precision80;
2537 zSig0 |= ( zSig1 != 0 );
2538 switch (roundingMode) {
2539 case float_round_nearest_even:
2540 case float_round_ties_away:
2541 break;
2542 case float_round_to_zero:
2543 roundIncrement = 0;
2544 break;
2545 case float_round_up:
2546 roundIncrement = zSign ? 0 : roundMask;
2547 break;
2548 case float_round_down:
2549 roundIncrement = zSign ? roundMask : 0;
2550 break;
2551 default:
2552 abort();
2554 roundBits = zSig0 & roundMask;
2555 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2556 if ( ( 0x7FFE < zExp )
2557 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2559 goto overflow;
2561 if ( zExp <= 0 ) {
2562 if (status->flush_to_zero) {
2563 float_raise(float_flag_output_denormal, status);
2564 return packFloatx80(zSign, 0, 0);
2566 isTiny =
2567 (status->float_detect_tininess
2568 == float_tininess_before_rounding)
2569 || ( zExp < 0 )
2570 || ( zSig0 <= zSig0 + roundIncrement );
2571 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2572 zExp = 0;
2573 roundBits = zSig0 & roundMask;
2574 if (isTiny && roundBits) {
2575 float_raise(float_flag_underflow, status);
2577 if (roundBits) {
2578 status->float_exception_flags |= float_flag_inexact;
2580 zSig0 += roundIncrement;
2581 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2582 roundIncrement = roundMask + 1;
2583 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2584 roundMask |= roundIncrement;
2586 zSig0 &= ~ roundMask;
2587 return packFloatx80( zSign, zExp, zSig0 );
2590 if (roundBits) {
2591 status->float_exception_flags |= float_flag_inexact;
2593 zSig0 += roundIncrement;
2594 if ( zSig0 < roundIncrement ) {
2595 ++zExp;
2596 zSig0 = LIT64( 0x8000000000000000 );
2598 roundIncrement = roundMask + 1;
2599 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2600 roundMask |= roundIncrement;
2602 zSig0 &= ~ roundMask;
2603 if ( zSig0 == 0 ) zExp = 0;
2604 return packFloatx80( zSign, zExp, zSig0 );
2605 precision80:
2606 switch (roundingMode) {
2607 case float_round_nearest_even:
2608 case float_round_ties_away:
2609 increment = ((int64_t)zSig1 < 0);
2610 break;
2611 case float_round_to_zero:
2612 increment = 0;
2613 break;
2614 case float_round_up:
2615 increment = !zSign && zSig1;
2616 break;
2617 case float_round_down:
2618 increment = zSign && zSig1;
2619 break;
2620 default:
2621 abort();
2623 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2624 if ( ( 0x7FFE < zExp )
2625 || ( ( zExp == 0x7FFE )
2626 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2627 && increment
2630 roundMask = 0;
2631 overflow:
2632 float_raise(float_flag_overflow | float_flag_inexact, status);
2633 if ( ( roundingMode == float_round_to_zero )
2634 || ( zSign && ( roundingMode == float_round_up ) )
2635 || ( ! zSign && ( roundingMode == float_round_down ) )
2637 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2639 return packFloatx80(zSign,
2640 floatx80_infinity_high,
2641 floatx80_infinity_low);
2643 if ( zExp <= 0 ) {
2644 isTiny =
2645 (status->float_detect_tininess
2646 == float_tininess_before_rounding)
2647 || ( zExp < 0 )
2648 || ! increment
2649 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2650 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2651 zExp = 0;
2652 if (isTiny && zSig1) {
2653 float_raise(float_flag_underflow, status);
2655 if (zSig1) {
2656 status->float_exception_flags |= float_flag_inexact;
2658 switch (roundingMode) {
2659 case float_round_nearest_even:
2660 case float_round_ties_away:
2661 increment = ((int64_t)zSig1 < 0);
2662 break;
2663 case float_round_to_zero:
2664 increment = 0;
2665 break;
2666 case float_round_up:
2667 increment = !zSign && zSig1;
2668 break;
2669 case float_round_down:
2670 increment = zSign && zSig1;
2671 break;
2672 default:
2673 abort();
2675 if ( increment ) {
2676 ++zSig0;
2677 zSig0 &=
2678 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2679 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2681 return packFloatx80( zSign, zExp, zSig0 );
2684 if (zSig1) {
2685 status->float_exception_flags |= float_flag_inexact;
2687 if ( increment ) {
2688 ++zSig0;
2689 if ( zSig0 == 0 ) {
2690 ++zExp;
2691 zSig0 = LIT64( 0x8000000000000000 );
2693 else {
2694 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2697 else {
2698 if ( zSig0 == 0 ) zExp = 0;
2700 return packFloatx80( zSign, zExp, zSig0 );
2704 /*----------------------------------------------------------------------------
2705 | Takes an abstract floating-point value having sign `zSign', exponent
2706 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2707 | and returns the proper extended double-precision floating-point value
2708 | corresponding to the abstract input. This routine is just like
2709 | `roundAndPackFloatx80' except that the input significand does not have to be
2710 | normalized.
2711 *----------------------------------------------------------------------------*/
2713 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2714 flag zSign, int32_t zExp,
2715 uint64_t zSig0, uint64_t zSig1,
2716 float_status *status)
2718 int8_t shiftCount;
2720 if ( zSig0 == 0 ) {
2721 zSig0 = zSig1;
2722 zSig1 = 0;
2723 zExp -= 64;
2725 shiftCount = countLeadingZeros64( zSig0 );
2726 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2727 zExp -= shiftCount;
2728 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2729 zSig0, zSig1, status);
2733 /*----------------------------------------------------------------------------
2734 | Returns the least-significant 64 fraction bits of the quadruple-precision
2735 | floating-point value `a'.
2736 *----------------------------------------------------------------------------*/
2738 static inline uint64_t extractFloat128Frac1( float128 a )
2741 return a.low;
2745 /*----------------------------------------------------------------------------
2746 | Returns the most-significant 48 fraction bits of the quadruple-precision
2747 | floating-point value `a'.
2748 *----------------------------------------------------------------------------*/
2750 static inline uint64_t extractFloat128Frac0( float128 a )
2753 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2757 /*----------------------------------------------------------------------------
2758 | Returns the exponent bits of the quadruple-precision floating-point value
2759 | `a'.
2760 *----------------------------------------------------------------------------*/
2762 static inline int32_t extractFloat128Exp( float128 a )
2765 return ( a.high>>48 ) & 0x7FFF;
2769 /*----------------------------------------------------------------------------
2770 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2771 *----------------------------------------------------------------------------*/
2773 static inline flag extractFloat128Sign( float128 a )
2776 return a.high>>63;
2780 /*----------------------------------------------------------------------------
2781 | Normalizes the subnormal quadruple-precision floating-point value
2782 | represented by the denormalized significand formed by the concatenation of
2783 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2784 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2785 | significand are stored at the location pointed to by `zSig0Ptr', and the
2786 | least significant 64 bits of the normalized significand are stored at the
2787 | location pointed to by `zSig1Ptr'.
2788 *----------------------------------------------------------------------------*/
2790 static void
2791 normalizeFloat128Subnormal(
2792 uint64_t aSig0,
2793 uint64_t aSig1,
2794 int32_t *zExpPtr,
2795 uint64_t *zSig0Ptr,
2796 uint64_t *zSig1Ptr
2799 int8_t shiftCount;
2801 if ( aSig0 == 0 ) {
2802 shiftCount = countLeadingZeros64( aSig1 ) - 15;
2803 if ( shiftCount < 0 ) {
2804 *zSig0Ptr = aSig1>>( - shiftCount );
2805 *zSig1Ptr = aSig1<<( shiftCount & 63 );
2807 else {
2808 *zSig0Ptr = aSig1<<shiftCount;
2809 *zSig1Ptr = 0;
2811 *zExpPtr = - shiftCount - 63;
2813 else {
2814 shiftCount = countLeadingZeros64( aSig0 ) - 15;
2815 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2816 *zExpPtr = 1 - shiftCount;
2821 /*----------------------------------------------------------------------------
2822 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2823 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2824 | floating-point value, returning the result. After being shifted into the
2825 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2826 | added together to form the most significant 32 bits of the result. This
2827 | means that any integer portion of `zSig0' will be added into the exponent.
2828 | Since a properly normalized significand will have an integer portion equal
2829 | to 1, the `zExp' input should be 1 less than the desired result exponent
2830 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2831 | significand.
2832 *----------------------------------------------------------------------------*/
2834 static inline float128
2835 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2837 float128 z;
2839 z.low = zSig1;
2840 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2841 return z;
2845 /*----------------------------------------------------------------------------
2846 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2847 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2848 | and `zSig2', and returns the proper quadruple-precision floating-point value
2849 | corresponding to the abstract input. Ordinarily, the abstract value is
2850 | simply rounded and packed into the quadruple-precision format, with the
2851 | inexact exception raised if the abstract input cannot be represented
2852 | exactly. However, if the abstract value is too large, the overflow and
2853 | inexact exceptions are raised and an infinity or maximal finite value is
2854 | returned. If the abstract value is too small, the input value is rounded to
2855 | a subnormal number, and the underflow and inexact exceptions are raised if
2856 | the abstract input cannot be represented exactly as a subnormal quadruple-
2857 | precision floating-point number.
2858 | The input significand must be normalized or smaller. If the input
2859 | significand is not normalized, `zExp' must be 0; in that case, the result
2860 | returned is a subnormal number, and it must not require rounding. In the
2861 | usual case that the input significand is normalized, `zExp' must be 1 less
2862 | than the ``true'' floating-point exponent. The handling of underflow and
2863 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864 *----------------------------------------------------------------------------*/
2866 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2867 uint64_t zSig0, uint64_t zSig1,
2868 uint64_t zSig2, float_status *status)
2870 int8_t roundingMode;
2871 flag roundNearestEven, increment, isTiny;
2873 roundingMode = status->float_rounding_mode;
2874 roundNearestEven = ( roundingMode == float_round_nearest_even );
2875 switch (roundingMode) {
2876 case float_round_nearest_even:
2877 case float_round_ties_away:
2878 increment = ((int64_t)zSig2 < 0);
2879 break;
2880 case float_round_to_zero:
2881 increment = 0;
2882 break;
2883 case float_round_up:
2884 increment = !zSign && zSig2;
2885 break;
2886 case float_round_down:
2887 increment = zSign && zSig2;
2888 break;
2889 case float_round_to_odd:
2890 increment = !(zSig1 & 0x1) && zSig2;
2891 break;
2892 default:
2893 abort();
2895 if ( 0x7FFD <= (uint32_t) zExp ) {
2896 if ( ( 0x7FFD < zExp )
2897 || ( ( zExp == 0x7FFD )
2898 && eq128(
2899 LIT64( 0x0001FFFFFFFFFFFF ),
2900 LIT64( 0xFFFFFFFFFFFFFFFF ),
2901 zSig0,
2902 zSig1
2904 && increment
2907 float_raise(float_flag_overflow | float_flag_inexact, status);
2908 if ( ( roundingMode == float_round_to_zero )
2909 || ( zSign && ( roundingMode == float_round_up ) )
2910 || ( ! zSign && ( roundingMode == float_round_down ) )
2911 || (roundingMode == float_round_to_odd)
2913 return
2914 packFloat128(
2915 zSign,
2916 0x7FFE,
2917 LIT64( 0x0000FFFFFFFFFFFF ),
2918 LIT64( 0xFFFFFFFFFFFFFFFF )
2921 return packFloat128( zSign, 0x7FFF, 0, 0 );
2923 if ( zExp < 0 ) {
2924 if (status->flush_to_zero) {
2925 float_raise(float_flag_output_denormal, status);
2926 return packFloat128(zSign, 0, 0, 0);
2928 isTiny =
2929 (status->float_detect_tininess
2930 == float_tininess_before_rounding)
2931 || ( zExp < -1 )
2932 || ! increment
2933 || lt128(
2934 zSig0,
2935 zSig1,
2936 LIT64( 0x0001FFFFFFFFFFFF ),
2937 LIT64( 0xFFFFFFFFFFFFFFFF )
2939 shift128ExtraRightJamming(
2940 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2941 zExp = 0;
2942 if (isTiny && zSig2) {
2943 float_raise(float_flag_underflow, status);
2945 switch (roundingMode) {
2946 case float_round_nearest_even:
2947 case float_round_ties_away:
2948 increment = ((int64_t)zSig2 < 0);
2949 break;
2950 case float_round_to_zero:
2951 increment = 0;
2952 break;
2953 case float_round_up:
2954 increment = !zSign && zSig2;
2955 break;
2956 case float_round_down:
2957 increment = zSign && zSig2;
2958 break;
2959 case float_round_to_odd:
2960 increment = !(zSig1 & 0x1) && zSig2;
2961 break;
2962 default:
2963 abort();
2967 if (zSig2) {
2968 status->float_exception_flags |= float_flag_inexact;
2970 if ( increment ) {
2971 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2972 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2974 else {
2975 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2977 return packFloat128( zSign, zExp, zSig0, zSig1 );
2981 /*----------------------------------------------------------------------------
2982 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2983 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2984 | returns the proper quadruple-precision floating-point value corresponding
2985 | to the abstract input. This routine is just like `roundAndPackFloat128'
2986 | except that the input significand has fewer bits and does not have to be
2987 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
2988 | point exponent.
2989 *----------------------------------------------------------------------------*/
2991 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
2992 uint64_t zSig0, uint64_t zSig1,
2993 float_status *status)
2995 int8_t shiftCount;
2996 uint64_t zSig2;
2998 if ( zSig0 == 0 ) {
2999 zSig0 = zSig1;
3000 zSig1 = 0;
3001 zExp -= 64;
3003 shiftCount = countLeadingZeros64( zSig0 ) - 15;
3004 if ( 0 <= shiftCount ) {
3005 zSig2 = 0;
3006 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3008 else {
3009 shift128ExtraRightJamming(
3010 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3012 zExp -= shiftCount;
3013 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3018 /*----------------------------------------------------------------------------
3019 | Returns the result of converting the 32-bit two's complement integer `a'
3020 | to the extended double-precision floating-point format. The conversion
3021 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3022 | Arithmetic.
3023 *----------------------------------------------------------------------------*/
3025 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3027 flag zSign;
3028 uint32_t absA;
3029 int8_t shiftCount;
3030 uint64_t zSig;
3032 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3033 zSign = ( a < 0 );
3034 absA = zSign ? - a : a;
3035 shiftCount = countLeadingZeros32( absA ) + 32;
3036 zSig = absA;
3037 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3041 /*----------------------------------------------------------------------------
3042 | Returns the result of converting the 32-bit two's complement integer `a' to
3043 | the quadruple-precision floating-point format. The conversion is performed
3044 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3045 *----------------------------------------------------------------------------*/
3047 float128 int32_to_float128(int32_t a, float_status *status)
3049 flag zSign;
3050 uint32_t absA;
3051 int8_t shiftCount;
3052 uint64_t zSig0;
3054 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3055 zSign = ( a < 0 );
3056 absA = zSign ? - a : a;
3057 shiftCount = countLeadingZeros32( absA ) + 17;
3058 zSig0 = absA;
3059 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3063 /*----------------------------------------------------------------------------
3064 | Returns the result of converting the 64-bit two's complement integer `a'
3065 | to the extended double-precision floating-point format. The conversion
3066 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3067 | Arithmetic.
3068 *----------------------------------------------------------------------------*/
3070 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3072 flag zSign;
3073 uint64_t absA;
3074 int8_t shiftCount;
3076 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3077 zSign = ( a < 0 );
3078 absA = zSign ? - a : a;
3079 shiftCount = countLeadingZeros64( absA );
3080 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3084 /*----------------------------------------------------------------------------
3085 | Returns the result of converting the 64-bit two's complement integer `a' to
3086 | the quadruple-precision floating-point format. The conversion is performed
3087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3088 *----------------------------------------------------------------------------*/
3090 float128 int64_to_float128(int64_t a, float_status *status)
3092 flag zSign;
3093 uint64_t absA;
3094 int8_t shiftCount;
3095 int32_t zExp;
3096 uint64_t zSig0, zSig1;
3098 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3099 zSign = ( a < 0 );
3100 absA = zSign ? - a : a;
3101 shiftCount = countLeadingZeros64( absA ) + 49;
3102 zExp = 0x406E - shiftCount;
3103 if ( 64 <= shiftCount ) {
3104 zSig1 = 0;
3105 zSig0 = absA;
3106 shiftCount -= 64;
3108 else {
3109 zSig1 = absA;
3110 zSig0 = 0;
3112 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3113 return packFloat128( zSign, zExp, zSig0, zSig1 );
3117 /*----------------------------------------------------------------------------
3118 | Returns the result of converting the 64-bit unsigned integer `a'
3119 | to the quadruple-precision floating-point format. The conversion is performed
3120 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3121 *----------------------------------------------------------------------------*/
3123 float128 uint64_to_float128(uint64_t a, float_status *status)
3125 if (a == 0) {
3126 return float128_zero;
3128 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3134 /*----------------------------------------------------------------------------
3135 | Returns the result of converting the single-precision floating-point value
3136 | `a' to the double-precision floating-point format. The conversion is
3137 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3138 | Arithmetic.
3139 *----------------------------------------------------------------------------*/
3141 float64 float32_to_float64(float32 a, float_status *status)
3143 flag aSign;
3144 int aExp;
3145 uint32_t aSig;
3146 a = float32_squash_input_denormal(a, status);
3148 aSig = extractFloat32Frac( a );
3149 aExp = extractFloat32Exp( a );
3150 aSign = extractFloat32Sign( a );
3151 if ( aExp == 0xFF ) {
3152 if (aSig) {
3153 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3155 return packFloat64( aSign, 0x7FF, 0 );
3157 if ( aExp == 0 ) {
3158 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3159 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3160 --aExp;
3162 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3166 /*----------------------------------------------------------------------------
3167 | Returns the result of converting the single-precision floating-point value
3168 | `a' to the extended double-precision floating-point format. The conversion
3169 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3170 | Arithmetic.
3171 *----------------------------------------------------------------------------*/
3173 floatx80 float32_to_floatx80(float32 a, float_status *status)
3175 flag aSign;
3176 int aExp;
3177 uint32_t aSig;
3179 a = float32_squash_input_denormal(a, status);
3180 aSig = extractFloat32Frac( a );
3181 aExp = extractFloat32Exp( a );
3182 aSign = extractFloat32Sign( a );
3183 if ( aExp == 0xFF ) {
3184 if (aSig) {
3185 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3187 return packFloatx80(aSign,
3188 floatx80_infinity_high,
3189 floatx80_infinity_low);
3191 if ( aExp == 0 ) {
3192 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3193 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3195 aSig |= 0x00800000;
3196 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3200 /*----------------------------------------------------------------------------
3201 | Returns the result of converting the single-precision floating-point value
3202 | `a' to the double-precision floating-point format. The conversion is
3203 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3204 | Arithmetic.
3205 *----------------------------------------------------------------------------*/
3207 float128 float32_to_float128(float32 a, float_status *status)
3209 flag aSign;
3210 int aExp;
3211 uint32_t aSig;
3213 a = float32_squash_input_denormal(a, status);
3214 aSig = extractFloat32Frac( a );
3215 aExp = extractFloat32Exp( a );
3216 aSign = extractFloat32Sign( a );
3217 if ( aExp == 0xFF ) {
3218 if (aSig) {
3219 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3221 return packFloat128( aSign, 0x7FFF, 0, 0 );
3223 if ( aExp == 0 ) {
3224 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3225 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3226 --aExp;
3228 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3232 /*----------------------------------------------------------------------------
3233 | Returns the remainder of the single-precision floating-point value `a'
3234 | with respect to the corresponding value `b'. The operation is performed
3235 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3236 *----------------------------------------------------------------------------*/
3238 float32 float32_rem(float32 a, float32 b, float_status *status)
3240 flag aSign, zSign;
3241 int aExp, bExp, expDiff;
3242 uint32_t aSig, bSig;
3243 uint32_t q;
3244 uint64_t aSig64, bSig64, q64;
3245 uint32_t alternateASig;
3246 int32_t sigMean;
3247 a = float32_squash_input_denormal(a, status);
3248 b = float32_squash_input_denormal(b, status);
3250 aSig = extractFloat32Frac( a );
3251 aExp = extractFloat32Exp( a );
3252 aSign = extractFloat32Sign( a );
3253 bSig = extractFloat32Frac( b );
3254 bExp = extractFloat32Exp( b );
3255 if ( aExp == 0xFF ) {
3256 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3257 return propagateFloat32NaN(a, b, status);
3259 float_raise(float_flag_invalid, status);
3260 return float32_default_nan(status);
3262 if ( bExp == 0xFF ) {
3263 if (bSig) {
3264 return propagateFloat32NaN(a, b, status);
3266 return a;
3268 if ( bExp == 0 ) {
3269 if ( bSig == 0 ) {
3270 float_raise(float_flag_invalid, status);
3271 return float32_default_nan(status);
3273 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3275 if ( aExp == 0 ) {
3276 if ( aSig == 0 ) return a;
3277 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3279 expDiff = aExp - bExp;
3280 aSig |= 0x00800000;
3281 bSig |= 0x00800000;
3282 if ( expDiff < 32 ) {
3283 aSig <<= 8;
3284 bSig <<= 8;
3285 if ( expDiff < 0 ) {
3286 if ( expDiff < -1 ) return a;
3287 aSig >>= 1;
3289 q = ( bSig <= aSig );
3290 if ( q ) aSig -= bSig;
3291 if ( 0 < expDiff ) {
3292 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3293 q >>= 32 - expDiff;
3294 bSig >>= 2;
3295 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3297 else {
3298 aSig >>= 2;
3299 bSig >>= 2;
3302 else {
3303 if ( bSig <= aSig ) aSig -= bSig;
3304 aSig64 = ( (uint64_t) aSig )<<40;
3305 bSig64 = ( (uint64_t) bSig )<<40;
3306 expDiff -= 64;
3307 while ( 0 < expDiff ) {
3308 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3309 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3310 aSig64 = - ( ( bSig * q64 )<<38 );
3311 expDiff -= 62;
3313 expDiff += 64;
3314 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3315 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3316 q = q64>>( 64 - expDiff );
3317 bSig <<= 6;
3318 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3320 do {
3321 alternateASig = aSig;
3322 ++q;
3323 aSig -= bSig;
3324 } while ( 0 <= (int32_t) aSig );
3325 sigMean = aSig + alternateASig;
3326 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3327 aSig = alternateASig;
3329 zSign = ( (int32_t) aSig < 0 );
3330 if ( zSign ) aSig = - aSig;
3331 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3336 /*----------------------------------------------------------------------------
3337 | Returns the binary exponential of the single-precision floating-point value
3338 | `a'. The operation is performed according to the IEC/IEEE Standard for
3339 | Binary Floating-Point Arithmetic.
3341 | Uses the following identities:
3343 | 1. -------------------------------------------------------------------------
3344 | x x*ln(2)
3345 | 2 = e
3347 | 2. -------------------------------------------------------------------------
3348 | 2 3 4 5 n
3349 | x x x x x x x
3350 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3351 | 1! 2! 3! 4! 5! n!
3352 *----------------------------------------------------------------------------*/
3354 static const float64 float32_exp2_coefficients[15] =
3356 const_float64( 0x3ff0000000000000ll ), /* 1 */
3357 const_float64( 0x3fe0000000000000ll ), /* 2 */
3358 const_float64( 0x3fc5555555555555ll ), /* 3 */
3359 const_float64( 0x3fa5555555555555ll ), /* 4 */
3360 const_float64( 0x3f81111111111111ll ), /* 5 */
3361 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3362 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3363 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3364 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3365 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3366 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3367 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3368 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3369 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3370 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3373 float32 float32_exp2(float32 a, float_status *status)
3375 flag aSign;
3376 int aExp;
3377 uint32_t aSig;
3378 float64 r, x, xn;
3379 int i;
3380 a = float32_squash_input_denormal(a, status);
3382 aSig = extractFloat32Frac( a );
3383 aExp = extractFloat32Exp( a );
3384 aSign = extractFloat32Sign( a );
3386 if ( aExp == 0xFF) {
3387 if (aSig) {
3388 return propagateFloat32NaN(a, float32_zero, status);
3390 return (aSign) ? float32_zero : a;
3392 if (aExp == 0) {
3393 if (aSig == 0) return float32_one;
3396 float_raise(float_flag_inexact, status);
3398 /* ******************************* */
3399 /* using float64 for approximation */
3400 /* ******************************* */
3401 x = float32_to_float64(a, status);
3402 x = float64_mul(x, float64_ln2, status);
3404 xn = x;
3405 r = float64_one;
3406 for (i = 0 ; i < 15 ; i++) {
3407 float64 f;
3409 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3410 r = float64_add(r, f, status);
3412 xn = float64_mul(xn, x, status);
3415 return float64_to_float32(r, status);
3418 /*----------------------------------------------------------------------------
3419 | Returns the binary log of the single-precision floating-point value `a'.
3420 | The operation is performed according to the IEC/IEEE Standard for Binary
3421 | Floating-Point Arithmetic.
3422 *----------------------------------------------------------------------------*/
3423 float32 float32_log2(float32 a, float_status *status)
3425 flag aSign, zSign;
3426 int aExp;
3427 uint32_t aSig, zSig, i;
3429 a = float32_squash_input_denormal(a, status);
3430 aSig = extractFloat32Frac( a );
3431 aExp = extractFloat32Exp( a );
3432 aSign = extractFloat32Sign( a );
3434 if ( aExp == 0 ) {
3435 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3436 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3438 if ( aSign ) {
3439 float_raise(float_flag_invalid, status);
3440 return float32_default_nan(status);
3442 if ( aExp == 0xFF ) {
3443 if (aSig) {
3444 return propagateFloat32NaN(a, float32_zero, status);
3446 return a;
3449 aExp -= 0x7F;
3450 aSig |= 0x00800000;
3451 zSign = aExp < 0;
3452 zSig = aExp << 23;
3454 for (i = 1 << 22; i > 0; i >>= 1) {
3455 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3456 if ( aSig & 0x01000000 ) {
3457 aSig >>= 1;
3458 zSig |= i;
3462 if ( zSign )
3463 zSig = -zSig;
3465 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3468 /*----------------------------------------------------------------------------
3469 | Returns 1 if the single-precision floating-point value `a' is equal to
3470 | the corresponding value `b', and 0 otherwise. The invalid exception is
3471 | raised if either operand is a NaN. Otherwise, the comparison is performed
3472 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3473 *----------------------------------------------------------------------------*/
3475 int float32_eq(float32 a, float32 b, float_status *status)
3477 uint32_t av, bv;
3478 a = float32_squash_input_denormal(a, status);
3479 b = float32_squash_input_denormal(b, status);
3481 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3482 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3484 float_raise(float_flag_invalid, status);
3485 return 0;
3487 av = float32_val(a);
3488 bv = float32_val(b);
3489 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3492 /*----------------------------------------------------------------------------
3493 | Returns 1 if the single-precision floating-point value `a' is less than
3494 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3495 | exception is raised if either operand is a NaN. The comparison is performed
3496 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3497 *----------------------------------------------------------------------------*/
3499 int float32_le(float32 a, float32 b, float_status *status)
3501 flag aSign, bSign;
3502 uint32_t av, bv;
3503 a = float32_squash_input_denormal(a, status);
3504 b = float32_squash_input_denormal(b, status);
3506 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3507 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3509 float_raise(float_flag_invalid, status);
3510 return 0;
3512 aSign = extractFloat32Sign( a );
3513 bSign = extractFloat32Sign( b );
3514 av = float32_val(a);
3515 bv = float32_val(b);
3516 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3517 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3521 /*----------------------------------------------------------------------------
3522 | Returns 1 if the single-precision floating-point value `a' is less than
3523 | the corresponding value `b', and 0 otherwise. The invalid exception is
3524 | raised if either operand is a NaN. The comparison is performed according
3525 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3526 *----------------------------------------------------------------------------*/
3528 int float32_lt(float32 a, float32 b, float_status *status)
3530 flag aSign, bSign;
3531 uint32_t av, bv;
3532 a = float32_squash_input_denormal(a, status);
3533 b = float32_squash_input_denormal(b, status);
3535 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3536 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3538 float_raise(float_flag_invalid, status);
3539 return 0;
3541 aSign = extractFloat32Sign( a );
3542 bSign = extractFloat32Sign( b );
3543 av = float32_val(a);
3544 bv = float32_val(b);
3545 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3546 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3550 /*----------------------------------------------------------------------------
3551 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3552 | be compared, and 0 otherwise. The invalid exception is raised if either
3553 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3554 | Standard for Binary Floating-Point Arithmetic.
3555 *----------------------------------------------------------------------------*/
3557 int float32_unordered(float32 a, float32 b, float_status *status)
3559 a = float32_squash_input_denormal(a, status);
3560 b = float32_squash_input_denormal(b, status);
3562 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3563 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3565 float_raise(float_flag_invalid, status);
3566 return 1;
3568 return 0;
3571 /*----------------------------------------------------------------------------
3572 | Returns 1 if the single-precision floating-point value `a' is equal to
3573 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3574 | exception. The comparison is performed according to the IEC/IEEE Standard
3575 | for Binary Floating-Point Arithmetic.
3576 *----------------------------------------------------------------------------*/
3578 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3580 a = float32_squash_input_denormal(a, status);
3581 b = float32_squash_input_denormal(b, status);
3583 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3584 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3586 if (float32_is_signaling_nan(a, status)
3587 || float32_is_signaling_nan(b, status)) {
3588 float_raise(float_flag_invalid, status);
3590 return 0;
3592 return ( float32_val(a) == float32_val(b) ) ||
3593 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3596 /*----------------------------------------------------------------------------
3597 | Returns 1 if the single-precision floating-point value `a' is less than or
3598 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3599 | cause an exception. Otherwise, the comparison is performed according to the
3600 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3601 *----------------------------------------------------------------------------*/
3603 int float32_le_quiet(float32 a, float32 b, float_status *status)
3605 flag aSign, bSign;
3606 uint32_t av, bv;
3607 a = float32_squash_input_denormal(a, status);
3608 b = float32_squash_input_denormal(b, status);
3610 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3611 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3613 if (float32_is_signaling_nan(a, status)
3614 || float32_is_signaling_nan(b, status)) {
3615 float_raise(float_flag_invalid, status);
3617 return 0;
3619 aSign = extractFloat32Sign( a );
3620 bSign = extractFloat32Sign( b );
3621 av = float32_val(a);
3622 bv = float32_val(b);
3623 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3624 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3628 /*----------------------------------------------------------------------------
3629 | Returns 1 if the single-precision floating-point value `a' is less than
3630 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3631 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3632 | Standard for Binary Floating-Point Arithmetic.
3633 *----------------------------------------------------------------------------*/
3635 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3637 flag aSign, bSign;
3638 uint32_t av, bv;
3639 a = float32_squash_input_denormal(a, status);
3640 b = float32_squash_input_denormal(b, status);
3642 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3643 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3645 if (float32_is_signaling_nan(a, status)
3646 || float32_is_signaling_nan(b, status)) {
3647 float_raise(float_flag_invalid, status);
3649 return 0;
3651 aSign = extractFloat32Sign( a );
3652 bSign = extractFloat32Sign( b );
3653 av = float32_val(a);
3654 bv = float32_val(b);
3655 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3656 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3660 /*----------------------------------------------------------------------------
3661 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3662 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3663 | comparison is performed according to the IEC/IEEE Standard for Binary
3664 | Floating-Point Arithmetic.
3665 *----------------------------------------------------------------------------*/
3667 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3669 a = float32_squash_input_denormal(a, status);
3670 b = float32_squash_input_denormal(b, status);
3672 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3673 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3675 if (float32_is_signaling_nan(a, status)
3676 || float32_is_signaling_nan(b, status)) {
3677 float_raise(float_flag_invalid, status);
3679 return 1;
3681 return 0;
3685 /*----------------------------------------------------------------------------
3686 | Returns the result of converting the double-precision floating-point value
3687 | `a' to the single-precision floating-point format. The conversion is
3688 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3689 | Arithmetic.
3690 *----------------------------------------------------------------------------*/
3692 float32 float64_to_float32(float64 a, float_status *status)
3694 flag aSign;
3695 int aExp;
3696 uint64_t aSig;
3697 uint32_t zSig;
3698 a = float64_squash_input_denormal(a, status);
3700 aSig = extractFloat64Frac( a );
3701 aExp = extractFloat64Exp( a );
3702 aSign = extractFloat64Sign( a );
3703 if ( aExp == 0x7FF ) {
3704 if (aSig) {
3705 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3707 return packFloat32( aSign, 0xFF, 0 );
3709 shift64RightJamming( aSig, 22, &aSig );
3710 zSig = aSig;
3711 if ( aExp || zSig ) {
3712 zSig |= 0x40000000;
3713 aExp -= 0x381;
3715 return roundAndPackFloat32(aSign, aExp, zSig, status);
3720 /*----------------------------------------------------------------------------
3721 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3722 | half-precision floating-point value, returning the result. After being
3723 | shifted into the proper positions, the three fields are simply added
3724 | together to form the result. This means that any integer portion of `zSig'
3725 | will be added into the exponent. Since a properly normalized significand
3726 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3727 | than the desired result exponent whenever `zSig' is a complete, normalized
3728 | significand.
3729 *----------------------------------------------------------------------------*/
3730 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3732 return make_float16(
3733 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3736 /*----------------------------------------------------------------------------
3737 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3738 | and significand `zSig', and returns the proper half-precision floating-
3739 | point value corresponding to the abstract input. Ordinarily, the abstract
3740 | value is simply rounded and packed into the half-precision format, with
3741 | the inexact exception raised if the abstract input cannot be represented
3742 | exactly. However, if the abstract value is too large, the overflow and
3743 | inexact exceptions are raised and an infinity or maximal finite value is
3744 | returned. If the abstract value is too small, the input value is rounded to
3745 | a subnormal number, and the underflow and inexact exceptions are raised if
3746 | the abstract input cannot be represented exactly as a subnormal half-
3747 | precision floating-point number.
3748 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3749 | ARM-style "alternative representation", which omits the NaN and Inf
3750 | encodings in order to raise the maximum representable exponent by one.
3751 | The input significand `zSig' has its binary point between bits 22
3752 | and 23, which is 13 bits to the left of the usual location. This shifted
3753 | significand must be normalized or smaller. If `zSig' is not normalized,
3754 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3755 | and it must not require rounding. In the usual case that `zSig' is
3756 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3757 | Note the slightly odd position of the binary point in zSig compared with the
3758 | other roundAndPackFloat functions. This should probably be fixed if we
3759 | need to implement more float16 routines than just conversion.
3760 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3761 | Binary Floating-Point Arithmetic.
3762 *----------------------------------------------------------------------------*/
3764 static float16 roundAndPackFloat16(flag zSign, int zExp,
3765 uint32_t zSig, flag ieee,
3766 float_status *status)
3768 int maxexp = ieee ? 29 : 30;
3769 uint32_t mask;
3770 uint32_t increment;
3771 bool rounding_bumps_exp;
3772 bool is_tiny = false;
3774 /* Calculate the mask of bits of the mantissa which are not
3775 * representable in half-precision and will be lost.
3777 if (zExp < 1) {
3778 /* Will be denormal in halfprec */
3779 mask = 0x00ffffff;
3780 if (zExp >= -11) {
3781 mask >>= 11 + zExp;
3783 } else {
3784 /* Normal number in halfprec */
3785 mask = 0x00001fff;
3788 switch (status->float_rounding_mode) {
3789 case float_round_nearest_even:
3790 increment = (mask + 1) >> 1;
3791 if ((zSig & mask) == increment) {
3792 increment = zSig & (increment << 1);
3794 break;
3795 case float_round_ties_away:
3796 increment = (mask + 1) >> 1;
3797 break;
3798 case float_round_up:
3799 increment = zSign ? 0 : mask;
3800 break;
3801 case float_round_down:
3802 increment = zSign ? mask : 0;
3803 break;
3804 default: /* round_to_zero */
3805 increment = 0;
3806 break;
3809 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3811 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3812 if (ieee) {
3813 float_raise(float_flag_overflow | float_flag_inexact, status);
3814 return packFloat16(zSign, 0x1f, 0);
3815 } else {
3816 float_raise(float_flag_invalid, status);
3817 return packFloat16(zSign, 0x1f, 0x3ff);
3821 if (zExp < 0) {
3822 /* Note that flush-to-zero does not affect half-precision results */
3823 is_tiny =
3824 (status->float_detect_tininess == float_tininess_before_rounding)
3825 || (zExp < -1)
3826 || (!rounding_bumps_exp);
3828 if (zSig & mask) {
3829 float_raise(float_flag_inexact, status);
3830 if (is_tiny) {
3831 float_raise(float_flag_underflow, status);
3835 zSig += increment;
3836 if (rounding_bumps_exp) {
3837 zSig >>= 1;
3838 zExp++;
3841 if (zExp < -10) {
3842 return packFloat16(zSign, 0, 0);
3844 if (zExp < 0) {
3845 zSig >>= -zExp;
3846 zExp = 0;
3848 return packFloat16(zSign, zExp, zSig >> 13);
3851 /*----------------------------------------------------------------------------
3852 | If `a' is denormal and we are in flush-to-zero mode then set the
3853 | input-denormal exception and return zero. Otherwise just return the value.
3854 *----------------------------------------------------------------------------*/
3855 float16 float16_squash_input_denormal(float16 a, float_status *status)
3857 if (status->flush_inputs_to_zero) {
3858 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3859 float_raise(float_flag_input_denormal, status);
3860 return make_float16(float16_val(a) & 0x8000);
3863 return a;
3866 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3867 uint32_t *zSigPtr)
3869 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3870 *zSigPtr = aSig << shiftCount;
3871 *zExpPtr = 1 - shiftCount;
3874 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3875 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3877 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3879 flag aSign;
3880 int aExp;
3881 uint32_t aSig;
3883 aSign = extractFloat16Sign(a);
3884 aExp = extractFloat16Exp(a);
3885 aSig = extractFloat16Frac(a);
3887 if (aExp == 0x1f && ieee) {
3888 if (aSig) {
3889 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3891 return packFloat32(aSign, 0xff, 0);
3893 if (aExp == 0) {
3894 if (aSig == 0) {
3895 return packFloat32(aSign, 0, 0);
3898 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3899 aExp--;
3901 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3904 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3906 flag aSign;
3907 int aExp;
3908 uint32_t aSig;
3910 a = float32_squash_input_denormal(a, status);
3912 aSig = extractFloat32Frac( a );
3913 aExp = extractFloat32Exp( a );
3914 aSign = extractFloat32Sign( a );
3915 if ( aExp == 0xFF ) {
3916 if (aSig) {
3917 /* Input is a NaN */
3918 if (!ieee) {
3919 float_raise(float_flag_invalid, status);
3920 return packFloat16(aSign, 0, 0);
3922 return commonNaNToFloat16(
3923 float32ToCommonNaN(a, status), status);
3925 /* Infinity */
3926 if (!ieee) {
3927 float_raise(float_flag_invalid, status);
3928 return packFloat16(aSign, 0x1f, 0x3ff);
3930 return packFloat16(aSign, 0x1f, 0);
3932 if (aExp == 0 && aSig == 0) {
3933 return packFloat16(aSign, 0, 0);
3935 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3936 * even if the input is denormal; however this is harmless because
3937 * the largest possible single-precision denormal is still smaller
3938 * than the smallest representable half-precision denormal, and so we
3939 * will end up ignoring aSig and returning via the "always return zero"
3940 * codepath.
3942 aSig |= 0x00800000;
3943 aExp -= 0x71;
3945 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3948 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3950 flag aSign;
3951 int aExp;
3952 uint32_t aSig;
3954 aSign = extractFloat16Sign(a);
3955 aExp = extractFloat16Exp(a);
3956 aSig = extractFloat16Frac(a);
3958 if (aExp == 0x1f && ieee) {
3959 if (aSig) {
3960 return commonNaNToFloat64(
3961 float16ToCommonNaN(a, status), status);
3963 return packFloat64(aSign, 0x7ff, 0);
3965 if (aExp == 0) {
3966 if (aSig == 0) {
3967 return packFloat64(aSign, 0, 0);
3970 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3971 aExp--;
3973 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3976 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3978 flag aSign;
3979 int aExp;
3980 uint64_t aSig;
3981 uint32_t zSig;
3983 a = float64_squash_input_denormal(a, status);
3985 aSig = extractFloat64Frac(a);
3986 aExp = extractFloat64Exp(a);
3987 aSign = extractFloat64Sign(a);
3988 if (aExp == 0x7FF) {
3989 if (aSig) {
3990 /* Input is a NaN */
3991 if (!ieee) {
3992 float_raise(float_flag_invalid, status);
3993 return packFloat16(aSign, 0, 0);
3995 return commonNaNToFloat16(
3996 float64ToCommonNaN(a, status), status);
3998 /* Infinity */
3999 if (!ieee) {
4000 float_raise(float_flag_invalid, status);
4001 return packFloat16(aSign, 0x1f, 0x3ff);
4003 return packFloat16(aSign, 0x1f, 0);
4005 shift64RightJamming(aSig, 29, &aSig);
4006 zSig = aSig;
4007 if (aExp == 0 && zSig == 0) {
4008 return packFloat16(aSign, 0, 0);
4010 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4011 * even if the input is denormal; however this is harmless because
4012 * the largest possible single-precision denormal is still smaller
4013 * than the smallest representable half-precision denormal, and so we
4014 * will end up ignoring aSig and returning via the "always return zero"
4015 * codepath.
4017 zSig |= 0x00800000;
4018 aExp -= 0x3F1;
4020 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4023 /*----------------------------------------------------------------------------
4024 | Returns the result of converting the double-precision floating-point value
4025 | `a' to the extended double-precision floating-point format. The conversion
4026 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4027 | Arithmetic.
4028 *----------------------------------------------------------------------------*/
4030 floatx80 float64_to_floatx80(float64 a, float_status *status)
4032 flag aSign;
4033 int aExp;
4034 uint64_t aSig;
4036 a = float64_squash_input_denormal(a, status);
4037 aSig = extractFloat64Frac( a );
4038 aExp = extractFloat64Exp( a );
4039 aSign = extractFloat64Sign( a );
4040 if ( aExp == 0x7FF ) {
4041 if (aSig) {
4042 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4044 return packFloatx80(aSign,
4045 floatx80_infinity_high,
4046 floatx80_infinity_low);
4048 if ( aExp == 0 ) {
4049 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4050 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4052 return
4053 packFloatx80(
4054 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4058 /*----------------------------------------------------------------------------
4059 | Returns the result of converting the double-precision floating-point value
4060 | `a' to the quadruple-precision floating-point format. The conversion is
4061 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4062 | Arithmetic.
4063 *----------------------------------------------------------------------------*/
4065 float128 float64_to_float128(float64 a, float_status *status)
4067 flag aSign;
4068 int aExp;
4069 uint64_t aSig, zSig0, zSig1;
4071 a = float64_squash_input_denormal(a, status);
4072 aSig = extractFloat64Frac( a );
4073 aExp = extractFloat64Exp( a );
4074 aSign = extractFloat64Sign( a );
4075 if ( aExp == 0x7FF ) {
4076 if (aSig) {
4077 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4079 return packFloat128( aSign, 0x7FFF, 0, 0 );
4081 if ( aExp == 0 ) {
4082 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4083 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4084 --aExp;
4086 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4087 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4092 /*----------------------------------------------------------------------------
4093 | Returns the remainder of the double-precision floating-point value `a'
4094 | with respect to the corresponding value `b'. The operation is performed
4095 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4096 *----------------------------------------------------------------------------*/
4098 float64 float64_rem(float64 a, float64 b, float_status *status)
4100 flag aSign, zSign;
4101 int aExp, bExp, expDiff;
4102 uint64_t aSig, bSig;
4103 uint64_t q, alternateASig;
4104 int64_t sigMean;
4106 a = float64_squash_input_denormal(a, status);
4107 b = float64_squash_input_denormal(b, status);
4108 aSig = extractFloat64Frac( a );
4109 aExp = extractFloat64Exp( a );
4110 aSign = extractFloat64Sign( a );
4111 bSig = extractFloat64Frac( b );
4112 bExp = extractFloat64Exp( b );
4113 if ( aExp == 0x7FF ) {
4114 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4115 return propagateFloat64NaN(a, b, status);
4117 float_raise(float_flag_invalid, status);
4118 return float64_default_nan(status);
4120 if ( bExp == 0x7FF ) {
4121 if (bSig) {
4122 return propagateFloat64NaN(a, b, status);
4124 return a;
4126 if ( bExp == 0 ) {
4127 if ( bSig == 0 ) {
4128 float_raise(float_flag_invalid, status);
4129 return float64_default_nan(status);
4131 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4133 if ( aExp == 0 ) {
4134 if ( aSig == 0 ) return a;
4135 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4137 expDiff = aExp - bExp;
4138 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4139 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4140 if ( expDiff < 0 ) {
4141 if ( expDiff < -1 ) return a;
4142 aSig >>= 1;
4144 q = ( bSig <= aSig );
4145 if ( q ) aSig -= bSig;
4146 expDiff -= 64;
4147 while ( 0 < expDiff ) {
4148 q = estimateDiv128To64( aSig, 0, bSig );
4149 q = ( 2 < q ) ? q - 2 : 0;
4150 aSig = - ( ( bSig>>2 ) * q );
4151 expDiff -= 62;
4153 expDiff += 64;
4154 if ( 0 < expDiff ) {
4155 q = estimateDiv128To64( aSig, 0, bSig );
4156 q = ( 2 < q ) ? q - 2 : 0;
4157 q >>= 64 - expDiff;
4158 bSig >>= 2;
4159 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4161 else {
4162 aSig >>= 2;
4163 bSig >>= 2;
4165 do {
4166 alternateASig = aSig;
4167 ++q;
4168 aSig -= bSig;
4169 } while ( 0 <= (int64_t) aSig );
4170 sigMean = aSig + alternateASig;
4171 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4172 aSig = alternateASig;
4174 zSign = ( (int64_t) aSig < 0 );
4175 if ( zSign ) aSig = - aSig;
4176 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4180 /*----------------------------------------------------------------------------
4181 | Returns the binary log of the double-precision floating-point value `a'.
4182 | The operation is performed according to the IEC/IEEE Standard for Binary
4183 | Floating-Point Arithmetic.
4184 *----------------------------------------------------------------------------*/
4185 float64 float64_log2(float64 a, float_status *status)
4187 flag aSign, zSign;
4188 int aExp;
4189 uint64_t aSig, aSig0, aSig1, zSig, i;
4190 a = float64_squash_input_denormal(a, status);
4192 aSig = extractFloat64Frac( a );
4193 aExp = extractFloat64Exp( a );
4194 aSign = extractFloat64Sign( a );
4196 if ( aExp == 0 ) {
4197 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4198 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4200 if ( aSign ) {
4201 float_raise(float_flag_invalid, status);
4202 return float64_default_nan(status);
4204 if ( aExp == 0x7FF ) {
4205 if (aSig) {
4206 return propagateFloat64NaN(a, float64_zero, status);
4208 return a;
4211 aExp -= 0x3FF;
4212 aSig |= LIT64( 0x0010000000000000 );
4213 zSign = aExp < 0;
4214 zSig = (uint64_t)aExp << 52;
4215 for (i = 1LL << 51; i > 0; i >>= 1) {
4216 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4217 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4218 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4219 aSig >>= 1;
4220 zSig |= i;
4224 if ( zSign )
4225 zSig = -zSig;
4226 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4229 /*----------------------------------------------------------------------------
4230 | Returns 1 if the double-precision floating-point value `a' is equal to the
4231 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4232 | if either operand is a NaN. Otherwise, the comparison is performed
4233 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4234 *----------------------------------------------------------------------------*/
4236 int float64_eq(float64 a, float64 b, float_status *status)
4238 uint64_t av, bv;
4239 a = float64_squash_input_denormal(a, status);
4240 b = float64_squash_input_denormal(b, status);
4242 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4243 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4245 float_raise(float_flag_invalid, status);
4246 return 0;
4248 av = float64_val(a);
4249 bv = float64_val(b);
4250 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4254 /*----------------------------------------------------------------------------
4255 | Returns 1 if the double-precision floating-point value `a' is less than or
4256 | equal to the corresponding value `b', and 0 otherwise. The invalid
4257 | exception is raised if either operand is a NaN. The comparison is performed
4258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4259 *----------------------------------------------------------------------------*/
4261 int float64_le(float64 a, float64 b, float_status *status)
4263 flag aSign, bSign;
4264 uint64_t av, bv;
4265 a = float64_squash_input_denormal(a, status);
4266 b = float64_squash_input_denormal(b, status);
4268 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4269 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4271 float_raise(float_flag_invalid, status);
4272 return 0;
4274 aSign = extractFloat64Sign( a );
4275 bSign = extractFloat64Sign( b );
4276 av = float64_val(a);
4277 bv = float64_val(b);
4278 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4279 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4283 /*----------------------------------------------------------------------------
4284 | Returns 1 if the double-precision floating-point value `a' is less than
4285 | the corresponding value `b', and 0 otherwise. The invalid exception is
4286 | raised if either operand is a NaN. The comparison is performed according
4287 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4288 *----------------------------------------------------------------------------*/
4290 int float64_lt(float64 a, float64 b, float_status *status)
4292 flag aSign, bSign;
4293 uint64_t av, bv;
4295 a = float64_squash_input_denormal(a, status);
4296 b = float64_squash_input_denormal(b, status);
4297 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4298 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4300 float_raise(float_flag_invalid, status);
4301 return 0;
4303 aSign = extractFloat64Sign( a );
4304 bSign = extractFloat64Sign( b );
4305 av = float64_val(a);
4306 bv = float64_val(b);
4307 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4308 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4312 /*----------------------------------------------------------------------------
4313 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4314 | be compared, and 0 otherwise. The invalid exception is raised if either
4315 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4316 | Standard for Binary Floating-Point Arithmetic.
4317 *----------------------------------------------------------------------------*/
4319 int float64_unordered(float64 a, float64 b, float_status *status)
4321 a = float64_squash_input_denormal(a, status);
4322 b = float64_squash_input_denormal(b, status);
4324 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4325 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4327 float_raise(float_flag_invalid, status);
4328 return 1;
4330 return 0;
4333 /*----------------------------------------------------------------------------
4334 | Returns 1 if the double-precision floating-point value `a' is equal to the
4335 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4336 | exception.The comparison is performed according to the IEC/IEEE Standard
4337 | for Binary Floating-Point Arithmetic.
4338 *----------------------------------------------------------------------------*/
4340 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4342 uint64_t av, bv;
4343 a = float64_squash_input_denormal(a, status);
4344 b = float64_squash_input_denormal(b, status);
4346 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4347 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4349 if (float64_is_signaling_nan(a, status)
4350 || float64_is_signaling_nan(b, status)) {
4351 float_raise(float_flag_invalid, status);
4353 return 0;
4355 av = float64_val(a);
4356 bv = float64_val(b);
4357 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4361 /*----------------------------------------------------------------------------
4362 | Returns 1 if the double-precision floating-point value `a' is less than or
4363 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4364 | cause an exception. Otherwise, the comparison is performed according to the
4365 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4366 *----------------------------------------------------------------------------*/
4368 int float64_le_quiet(float64 a, float64 b, float_status *status)
4370 flag aSign, bSign;
4371 uint64_t av, bv;
4372 a = float64_squash_input_denormal(a, status);
4373 b = float64_squash_input_denormal(b, status);
4375 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4376 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4378 if (float64_is_signaling_nan(a, status)
4379 || float64_is_signaling_nan(b, status)) {
4380 float_raise(float_flag_invalid, status);
4382 return 0;
4384 aSign = extractFloat64Sign( a );
4385 bSign = extractFloat64Sign( b );
4386 av = float64_val(a);
4387 bv = float64_val(b);
4388 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4389 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4393 /*----------------------------------------------------------------------------
4394 | Returns 1 if the double-precision floating-point value `a' is less than
4395 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4396 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4397 | Standard for Binary Floating-Point Arithmetic.
4398 *----------------------------------------------------------------------------*/
4400 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4402 flag aSign, bSign;
4403 uint64_t av, bv;
4404 a = float64_squash_input_denormal(a, status);
4405 b = float64_squash_input_denormal(b, status);
4407 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4408 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4410 if (float64_is_signaling_nan(a, status)
4411 || float64_is_signaling_nan(b, status)) {
4412 float_raise(float_flag_invalid, status);
4414 return 0;
4416 aSign = extractFloat64Sign( a );
4417 bSign = extractFloat64Sign( b );
4418 av = float64_val(a);
4419 bv = float64_val(b);
4420 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4421 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4425 /*----------------------------------------------------------------------------
4426 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4427 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4428 | comparison is performed according to the IEC/IEEE Standard for Binary
4429 | Floating-Point Arithmetic.
4430 *----------------------------------------------------------------------------*/
4432 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4434 a = float64_squash_input_denormal(a, status);
4435 b = float64_squash_input_denormal(b, status);
4437 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4438 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4440 if (float64_is_signaling_nan(a, status)
4441 || float64_is_signaling_nan(b, status)) {
4442 float_raise(float_flag_invalid, status);
4444 return 1;
4446 return 0;
4449 /*----------------------------------------------------------------------------
4450 | Returns the result of converting the extended double-precision floating-
4451 | point value `a' to the 32-bit two's complement integer format. The
4452 | conversion is performed according to the IEC/IEEE Standard for Binary
4453 | Floating-Point Arithmetic---which means in particular that the conversion
4454 | is rounded according to the current rounding mode. If `a' is a NaN, the
4455 | largest positive integer is returned. Otherwise, if the conversion
4456 | overflows, the largest integer with the same sign as `a' is returned.
4457 *----------------------------------------------------------------------------*/
4459 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4461 flag aSign;
4462 int32_t aExp, shiftCount;
4463 uint64_t aSig;
4465 if (floatx80_invalid_encoding(a)) {
4466 float_raise(float_flag_invalid, status);
4467 return 1 << 31;
4469 aSig = extractFloatx80Frac( a );
4470 aExp = extractFloatx80Exp( a );
4471 aSign = extractFloatx80Sign( a );
4472 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4473 shiftCount = 0x4037 - aExp;
4474 if ( shiftCount <= 0 ) shiftCount = 1;
4475 shift64RightJamming( aSig, shiftCount, &aSig );
4476 return roundAndPackInt32(aSign, aSig, status);
4480 /*----------------------------------------------------------------------------
4481 | Returns the result of converting the extended double-precision floating-
4482 | point value `a' to the 32-bit two's complement integer format. The
4483 | conversion is performed according to the IEC/IEEE Standard for Binary
4484 | Floating-Point Arithmetic, except that the conversion is always rounded
4485 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4486 | Otherwise, if the conversion overflows, the largest integer with the same
4487 | sign as `a' is returned.
4488 *----------------------------------------------------------------------------*/
4490 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4492 flag aSign;
4493 int32_t aExp, shiftCount;
4494 uint64_t aSig, savedASig;
4495 int32_t z;
4497 if (floatx80_invalid_encoding(a)) {
4498 float_raise(float_flag_invalid, status);
4499 return 1 << 31;
4501 aSig = extractFloatx80Frac( a );
4502 aExp = extractFloatx80Exp( a );
4503 aSign = extractFloatx80Sign( a );
4504 if ( 0x401E < aExp ) {
4505 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4506 goto invalid;
4508 else if ( aExp < 0x3FFF ) {
4509 if (aExp || aSig) {
4510 status->float_exception_flags |= float_flag_inexact;
4512 return 0;
4514 shiftCount = 0x403E - aExp;
4515 savedASig = aSig;
4516 aSig >>= shiftCount;
4517 z = aSig;
4518 if ( aSign ) z = - z;
4519 if ( ( z < 0 ) ^ aSign ) {
4520 invalid:
4521 float_raise(float_flag_invalid, status);
4522 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4524 if ( ( aSig<<shiftCount ) != savedASig ) {
4525 status->float_exception_flags |= float_flag_inexact;
4527 return z;
4531 /*----------------------------------------------------------------------------
4532 | Returns the result of converting the extended double-precision floating-
4533 | point value `a' to the 64-bit two's complement integer format. The
4534 | conversion is performed according to the IEC/IEEE Standard for Binary
4535 | Floating-Point Arithmetic---which means in particular that the conversion
4536 | is rounded according to the current rounding mode. If `a' is a NaN,
4537 | the largest positive integer is returned. Otherwise, if the conversion
4538 | overflows, the largest integer with the same sign as `a' is returned.
4539 *----------------------------------------------------------------------------*/
4541 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4543 flag aSign;
4544 int32_t aExp, shiftCount;
4545 uint64_t aSig, aSigExtra;
4547 if (floatx80_invalid_encoding(a)) {
4548 float_raise(float_flag_invalid, status);
4549 return 1ULL << 63;
4551 aSig = extractFloatx80Frac( a );
4552 aExp = extractFloatx80Exp( a );
4553 aSign = extractFloatx80Sign( a );
4554 shiftCount = 0x403E - aExp;
4555 if ( shiftCount <= 0 ) {
4556 if ( shiftCount ) {
4557 float_raise(float_flag_invalid, status);
4558 if (!aSign || floatx80_is_any_nan(a)) {
4559 return LIT64( 0x7FFFFFFFFFFFFFFF );
4561 return (int64_t) LIT64( 0x8000000000000000 );
4563 aSigExtra = 0;
4565 else {
4566 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4568 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4572 /*----------------------------------------------------------------------------
4573 | Returns the result of converting the extended double-precision floating-
4574 | point value `a' to the 64-bit two's complement integer format. The
4575 | conversion is performed according to the IEC/IEEE Standard for Binary
4576 | Floating-Point Arithmetic, except that the conversion is always rounded
4577 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4578 | Otherwise, if the conversion overflows, the largest integer with the same
4579 | sign as `a' is returned.
4580 *----------------------------------------------------------------------------*/
4582 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4584 flag aSign;
4585 int32_t aExp, shiftCount;
4586 uint64_t aSig;
4587 int64_t z;
4589 if (floatx80_invalid_encoding(a)) {
4590 float_raise(float_flag_invalid, status);
4591 return 1ULL << 63;
4593 aSig = extractFloatx80Frac( a );
4594 aExp = extractFloatx80Exp( a );
4595 aSign = extractFloatx80Sign( a );
4596 shiftCount = aExp - 0x403E;
4597 if ( 0 <= shiftCount ) {
4598 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4599 if ( ( a.high != 0xC03E ) || aSig ) {
4600 float_raise(float_flag_invalid, status);
4601 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4602 return LIT64( 0x7FFFFFFFFFFFFFFF );
4605 return (int64_t) LIT64( 0x8000000000000000 );
4607 else if ( aExp < 0x3FFF ) {
4608 if (aExp | aSig) {
4609 status->float_exception_flags |= float_flag_inexact;
4611 return 0;
4613 z = aSig>>( - shiftCount );
4614 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4615 status->float_exception_flags |= float_flag_inexact;
4617 if ( aSign ) z = - z;
4618 return z;
4622 /*----------------------------------------------------------------------------
4623 | Returns the result of converting the extended double-precision floating-
4624 | point value `a' to the single-precision floating-point format. The
4625 | conversion is performed according to the IEC/IEEE Standard for Binary
4626 | Floating-Point Arithmetic.
4627 *----------------------------------------------------------------------------*/
4629 float32 floatx80_to_float32(floatx80 a, float_status *status)
4631 flag aSign;
4632 int32_t aExp;
4633 uint64_t aSig;
4635 if (floatx80_invalid_encoding(a)) {
4636 float_raise(float_flag_invalid, status);
4637 return float32_default_nan(status);
4639 aSig = extractFloatx80Frac( a );
4640 aExp = extractFloatx80Exp( a );
4641 aSign = extractFloatx80Sign( a );
4642 if ( aExp == 0x7FFF ) {
4643 if ( (uint64_t) ( aSig<<1 ) ) {
4644 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4646 return packFloat32( aSign, 0xFF, 0 );
4648 shift64RightJamming( aSig, 33, &aSig );
4649 if ( aExp || aSig ) aExp -= 0x3F81;
4650 return roundAndPackFloat32(aSign, aExp, aSig, status);
4654 /*----------------------------------------------------------------------------
4655 | Returns the result of converting the extended double-precision floating-
4656 | point value `a' to the double-precision floating-point format. The
4657 | conversion is performed according to the IEC/IEEE Standard for Binary
4658 | Floating-Point Arithmetic.
4659 *----------------------------------------------------------------------------*/
4661 float64 floatx80_to_float64(floatx80 a, float_status *status)
4663 flag aSign;
4664 int32_t aExp;
4665 uint64_t aSig, zSig;
4667 if (floatx80_invalid_encoding(a)) {
4668 float_raise(float_flag_invalid, status);
4669 return float64_default_nan(status);
4671 aSig = extractFloatx80Frac( a );
4672 aExp = extractFloatx80Exp( a );
4673 aSign = extractFloatx80Sign( a );
4674 if ( aExp == 0x7FFF ) {
4675 if ( (uint64_t) ( aSig<<1 ) ) {
4676 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4678 return packFloat64( aSign, 0x7FF, 0 );
4680 shift64RightJamming( aSig, 1, &zSig );
4681 if ( aExp || aSig ) aExp -= 0x3C01;
4682 return roundAndPackFloat64(aSign, aExp, zSig, status);
4686 /*----------------------------------------------------------------------------
4687 | Returns the result of converting the extended double-precision floating-
4688 | point value `a' to the quadruple-precision floating-point format. The
4689 | conversion is performed according to the IEC/IEEE Standard for Binary
4690 | Floating-Point Arithmetic.
4691 *----------------------------------------------------------------------------*/
4693 float128 floatx80_to_float128(floatx80 a, float_status *status)
4695 flag aSign;
4696 int aExp;
4697 uint64_t aSig, zSig0, zSig1;
4699 if (floatx80_invalid_encoding(a)) {
4700 float_raise(float_flag_invalid, status);
4701 return float128_default_nan(status);
4703 aSig = extractFloatx80Frac( a );
4704 aExp = extractFloatx80Exp( a );
4705 aSign = extractFloatx80Sign( a );
4706 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4707 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4709 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4710 return packFloat128( aSign, aExp, zSig0, zSig1 );
4714 /*----------------------------------------------------------------------------
4715 | Rounds the extended double-precision floating-point value `a'
4716 | to the precision provided by floatx80_rounding_precision and returns the
4717 | result as an extended double-precision floating-point value.
4718 | The operation is performed according to the IEC/IEEE Standard for Binary
4719 | Floating-Point Arithmetic.
4720 *----------------------------------------------------------------------------*/
4722 floatx80 floatx80_round(floatx80 a, float_status *status)
4724 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4725 extractFloatx80Sign(a),
4726 extractFloatx80Exp(a),
4727 extractFloatx80Frac(a), 0, status);
4730 /*----------------------------------------------------------------------------
4731 | Rounds the extended double-precision floating-point value `a' to an integer,
4732 | and returns the result as an extended quadruple-precision floating-point
4733 | value. The operation is performed according to the IEC/IEEE Standard for
4734 | Binary Floating-Point Arithmetic.
4735 *----------------------------------------------------------------------------*/
4737 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4739 flag aSign;
4740 int32_t aExp;
4741 uint64_t lastBitMask, roundBitsMask;
4742 floatx80 z;
4744 if (floatx80_invalid_encoding(a)) {
4745 float_raise(float_flag_invalid, status);
4746 return floatx80_default_nan(status);
4748 aExp = extractFloatx80Exp( a );
4749 if ( 0x403E <= aExp ) {
4750 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4751 return propagateFloatx80NaN(a, a, status);
4753 return a;
4755 if ( aExp < 0x3FFF ) {
4756 if ( ( aExp == 0 )
4757 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4758 return a;
4760 status->float_exception_flags |= float_flag_inexact;
4761 aSign = extractFloatx80Sign( a );
4762 switch (status->float_rounding_mode) {
4763 case float_round_nearest_even:
4764 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4766 return
4767 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4769 break;
4770 case float_round_ties_away:
4771 if (aExp == 0x3FFE) {
4772 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4774 break;
4775 case float_round_down:
4776 return
4777 aSign ?
4778 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4779 : packFloatx80( 0, 0, 0 );
4780 case float_round_up:
4781 return
4782 aSign ? packFloatx80( 1, 0, 0 )
4783 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4785 return packFloatx80( aSign, 0, 0 );
4787 lastBitMask = 1;
4788 lastBitMask <<= 0x403E - aExp;
4789 roundBitsMask = lastBitMask - 1;
4790 z = a;
4791 switch (status->float_rounding_mode) {
4792 case float_round_nearest_even:
4793 z.low += lastBitMask>>1;
4794 if ((z.low & roundBitsMask) == 0) {
4795 z.low &= ~lastBitMask;
4797 break;
4798 case float_round_ties_away:
4799 z.low += lastBitMask >> 1;
4800 break;
4801 case float_round_to_zero:
4802 break;
4803 case float_round_up:
4804 if (!extractFloatx80Sign(z)) {
4805 z.low += roundBitsMask;
4807 break;
4808 case float_round_down:
4809 if (extractFloatx80Sign(z)) {
4810 z.low += roundBitsMask;
4812 break;
4813 default:
4814 abort();
4816 z.low &= ~ roundBitsMask;
4817 if ( z.low == 0 ) {
4818 ++z.high;
4819 z.low = LIT64( 0x8000000000000000 );
4821 if (z.low != a.low) {
4822 status->float_exception_flags |= float_flag_inexact;
4824 return z;
4828 /*----------------------------------------------------------------------------
4829 | Returns the result of adding the absolute values of the extended double-
4830 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4831 | negated before being returned. `zSign' is ignored if the result is a NaN.
4832 | The addition is performed according to the IEC/IEEE Standard for Binary
4833 | Floating-Point Arithmetic.
4834 *----------------------------------------------------------------------------*/
4836 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4837 float_status *status)
4839 int32_t aExp, bExp, zExp;
4840 uint64_t aSig, bSig, zSig0, zSig1;
4841 int32_t expDiff;
4843 aSig = extractFloatx80Frac( a );
4844 aExp = extractFloatx80Exp( a );
4845 bSig = extractFloatx80Frac( b );
4846 bExp = extractFloatx80Exp( b );
4847 expDiff = aExp - bExp;
4848 if ( 0 < expDiff ) {
4849 if ( aExp == 0x7FFF ) {
4850 if ((uint64_t)(aSig << 1)) {
4851 return propagateFloatx80NaN(a, b, status);
4853 return a;
4855 if ( bExp == 0 ) --expDiff;
4856 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4857 zExp = aExp;
4859 else if ( expDiff < 0 ) {
4860 if ( bExp == 0x7FFF ) {
4861 if ((uint64_t)(bSig << 1)) {
4862 return propagateFloatx80NaN(a, b, status);
4864 return packFloatx80(zSign,
4865 floatx80_infinity_high,
4866 floatx80_infinity_low);
4868 if ( aExp == 0 ) ++expDiff;
4869 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4870 zExp = bExp;
4872 else {
4873 if ( aExp == 0x7FFF ) {
4874 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4875 return propagateFloatx80NaN(a, b, status);
4877 return a;
4879 zSig1 = 0;
4880 zSig0 = aSig + bSig;
4881 if ( aExp == 0 ) {
4882 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4883 goto roundAndPack;
4885 zExp = aExp;
4886 goto shiftRight1;
4888 zSig0 = aSig + bSig;
4889 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4890 shiftRight1:
4891 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4892 zSig0 |= LIT64( 0x8000000000000000 );
4893 ++zExp;
4894 roundAndPack:
4895 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4896 zSign, zExp, zSig0, zSig1, status);
4899 /*----------------------------------------------------------------------------
4900 | Returns the result of subtracting the absolute values of the extended
4901 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4902 | difference is negated before being returned. `zSign' is ignored if the
4903 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4904 | Standard for Binary Floating-Point Arithmetic.
4905 *----------------------------------------------------------------------------*/
4907 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4908 float_status *status)
4910 int32_t aExp, bExp, zExp;
4911 uint64_t aSig, bSig, zSig0, zSig1;
4912 int32_t expDiff;
4914 aSig = extractFloatx80Frac( a );
4915 aExp = extractFloatx80Exp( a );
4916 bSig = extractFloatx80Frac( b );
4917 bExp = extractFloatx80Exp( b );
4918 expDiff = aExp - bExp;
4919 if ( 0 < expDiff ) goto aExpBigger;
4920 if ( expDiff < 0 ) goto bExpBigger;
4921 if ( aExp == 0x7FFF ) {
4922 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4923 return propagateFloatx80NaN(a, b, status);
4925 float_raise(float_flag_invalid, status);
4926 return floatx80_default_nan(status);
4928 if ( aExp == 0 ) {
4929 aExp = 1;
4930 bExp = 1;
4932 zSig1 = 0;
4933 if ( bSig < aSig ) goto aBigger;
4934 if ( aSig < bSig ) goto bBigger;
4935 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4936 bExpBigger:
4937 if ( bExp == 0x7FFF ) {
4938 if ((uint64_t)(bSig << 1)) {
4939 return propagateFloatx80NaN(a, b, status);
4941 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4942 floatx80_infinity_low);
4944 if ( aExp == 0 ) ++expDiff;
4945 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4946 bBigger:
4947 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4948 zExp = bExp;
4949 zSign ^= 1;
4950 goto normalizeRoundAndPack;
4951 aExpBigger:
4952 if ( aExp == 0x7FFF ) {
4953 if ((uint64_t)(aSig << 1)) {
4954 return propagateFloatx80NaN(a, b, status);
4956 return a;
4958 if ( bExp == 0 ) --expDiff;
4959 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4960 aBigger:
4961 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4962 zExp = aExp;
4963 normalizeRoundAndPack:
4964 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4965 zSign, zExp, zSig0, zSig1, status);
4968 /*----------------------------------------------------------------------------
4969 | Returns the result of adding the extended double-precision floating-point
4970 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4971 | Standard for Binary Floating-Point Arithmetic.
4972 *----------------------------------------------------------------------------*/
4974 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4976 flag aSign, bSign;
4978 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4979 float_raise(float_flag_invalid, status);
4980 return floatx80_default_nan(status);
4982 aSign = extractFloatx80Sign( a );
4983 bSign = extractFloatx80Sign( b );
4984 if ( aSign == bSign ) {
4985 return addFloatx80Sigs(a, b, aSign, status);
4987 else {
4988 return subFloatx80Sigs(a, b, aSign, status);
4993 /*----------------------------------------------------------------------------
4994 | Returns the result of subtracting the extended double-precision floating-
4995 | point values `a' and `b'. The operation is performed according to the
4996 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4997 *----------------------------------------------------------------------------*/
4999 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5001 flag aSign, bSign;
5003 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5004 float_raise(float_flag_invalid, status);
5005 return floatx80_default_nan(status);
5007 aSign = extractFloatx80Sign( a );
5008 bSign = extractFloatx80Sign( b );
5009 if ( aSign == bSign ) {
5010 return subFloatx80Sigs(a, b, aSign, status);
5012 else {
5013 return addFloatx80Sigs(a, b, aSign, status);
5018 /*----------------------------------------------------------------------------
5019 | Returns the result of multiplying the extended double-precision floating-
5020 | point values `a' and `b'. The operation is performed according to the
5021 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5022 *----------------------------------------------------------------------------*/
5024 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5026 flag aSign, bSign, zSign;
5027 int32_t aExp, bExp, zExp;
5028 uint64_t aSig, bSig, zSig0, zSig1;
5030 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5031 float_raise(float_flag_invalid, status);
5032 return floatx80_default_nan(status);
5034 aSig = extractFloatx80Frac( a );
5035 aExp = extractFloatx80Exp( a );
5036 aSign = extractFloatx80Sign( a );
5037 bSig = extractFloatx80Frac( b );
5038 bExp = extractFloatx80Exp( b );
5039 bSign = extractFloatx80Sign( b );
5040 zSign = aSign ^ bSign;
5041 if ( aExp == 0x7FFF ) {
5042 if ( (uint64_t) ( aSig<<1 )
5043 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5044 return propagateFloatx80NaN(a, b, status);
5046 if ( ( bExp | bSig ) == 0 ) goto invalid;
5047 return packFloatx80(zSign, floatx80_infinity_high,
5048 floatx80_infinity_low);
5050 if ( bExp == 0x7FFF ) {
5051 if ((uint64_t)(bSig << 1)) {
5052 return propagateFloatx80NaN(a, b, status);
5054 if ( ( aExp | aSig ) == 0 ) {
5055 invalid:
5056 float_raise(float_flag_invalid, status);
5057 return floatx80_default_nan(status);
5059 return packFloatx80(zSign, floatx80_infinity_high,
5060 floatx80_infinity_low);
5062 if ( aExp == 0 ) {
5063 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5064 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5066 if ( bExp == 0 ) {
5067 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5068 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5070 zExp = aExp + bExp - 0x3FFE;
5071 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5072 if ( 0 < (int64_t) zSig0 ) {
5073 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5074 --zExp;
5076 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5077 zSign, zExp, zSig0, zSig1, status);
5080 /*----------------------------------------------------------------------------
5081 | Returns the result of dividing the extended double-precision floating-point
5082 | value `a' by the corresponding value `b'. The operation is performed
5083 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5084 *----------------------------------------------------------------------------*/
5086 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5088 flag aSign, bSign, zSign;
5089 int32_t aExp, bExp, zExp;
5090 uint64_t aSig, bSig, zSig0, zSig1;
5091 uint64_t rem0, rem1, rem2, term0, term1, term2;
5093 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5094 float_raise(float_flag_invalid, status);
5095 return floatx80_default_nan(status);
5097 aSig = extractFloatx80Frac( a );
5098 aExp = extractFloatx80Exp( a );
5099 aSign = extractFloatx80Sign( a );
5100 bSig = extractFloatx80Frac( b );
5101 bExp = extractFloatx80Exp( b );
5102 bSign = extractFloatx80Sign( b );
5103 zSign = aSign ^ bSign;
5104 if ( aExp == 0x7FFF ) {
5105 if ((uint64_t)(aSig << 1)) {
5106 return propagateFloatx80NaN(a, b, status);
5108 if ( bExp == 0x7FFF ) {
5109 if ((uint64_t)(bSig << 1)) {
5110 return propagateFloatx80NaN(a, b, status);
5112 goto invalid;
5114 return packFloatx80(zSign, floatx80_infinity_high,
5115 floatx80_infinity_low);
5117 if ( bExp == 0x7FFF ) {
5118 if ((uint64_t)(bSig << 1)) {
5119 return propagateFloatx80NaN(a, b, status);
5121 return packFloatx80( zSign, 0, 0 );
5123 if ( bExp == 0 ) {
5124 if ( bSig == 0 ) {
5125 if ( ( aExp | aSig ) == 0 ) {
5126 invalid:
5127 float_raise(float_flag_invalid, status);
5128 return floatx80_default_nan(status);
5130 float_raise(float_flag_divbyzero, status);
5131 return packFloatx80(zSign, floatx80_infinity_high,
5132 floatx80_infinity_low);
5134 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5136 if ( aExp == 0 ) {
5137 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5138 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5140 zExp = aExp - bExp + 0x3FFE;
5141 rem1 = 0;
5142 if ( bSig <= aSig ) {
5143 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5144 ++zExp;
5146 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5147 mul64To128( bSig, zSig0, &term0, &term1 );
5148 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5149 while ( (int64_t) rem0 < 0 ) {
5150 --zSig0;
5151 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5153 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5154 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5155 mul64To128( bSig, zSig1, &term1, &term2 );
5156 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5157 while ( (int64_t) rem1 < 0 ) {
5158 --zSig1;
5159 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5161 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5163 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5164 zSign, zExp, zSig0, zSig1, status);
5167 /*----------------------------------------------------------------------------
5168 | Returns the remainder of the extended double-precision floating-point value
5169 | `a' with respect to the corresponding value `b'. The operation is performed
5170 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5173 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5175 flag aSign, zSign;
5176 int32_t aExp, bExp, expDiff;
5177 uint64_t aSig0, aSig1, bSig;
5178 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5180 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5181 float_raise(float_flag_invalid, status);
5182 return floatx80_default_nan(status);
5184 aSig0 = extractFloatx80Frac( a );
5185 aExp = extractFloatx80Exp( a );
5186 aSign = extractFloatx80Sign( a );
5187 bSig = extractFloatx80Frac( b );
5188 bExp = extractFloatx80Exp( b );
5189 if ( aExp == 0x7FFF ) {
5190 if ( (uint64_t) ( aSig0<<1 )
5191 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5192 return propagateFloatx80NaN(a, b, status);
5194 goto invalid;
5196 if ( bExp == 0x7FFF ) {
5197 if ((uint64_t)(bSig << 1)) {
5198 return propagateFloatx80NaN(a, b, status);
5200 return a;
5202 if ( bExp == 0 ) {
5203 if ( bSig == 0 ) {
5204 invalid:
5205 float_raise(float_flag_invalid, status);
5206 return floatx80_default_nan(status);
5208 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5210 if ( aExp == 0 ) {
5211 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5212 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5214 bSig |= LIT64( 0x8000000000000000 );
5215 zSign = aSign;
5216 expDiff = aExp - bExp;
5217 aSig1 = 0;
5218 if ( expDiff < 0 ) {
5219 if ( expDiff < -1 ) return a;
5220 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5221 expDiff = 0;
5223 q = ( bSig <= aSig0 );
5224 if ( q ) aSig0 -= bSig;
5225 expDiff -= 64;
5226 while ( 0 < expDiff ) {
5227 q = estimateDiv128To64( aSig0, aSig1, bSig );
5228 q = ( 2 < q ) ? q - 2 : 0;
5229 mul64To128( bSig, q, &term0, &term1 );
5230 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5231 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5232 expDiff -= 62;
5234 expDiff += 64;
5235 if ( 0 < expDiff ) {
5236 q = estimateDiv128To64( aSig0, aSig1, bSig );
5237 q = ( 2 < q ) ? q - 2 : 0;
5238 q >>= 64 - expDiff;
5239 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5240 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5241 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5242 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5243 ++q;
5244 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5247 else {
5248 term1 = 0;
5249 term0 = bSig;
5251 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5252 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5253 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5254 && ( q & 1 ) )
5256 aSig0 = alternateASig0;
5257 aSig1 = alternateASig1;
5258 zSign = ! zSign;
5260 return
5261 normalizeRoundAndPackFloatx80(
5262 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5266 /*----------------------------------------------------------------------------
5267 | Returns the square root of the extended double-precision floating-point
5268 | value `a'. The operation is performed according to the IEC/IEEE Standard
5269 | for Binary Floating-Point Arithmetic.
5270 *----------------------------------------------------------------------------*/
5272 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5274 flag aSign;
5275 int32_t aExp, zExp;
5276 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5277 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5279 if (floatx80_invalid_encoding(a)) {
5280 float_raise(float_flag_invalid, status);
5281 return floatx80_default_nan(status);
5283 aSig0 = extractFloatx80Frac( a );
5284 aExp = extractFloatx80Exp( a );
5285 aSign = extractFloatx80Sign( a );
5286 if ( aExp == 0x7FFF ) {
5287 if ((uint64_t)(aSig0 << 1)) {
5288 return propagateFloatx80NaN(a, a, status);
5290 if ( ! aSign ) return a;
5291 goto invalid;
5293 if ( aSign ) {
5294 if ( ( aExp | aSig0 ) == 0 ) return a;
5295 invalid:
5296 float_raise(float_flag_invalid, status);
5297 return floatx80_default_nan(status);
5299 if ( aExp == 0 ) {
5300 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5301 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5303 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5304 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5305 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5306 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5307 doubleZSig0 = zSig0<<1;
5308 mul64To128( zSig0, zSig0, &term0, &term1 );
5309 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5310 while ( (int64_t) rem0 < 0 ) {
5311 --zSig0;
5312 doubleZSig0 -= 2;
5313 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5315 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5316 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5317 if ( zSig1 == 0 ) zSig1 = 1;
5318 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5319 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5320 mul64To128( zSig1, zSig1, &term2, &term3 );
5321 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5322 while ( (int64_t) rem1 < 0 ) {
5323 --zSig1;
5324 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5325 term3 |= 1;
5326 term2 |= doubleZSig0;
5327 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5329 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5331 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5332 zSig0 |= doubleZSig0;
5333 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5334 0, zExp, zSig0, zSig1, status);
5337 /*----------------------------------------------------------------------------
5338 | Returns 1 if the extended double-precision floating-point value `a' is equal
5339 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5340 | raised if either operand is a NaN. Otherwise, the comparison is performed
5341 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5342 *----------------------------------------------------------------------------*/
5344 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5347 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5348 || (extractFloatx80Exp(a) == 0x7FFF
5349 && (uint64_t) (extractFloatx80Frac(a) << 1))
5350 || (extractFloatx80Exp(b) == 0x7FFF
5351 && (uint64_t) (extractFloatx80Frac(b) << 1))
5353 float_raise(float_flag_invalid, status);
5354 return 0;
5356 return
5357 ( a.low == b.low )
5358 && ( ( a.high == b.high )
5359 || ( ( a.low == 0 )
5360 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5365 /*----------------------------------------------------------------------------
5366 | Returns 1 if the extended double-precision floating-point value `a' is
5367 | less than or equal to the corresponding value `b', and 0 otherwise. The
5368 | invalid exception is raised if either operand is a NaN. The comparison is
5369 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5370 | Arithmetic.
5371 *----------------------------------------------------------------------------*/
5373 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5375 flag aSign, bSign;
5377 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5378 || (extractFloatx80Exp(a) == 0x7FFF
5379 && (uint64_t) (extractFloatx80Frac(a) << 1))
5380 || (extractFloatx80Exp(b) == 0x7FFF
5381 && (uint64_t) (extractFloatx80Frac(b) << 1))
5383 float_raise(float_flag_invalid, status);
5384 return 0;
5386 aSign = extractFloatx80Sign( a );
5387 bSign = extractFloatx80Sign( b );
5388 if ( aSign != bSign ) {
5389 return
5390 aSign
5391 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5392 == 0 );
5394 return
5395 aSign ? le128( b.high, b.low, a.high, a.low )
5396 : le128( a.high, a.low, b.high, b.low );
5400 /*----------------------------------------------------------------------------
5401 | Returns 1 if the extended double-precision floating-point value `a' is
5402 | less than the corresponding value `b', and 0 otherwise. The invalid
5403 | exception is raised if either operand is a NaN. The comparison is performed
5404 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5405 *----------------------------------------------------------------------------*/
5407 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5409 flag aSign, bSign;
5411 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5412 || (extractFloatx80Exp(a) == 0x7FFF
5413 && (uint64_t) (extractFloatx80Frac(a) << 1))
5414 || (extractFloatx80Exp(b) == 0x7FFF
5415 && (uint64_t) (extractFloatx80Frac(b) << 1))
5417 float_raise(float_flag_invalid, status);
5418 return 0;
5420 aSign = extractFloatx80Sign( a );
5421 bSign = extractFloatx80Sign( b );
5422 if ( aSign != bSign ) {
5423 return
5424 aSign
5425 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5426 != 0 );
5428 return
5429 aSign ? lt128( b.high, b.low, a.high, a.low )
5430 : lt128( a.high, a.low, b.high, b.low );
5434 /*----------------------------------------------------------------------------
5435 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5436 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5437 | either operand is a NaN. The comparison is performed according to the
5438 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5439 *----------------------------------------------------------------------------*/
5440 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5442 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5443 || (extractFloatx80Exp(a) == 0x7FFF
5444 && (uint64_t) (extractFloatx80Frac(a) << 1))
5445 || (extractFloatx80Exp(b) == 0x7FFF
5446 && (uint64_t) (extractFloatx80Frac(b) << 1))
5448 float_raise(float_flag_invalid, status);
5449 return 1;
5451 return 0;
5454 /*----------------------------------------------------------------------------
5455 | Returns 1 if the extended double-precision floating-point value `a' is
5456 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5457 | cause an exception. The comparison is performed according to the IEC/IEEE
5458 | Standard for Binary Floating-Point Arithmetic.
5459 *----------------------------------------------------------------------------*/
5461 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5464 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5465 float_raise(float_flag_invalid, status);
5466 return 0;
5468 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5469 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5470 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5471 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5473 if (floatx80_is_signaling_nan(a, status)
5474 || floatx80_is_signaling_nan(b, status)) {
5475 float_raise(float_flag_invalid, status);
5477 return 0;
5479 return
5480 ( a.low == b.low )
5481 && ( ( a.high == b.high )
5482 || ( ( a.low == 0 )
5483 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5488 /*----------------------------------------------------------------------------
5489 | Returns 1 if the extended double-precision floating-point value `a' is less
5490 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5491 | do not cause an exception. Otherwise, the comparison is performed according
5492 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5493 *----------------------------------------------------------------------------*/
5495 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5497 flag aSign, bSign;
5499 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5500 float_raise(float_flag_invalid, status);
5501 return 0;
5503 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5504 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5505 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5506 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5508 if (floatx80_is_signaling_nan(a, status)
5509 || floatx80_is_signaling_nan(b, status)) {
5510 float_raise(float_flag_invalid, status);
5512 return 0;
5514 aSign = extractFloatx80Sign( a );
5515 bSign = extractFloatx80Sign( b );
5516 if ( aSign != bSign ) {
5517 return
5518 aSign
5519 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5520 == 0 );
5522 return
5523 aSign ? le128( b.high, b.low, a.high, a.low )
5524 : le128( a.high, a.low, b.high, b.low );
5528 /*----------------------------------------------------------------------------
5529 | Returns 1 if the extended double-precision floating-point value `a' is less
5530 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5531 | an exception. Otherwise, the comparison is performed according to the
5532 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5533 *----------------------------------------------------------------------------*/
5535 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5537 flag aSign, bSign;
5539 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5540 float_raise(float_flag_invalid, status);
5541 return 0;
5543 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5544 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5545 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5546 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5548 if (floatx80_is_signaling_nan(a, status)
5549 || floatx80_is_signaling_nan(b, status)) {
5550 float_raise(float_flag_invalid, status);
5552 return 0;
5554 aSign = extractFloatx80Sign( a );
5555 bSign = extractFloatx80Sign( b );
5556 if ( aSign != bSign ) {
5557 return
5558 aSign
5559 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5560 != 0 );
5562 return
5563 aSign ? lt128( b.high, b.low, a.high, a.low )
5564 : lt128( a.high, a.low, b.high, b.low );
5568 /*----------------------------------------------------------------------------
5569 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5570 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5571 | The comparison is performed according to the IEC/IEEE Standard for Binary
5572 | Floating-Point Arithmetic.
5573 *----------------------------------------------------------------------------*/
5574 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5576 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5577 float_raise(float_flag_invalid, status);
5578 return 1;
5580 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5581 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5582 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5583 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5585 if (floatx80_is_signaling_nan(a, status)
5586 || floatx80_is_signaling_nan(b, status)) {
5587 float_raise(float_flag_invalid, status);
5589 return 1;
5591 return 0;
5594 /*----------------------------------------------------------------------------
5595 | Returns the result of converting the quadruple-precision floating-point
5596 | value `a' to the 32-bit two's complement integer format. The conversion
5597 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5598 | Arithmetic---which means in particular that the conversion is rounded
5599 | according to the current rounding mode. If `a' is a NaN, the largest
5600 | positive integer is returned. Otherwise, if the conversion overflows, the
5601 | largest integer with the same sign as `a' is returned.
5602 *----------------------------------------------------------------------------*/
5604 int32_t float128_to_int32(float128 a, float_status *status)
5606 flag aSign;
5607 int32_t aExp, shiftCount;
5608 uint64_t aSig0, aSig1;
5610 aSig1 = extractFloat128Frac1( a );
5611 aSig0 = extractFloat128Frac0( a );
5612 aExp = extractFloat128Exp( a );
5613 aSign = extractFloat128Sign( a );
5614 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5615 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5616 aSig0 |= ( aSig1 != 0 );
5617 shiftCount = 0x4028 - aExp;
5618 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5619 return roundAndPackInt32(aSign, aSig0, status);
5623 /*----------------------------------------------------------------------------
5624 | Returns the result of converting the quadruple-precision floating-point
5625 | value `a' to the 32-bit two's complement integer format. The conversion
5626 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5627 | Arithmetic, except that the conversion is always rounded toward zero. If
5628 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5629 | conversion overflows, the largest integer with the same sign as `a' is
5630 | returned.
5631 *----------------------------------------------------------------------------*/
5633 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5635 flag aSign;
5636 int32_t aExp, shiftCount;
5637 uint64_t aSig0, aSig1, savedASig;
5638 int32_t z;
5640 aSig1 = extractFloat128Frac1( a );
5641 aSig0 = extractFloat128Frac0( a );
5642 aExp = extractFloat128Exp( a );
5643 aSign = extractFloat128Sign( a );
5644 aSig0 |= ( aSig1 != 0 );
5645 if ( 0x401E < aExp ) {
5646 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5647 goto invalid;
5649 else if ( aExp < 0x3FFF ) {
5650 if (aExp || aSig0) {
5651 status->float_exception_flags |= float_flag_inexact;
5653 return 0;
5655 aSig0 |= LIT64( 0x0001000000000000 );
5656 shiftCount = 0x402F - aExp;
5657 savedASig = aSig0;
5658 aSig0 >>= shiftCount;
5659 z = aSig0;
5660 if ( aSign ) z = - z;
5661 if ( ( z < 0 ) ^ aSign ) {
5662 invalid:
5663 float_raise(float_flag_invalid, status);
5664 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5666 if ( ( aSig0<<shiftCount ) != savedASig ) {
5667 status->float_exception_flags |= float_flag_inexact;
5669 return z;
5673 /*----------------------------------------------------------------------------
5674 | Returns the result of converting the quadruple-precision floating-point
5675 | value `a' to the 64-bit two's complement integer format. The conversion
5676 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5677 | Arithmetic---which means in particular that the conversion is rounded
5678 | according to the current rounding mode. If `a' is a NaN, the largest
5679 | positive integer is returned. Otherwise, if the conversion overflows, the
5680 | largest integer with the same sign as `a' is returned.
5681 *----------------------------------------------------------------------------*/
5683 int64_t float128_to_int64(float128 a, float_status *status)
5685 flag aSign;
5686 int32_t aExp, shiftCount;
5687 uint64_t aSig0, aSig1;
5689 aSig1 = extractFloat128Frac1( a );
5690 aSig0 = extractFloat128Frac0( a );
5691 aExp = extractFloat128Exp( a );
5692 aSign = extractFloat128Sign( a );
5693 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5694 shiftCount = 0x402F - aExp;
5695 if ( shiftCount <= 0 ) {
5696 if ( 0x403E < aExp ) {
5697 float_raise(float_flag_invalid, status);
5698 if ( ! aSign
5699 || ( ( aExp == 0x7FFF )
5700 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5703 return LIT64( 0x7FFFFFFFFFFFFFFF );
5705 return (int64_t) LIT64( 0x8000000000000000 );
5707 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5709 else {
5710 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5712 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5716 /*----------------------------------------------------------------------------
5717 | Returns the result of converting the quadruple-precision floating-point
5718 | value `a' to the 64-bit two's complement integer format. The conversion
5719 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5720 | Arithmetic, except that the conversion is always rounded toward zero.
5721 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5722 | the conversion overflows, the largest integer with the same sign as `a' is
5723 | returned.
5724 *----------------------------------------------------------------------------*/
5726 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5728 flag aSign;
5729 int32_t aExp, shiftCount;
5730 uint64_t aSig0, aSig1;
5731 int64_t z;
5733 aSig1 = extractFloat128Frac1( a );
5734 aSig0 = extractFloat128Frac0( a );
5735 aExp = extractFloat128Exp( a );
5736 aSign = extractFloat128Sign( a );
5737 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5738 shiftCount = aExp - 0x402F;
5739 if ( 0 < shiftCount ) {
5740 if ( 0x403E <= aExp ) {
5741 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5742 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5743 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5744 if (aSig1) {
5745 status->float_exception_flags |= float_flag_inexact;
5748 else {
5749 float_raise(float_flag_invalid, status);
5750 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5751 return LIT64( 0x7FFFFFFFFFFFFFFF );
5754 return (int64_t) LIT64( 0x8000000000000000 );
5756 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5757 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5758 status->float_exception_flags |= float_flag_inexact;
5761 else {
5762 if ( aExp < 0x3FFF ) {
5763 if ( aExp | aSig0 | aSig1 ) {
5764 status->float_exception_flags |= float_flag_inexact;
5766 return 0;
5768 z = aSig0>>( - shiftCount );
5769 if ( aSig1
5770 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5771 status->float_exception_flags |= float_flag_inexact;
5774 if ( aSign ) z = - z;
5775 return z;
5779 /*----------------------------------------------------------------------------
5780 | Returns the result of converting the quadruple-precision floating-point value
5781 | `a' to the 64-bit unsigned integer format. The conversion is
5782 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5783 | Arithmetic---which means in particular that the conversion is rounded
5784 | according to the current rounding mode. If `a' is a NaN, the largest
5785 | positive integer is returned. If the conversion overflows, the
5786 | largest unsigned integer is returned. If 'a' is negative, the value is
5787 | rounded and zero is returned; negative values that do not round to zero
5788 | will raise the inexact exception.
5789 *----------------------------------------------------------------------------*/
5791 uint64_t float128_to_uint64(float128 a, float_status *status)
5793 flag aSign;
5794 int aExp;
5795 int shiftCount;
5796 uint64_t aSig0, aSig1;
5798 aSig0 = extractFloat128Frac0(a);
5799 aSig1 = extractFloat128Frac1(a);
5800 aExp = extractFloat128Exp(a);
5801 aSign = extractFloat128Sign(a);
5802 if (aSign && (aExp > 0x3FFE)) {
5803 float_raise(float_flag_invalid, status);
5804 if (float128_is_any_nan(a)) {
5805 return LIT64(0xFFFFFFFFFFFFFFFF);
5806 } else {
5807 return 0;
5810 if (aExp) {
5811 aSig0 |= LIT64(0x0001000000000000);
5813 shiftCount = 0x402F - aExp;
5814 if (shiftCount <= 0) {
5815 if (0x403E < aExp) {
5816 float_raise(float_flag_invalid, status);
5817 return LIT64(0xFFFFFFFFFFFFFFFF);
5819 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5820 } else {
5821 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5823 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5826 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5828 uint64_t v;
5829 signed char current_rounding_mode = status->float_rounding_mode;
5831 set_float_rounding_mode(float_round_to_zero, status);
5832 v = float128_to_uint64(a, status);
5833 set_float_rounding_mode(current_rounding_mode, status);
5835 return v;
5838 /*----------------------------------------------------------------------------
5839 | Returns the result of converting the quadruple-precision floating-point
5840 | value `a' to the 32-bit unsigned integer format. The conversion
5841 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5842 | Arithmetic except that the conversion is always rounded toward zero.
5843 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5844 | if the conversion overflows, the largest unsigned integer is returned.
5845 | If 'a' is negative, the value is rounded and zero is returned; negative
5846 | values that do not round to zero will raise the inexact exception.
5847 *----------------------------------------------------------------------------*/
5849 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5851 uint64_t v;
5852 uint32_t res;
5853 int old_exc_flags = get_float_exception_flags(status);
5855 v = float128_to_uint64_round_to_zero(a, status);
5856 if (v > 0xffffffff) {
5857 res = 0xffffffff;
5858 } else {
5859 return v;
5861 set_float_exception_flags(old_exc_flags, status);
5862 float_raise(float_flag_invalid, status);
5863 return res;
5866 /*----------------------------------------------------------------------------
5867 | Returns the result of converting the quadruple-precision floating-point
5868 | value `a' to the single-precision floating-point format. The conversion
5869 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5870 | Arithmetic.
5871 *----------------------------------------------------------------------------*/
5873 float32 float128_to_float32(float128 a, float_status *status)
5875 flag aSign;
5876 int32_t aExp;
5877 uint64_t aSig0, aSig1;
5878 uint32_t zSig;
5880 aSig1 = extractFloat128Frac1( a );
5881 aSig0 = extractFloat128Frac0( a );
5882 aExp = extractFloat128Exp( a );
5883 aSign = extractFloat128Sign( a );
5884 if ( aExp == 0x7FFF ) {
5885 if ( aSig0 | aSig1 ) {
5886 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5888 return packFloat32( aSign, 0xFF, 0 );
5890 aSig0 |= ( aSig1 != 0 );
5891 shift64RightJamming( aSig0, 18, &aSig0 );
5892 zSig = aSig0;
5893 if ( aExp || zSig ) {
5894 zSig |= 0x40000000;
5895 aExp -= 0x3F81;
5897 return roundAndPackFloat32(aSign, aExp, zSig, status);
5901 /*----------------------------------------------------------------------------
5902 | Returns the result of converting the quadruple-precision floating-point
5903 | value `a' to the double-precision floating-point format. The conversion
5904 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5905 | Arithmetic.
5906 *----------------------------------------------------------------------------*/
5908 float64 float128_to_float64(float128 a, float_status *status)
5910 flag aSign;
5911 int32_t aExp;
5912 uint64_t aSig0, aSig1;
5914 aSig1 = extractFloat128Frac1( a );
5915 aSig0 = extractFloat128Frac0( a );
5916 aExp = extractFloat128Exp( a );
5917 aSign = extractFloat128Sign( a );
5918 if ( aExp == 0x7FFF ) {
5919 if ( aSig0 | aSig1 ) {
5920 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5922 return packFloat64( aSign, 0x7FF, 0 );
5924 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5925 aSig0 |= ( aSig1 != 0 );
5926 if ( aExp || aSig0 ) {
5927 aSig0 |= LIT64( 0x4000000000000000 );
5928 aExp -= 0x3C01;
5930 return roundAndPackFloat64(aSign, aExp, aSig0, status);
5934 /*----------------------------------------------------------------------------
5935 | Returns the result of converting the quadruple-precision floating-point
5936 | value `a' to the extended double-precision floating-point format. The
5937 | conversion is performed according to the IEC/IEEE Standard for Binary
5938 | Floating-Point Arithmetic.
5939 *----------------------------------------------------------------------------*/
5941 floatx80 float128_to_floatx80(float128 a, float_status *status)
5943 flag aSign;
5944 int32_t aExp;
5945 uint64_t aSig0, aSig1;
5947 aSig1 = extractFloat128Frac1( a );
5948 aSig0 = extractFloat128Frac0( a );
5949 aExp = extractFloat128Exp( a );
5950 aSign = extractFloat128Sign( a );
5951 if ( aExp == 0x7FFF ) {
5952 if ( aSig0 | aSig1 ) {
5953 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5955 return packFloatx80(aSign, floatx80_infinity_high,
5956 floatx80_infinity_low);
5958 if ( aExp == 0 ) {
5959 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5960 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5962 else {
5963 aSig0 |= LIT64( 0x0001000000000000 );
5965 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5966 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5970 /*----------------------------------------------------------------------------
5971 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5972 | returns the result as a quadruple-precision floating-point value. The
5973 | operation is performed according to the IEC/IEEE Standard for Binary
5974 | Floating-Point Arithmetic.
5975 *----------------------------------------------------------------------------*/
5977 float128 float128_round_to_int(float128 a, float_status *status)
5979 flag aSign;
5980 int32_t aExp;
5981 uint64_t lastBitMask, roundBitsMask;
5982 float128 z;
5984 aExp = extractFloat128Exp( a );
5985 if ( 0x402F <= aExp ) {
5986 if ( 0x406F <= aExp ) {
5987 if ( ( aExp == 0x7FFF )
5988 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5990 return propagateFloat128NaN(a, a, status);
5992 return a;
5994 lastBitMask = 1;
5995 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5996 roundBitsMask = lastBitMask - 1;
5997 z = a;
5998 switch (status->float_rounding_mode) {
5999 case float_round_nearest_even:
6000 if ( lastBitMask ) {
6001 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6002 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6004 else {
6005 if ( (int64_t) z.low < 0 ) {
6006 ++z.high;
6007 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6010 break;
6011 case float_round_ties_away:
6012 if (lastBitMask) {
6013 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6014 } else {
6015 if ((int64_t) z.low < 0) {
6016 ++z.high;
6019 break;
6020 case float_round_to_zero:
6021 break;
6022 case float_round_up:
6023 if (!extractFloat128Sign(z)) {
6024 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6026 break;
6027 case float_round_down:
6028 if (extractFloat128Sign(z)) {
6029 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6031 break;
6032 default:
6033 abort();
6035 z.low &= ~ roundBitsMask;
6037 else {
6038 if ( aExp < 0x3FFF ) {
6039 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6040 status->float_exception_flags |= float_flag_inexact;
6041 aSign = extractFloat128Sign( a );
6042 switch (status->float_rounding_mode) {
6043 case float_round_nearest_even:
6044 if ( ( aExp == 0x3FFE )
6045 && ( extractFloat128Frac0( a )
6046 | extractFloat128Frac1( a ) )
6048 return packFloat128( aSign, 0x3FFF, 0, 0 );
6050 break;
6051 case float_round_ties_away:
6052 if (aExp == 0x3FFE) {
6053 return packFloat128(aSign, 0x3FFF, 0, 0);
6055 break;
6056 case float_round_down:
6057 return
6058 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6059 : packFloat128( 0, 0, 0, 0 );
6060 case float_round_up:
6061 return
6062 aSign ? packFloat128( 1, 0, 0, 0 )
6063 : packFloat128( 0, 0x3FFF, 0, 0 );
6065 return packFloat128( aSign, 0, 0, 0 );
6067 lastBitMask = 1;
6068 lastBitMask <<= 0x402F - aExp;
6069 roundBitsMask = lastBitMask - 1;
6070 z.low = 0;
6071 z.high = a.high;
6072 switch (status->float_rounding_mode) {
6073 case float_round_nearest_even:
6074 z.high += lastBitMask>>1;
6075 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6076 z.high &= ~ lastBitMask;
6078 break;
6079 case float_round_ties_away:
6080 z.high += lastBitMask>>1;
6081 break;
6082 case float_round_to_zero:
6083 break;
6084 case float_round_up:
6085 if (!extractFloat128Sign(z)) {
6086 z.high |= ( a.low != 0 );
6087 z.high += roundBitsMask;
6089 break;
6090 case float_round_down:
6091 if (extractFloat128Sign(z)) {
6092 z.high |= (a.low != 0);
6093 z.high += roundBitsMask;
6095 break;
6096 default:
6097 abort();
6099 z.high &= ~ roundBitsMask;
6101 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6102 status->float_exception_flags |= float_flag_inexact;
6104 return z;
6108 /*----------------------------------------------------------------------------
6109 | Returns the result of adding the absolute values of the quadruple-precision
6110 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6111 | before being returned. `zSign' is ignored if the result is a NaN.
6112 | The addition is performed according to the IEC/IEEE Standard for Binary
6113 | Floating-Point Arithmetic.
6114 *----------------------------------------------------------------------------*/
6116 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6117 float_status *status)
6119 int32_t aExp, bExp, zExp;
6120 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6121 int32_t expDiff;
6123 aSig1 = extractFloat128Frac1( a );
6124 aSig0 = extractFloat128Frac0( a );
6125 aExp = extractFloat128Exp( a );
6126 bSig1 = extractFloat128Frac1( b );
6127 bSig0 = extractFloat128Frac0( b );
6128 bExp = extractFloat128Exp( b );
6129 expDiff = aExp - bExp;
6130 if ( 0 < expDiff ) {
6131 if ( aExp == 0x7FFF ) {
6132 if (aSig0 | aSig1) {
6133 return propagateFloat128NaN(a, b, status);
6135 return a;
6137 if ( bExp == 0 ) {
6138 --expDiff;
6140 else {
6141 bSig0 |= LIT64( 0x0001000000000000 );
6143 shift128ExtraRightJamming(
6144 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6145 zExp = aExp;
6147 else if ( expDiff < 0 ) {
6148 if ( bExp == 0x7FFF ) {
6149 if (bSig0 | bSig1) {
6150 return propagateFloat128NaN(a, b, status);
6152 return packFloat128( zSign, 0x7FFF, 0, 0 );
6154 if ( aExp == 0 ) {
6155 ++expDiff;
6157 else {
6158 aSig0 |= LIT64( 0x0001000000000000 );
6160 shift128ExtraRightJamming(
6161 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6162 zExp = bExp;
6164 else {
6165 if ( aExp == 0x7FFF ) {
6166 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6167 return propagateFloat128NaN(a, b, status);
6169 return a;
6171 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6172 if ( aExp == 0 ) {
6173 if (status->flush_to_zero) {
6174 if (zSig0 | zSig1) {
6175 float_raise(float_flag_output_denormal, status);
6177 return packFloat128(zSign, 0, 0, 0);
6179 return packFloat128( zSign, 0, zSig0, zSig1 );
6181 zSig2 = 0;
6182 zSig0 |= LIT64( 0x0002000000000000 );
6183 zExp = aExp;
6184 goto shiftRight1;
6186 aSig0 |= LIT64( 0x0001000000000000 );
6187 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6188 --zExp;
6189 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6190 ++zExp;
6191 shiftRight1:
6192 shift128ExtraRightJamming(
6193 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6194 roundAndPack:
6195 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6199 /*----------------------------------------------------------------------------
6200 | Returns the result of subtracting the absolute values of the quadruple-
6201 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6202 | difference is negated before being returned. `zSign' is ignored if the
6203 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6204 | Standard for Binary Floating-Point Arithmetic.
6205 *----------------------------------------------------------------------------*/
6207 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6208 float_status *status)
6210 int32_t aExp, bExp, zExp;
6211 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6212 int32_t expDiff;
6214 aSig1 = extractFloat128Frac1( a );
6215 aSig0 = extractFloat128Frac0( a );
6216 aExp = extractFloat128Exp( a );
6217 bSig1 = extractFloat128Frac1( b );
6218 bSig0 = extractFloat128Frac0( b );
6219 bExp = extractFloat128Exp( b );
6220 expDiff = aExp - bExp;
6221 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6222 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6223 if ( 0 < expDiff ) goto aExpBigger;
6224 if ( expDiff < 0 ) goto bExpBigger;
6225 if ( aExp == 0x7FFF ) {
6226 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6227 return propagateFloat128NaN(a, b, status);
6229 float_raise(float_flag_invalid, status);
6230 return float128_default_nan(status);
6232 if ( aExp == 0 ) {
6233 aExp = 1;
6234 bExp = 1;
6236 if ( bSig0 < aSig0 ) goto aBigger;
6237 if ( aSig0 < bSig0 ) goto bBigger;
6238 if ( bSig1 < aSig1 ) goto aBigger;
6239 if ( aSig1 < bSig1 ) goto bBigger;
6240 return packFloat128(status->float_rounding_mode == float_round_down,
6241 0, 0, 0);
6242 bExpBigger:
6243 if ( bExp == 0x7FFF ) {
6244 if (bSig0 | bSig1) {
6245 return propagateFloat128NaN(a, b, status);
6247 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6249 if ( aExp == 0 ) {
6250 ++expDiff;
6252 else {
6253 aSig0 |= LIT64( 0x4000000000000000 );
6255 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6256 bSig0 |= LIT64( 0x4000000000000000 );
6257 bBigger:
6258 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6259 zExp = bExp;
6260 zSign ^= 1;
6261 goto normalizeRoundAndPack;
6262 aExpBigger:
6263 if ( aExp == 0x7FFF ) {
6264 if (aSig0 | aSig1) {
6265 return propagateFloat128NaN(a, b, status);
6267 return a;
6269 if ( bExp == 0 ) {
6270 --expDiff;
6272 else {
6273 bSig0 |= LIT64( 0x4000000000000000 );
6275 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6276 aSig0 |= LIT64( 0x4000000000000000 );
6277 aBigger:
6278 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6279 zExp = aExp;
6280 normalizeRoundAndPack:
6281 --zExp;
6282 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6283 status);
6287 /*----------------------------------------------------------------------------
6288 | Returns the result of adding the quadruple-precision floating-point values
6289 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6290 | for Binary Floating-Point Arithmetic.
6291 *----------------------------------------------------------------------------*/
6293 float128 float128_add(float128 a, float128 b, float_status *status)
6295 flag aSign, bSign;
6297 aSign = extractFloat128Sign( a );
6298 bSign = extractFloat128Sign( b );
6299 if ( aSign == bSign ) {
6300 return addFloat128Sigs(a, b, aSign, status);
6302 else {
6303 return subFloat128Sigs(a, b, aSign, status);
6308 /*----------------------------------------------------------------------------
6309 | Returns the result of subtracting the quadruple-precision floating-point
6310 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6311 | Standard for Binary Floating-Point Arithmetic.
6312 *----------------------------------------------------------------------------*/
6314 float128 float128_sub(float128 a, float128 b, float_status *status)
6316 flag aSign, bSign;
6318 aSign = extractFloat128Sign( a );
6319 bSign = extractFloat128Sign( b );
6320 if ( aSign == bSign ) {
6321 return subFloat128Sigs(a, b, aSign, status);
6323 else {
6324 return addFloat128Sigs(a, b, aSign, status);
6329 /*----------------------------------------------------------------------------
6330 | Returns the result of multiplying the quadruple-precision floating-point
6331 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6332 | Standard for Binary Floating-Point Arithmetic.
6333 *----------------------------------------------------------------------------*/
6335 float128 float128_mul(float128 a, float128 b, float_status *status)
6337 flag aSign, bSign, zSign;
6338 int32_t aExp, bExp, zExp;
6339 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6341 aSig1 = extractFloat128Frac1( a );
6342 aSig0 = extractFloat128Frac0( a );
6343 aExp = extractFloat128Exp( a );
6344 aSign = extractFloat128Sign( a );
6345 bSig1 = extractFloat128Frac1( b );
6346 bSig0 = extractFloat128Frac0( b );
6347 bExp = extractFloat128Exp( b );
6348 bSign = extractFloat128Sign( b );
6349 zSign = aSign ^ bSign;
6350 if ( aExp == 0x7FFF ) {
6351 if ( ( aSig0 | aSig1 )
6352 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6353 return propagateFloat128NaN(a, b, status);
6355 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6356 return packFloat128( zSign, 0x7FFF, 0, 0 );
6358 if ( bExp == 0x7FFF ) {
6359 if (bSig0 | bSig1) {
6360 return propagateFloat128NaN(a, b, status);
6362 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6363 invalid:
6364 float_raise(float_flag_invalid, status);
6365 return float128_default_nan(status);
6367 return packFloat128( zSign, 0x7FFF, 0, 0 );
6369 if ( aExp == 0 ) {
6370 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6371 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6373 if ( bExp == 0 ) {
6374 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6375 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6377 zExp = aExp + bExp - 0x4000;
6378 aSig0 |= LIT64( 0x0001000000000000 );
6379 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6380 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6381 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6382 zSig2 |= ( zSig3 != 0 );
6383 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6384 shift128ExtraRightJamming(
6385 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6386 ++zExp;
6388 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6392 /*----------------------------------------------------------------------------
6393 | Returns the result of dividing the quadruple-precision floating-point value
6394 | `a' by the corresponding value `b'. The operation is performed according to
6395 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6396 *----------------------------------------------------------------------------*/
6398 float128 float128_div(float128 a, float128 b, float_status *status)
6400 flag aSign, bSign, zSign;
6401 int32_t aExp, bExp, zExp;
6402 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6403 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6405 aSig1 = extractFloat128Frac1( a );
6406 aSig0 = extractFloat128Frac0( a );
6407 aExp = extractFloat128Exp( a );
6408 aSign = extractFloat128Sign( a );
6409 bSig1 = extractFloat128Frac1( b );
6410 bSig0 = extractFloat128Frac0( b );
6411 bExp = extractFloat128Exp( b );
6412 bSign = extractFloat128Sign( b );
6413 zSign = aSign ^ bSign;
6414 if ( aExp == 0x7FFF ) {
6415 if (aSig0 | aSig1) {
6416 return propagateFloat128NaN(a, b, status);
6418 if ( bExp == 0x7FFF ) {
6419 if (bSig0 | bSig1) {
6420 return propagateFloat128NaN(a, b, status);
6422 goto invalid;
6424 return packFloat128( zSign, 0x7FFF, 0, 0 );
6426 if ( bExp == 0x7FFF ) {
6427 if (bSig0 | bSig1) {
6428 return propagateFloat128NaN(a, b, status);
6430 return packFloat128( zSign, 0, 0, 0 );
6432 if ( bExp == 0 ) {
6433 if ( ( bSig0 | bSig1 ) == 0 ) {
6434 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6435 invalid:
6436 float_raise(float_flag_invalid, status);
6437 return float128_default_nan(status);
6439 float_raise(float_flag_divbyzero, status);
6440 return packFloat128( zSign, 0x7FFF, 0, 0 );
6442 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6444 if ( aExp == 0 ) {
6445 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6446 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6448 zExp = aExp - bExp + 0x3FFD;
6449 shortShift128Left(
6450 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6451 shortShift128Left(
6452 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6453 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6454 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6455 ++zExp;
6457 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6458 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6459 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6460 while ( (int64_t) rem0 < 0 ) {
6461 --zSig0;
6462 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6464 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6465 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6466 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6467 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6468 while ( (int64_t) rem1 < 0 ) {
6469 --zSig1;
6470 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6472 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6474 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6475 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6479 /*----------------------------------------------------------------------------
6480 | Returns the remainder of the quadruple-precision floating-point value `a'
6481 | with respect to the corresponding value `b'. The operation is performed
6482 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6483 *----------------------------------------------------------------------------*/
6485 float128 float128_rem(float128 a, float128 b, float_status *status)
6487 flag aSign, zSign;
6488 int32_t aExp, bExp, expDiff;
6489 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6490 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6491 int64_t sigMean0;
6493 aSig1 = extractFloat128Frac1( a );
6494 aSig0 = extractFloat128Frac0( a );
6495 aExp = extractFloat128Exp( a );
6496 aSign = extractFloat128Sign( a );
6497 bSig1 = extractFloat128Frac1( b );
6498 bSig0 = extractFloat128Frac0( b );
6499 bExp = extractFloat128Exp( b );
6500 if ( aExp == 0x7FFF ) {
6501 if ( ( aSig0 | aSig1 )
6502 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6503 return propagateFloat128NaN(a, b, status);
6505 goto invalid;
6507 if ( bExp == 0x7FFF ) {
6508 if (bSig0 | bSig1) {
6509 return propagateFloat128NaN(a, b, status);
6511 return a;
6513 if ( bExp == 0 ) {
6514 if ( ( bSig0 | bSig1 ) == 0 ) {
6515 invalid:
6516 float_raise(float_flag_invalid, status);
6517 return float128_default_nan(status);
6519 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6521 if ( aExp == 0 ) {
6522 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6523 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6525 expDiff = aExp - bExp;
6526 if ( expDiff < -1 ) return a;
6527 shortShift128Left(
6528 aSig0 | LIT64( 0x0001000000000000 ),
6529 aSig1,
6530 15 - ( expDiff < 0 ),
6531 &aSig0,
6532 &aSig1
6534 shortShift128Left(
6535 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6536 q = le128( bSig0, bSig1, aSig0, aSig1 );
6537 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6538 expDiff -= 64;
6539 while ( 0 < expDiff ) {
6540 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6541 q = ( 4 < q ) ? q - 4 : 0;
6542 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6543 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6544 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6545 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6546 expDiff -= 61;
6548 if ( -64 < expDiff ) {
6549 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6550 q = ( 4 < q ) ? q - 4 : 0;
6551 q >>= - expDiff;
6552 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6553 expDiff += 52;
6554 if ( expDiff < 0 ) {
6555 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6557 else {
6558 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6560 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6561 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6563 else {
6564 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6565 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6567 do {
6568 alternateASig0 = aSig0;
6569 alternateASig1 = aSig1;
6570 ++q;
6571 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6572 } while ( 0 <= (int64_t) aSig0 );
6573 add128(
6574 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6575 if ( ( sigMean0 < 0 )
6576 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6577 aSig0 = alternateASig0;
6578 aSig1 = alternateASig1;
6580 zSign = ( (int64_t) aSig0 < 0 );
6581 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6582 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6583 status);
6586 /*----------------------------------------------------------------------------
6587 | Returns the square root of the quadruple-precision floating-point value `a'.
6588 | The operation is performed according to the IEC/IEEE Standard for Binary
6589 | Floating-Point Arithmetic.
6590 *----------------------------------------------------------------------------*/
6592 float128 float128_sqrt(float128 a, float_status *status)
6594 flag aSign;
6595 int32_t aExp, zExp;
6596 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6597 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6599 aSig1 = extractFloat128Frac1( a );
6600 aSig0 = extractFloat128Frac0( a );
6601 aExp = extractFloat128Exp( a );
6602 aSign = extractFloat128Sign( a );
6603 if ( aExp == 0x7FFF ) {
6604 if (aSig0 | aSig1) {
6605 return propagateFloat128NaN(a, a, status);
6607 if ( ! aSign ) return a;
6608 goto invalid;
6610 if ( aSign ) {
6611 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6612 invalid:
6613 float_raise(float_flag_invalid, status);
6614 return float128_default_nan(status);
6616 if ( aExp == 0 ) {
6617 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6618 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6620 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6621 aSig0 |= LIT64( 0x0001000000000000 );
6622 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6623 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6624 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6625 doubleZSig0 = zSig0<<1;
6626 mul64To128( zSig0, zSig0, &term0, &term1 );
6627 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6628 while ( (int64_t) rem0 < 0 ) {
6629 --zSig0;
6630 doubleZSig0 -= 2;
6631 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6633 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6634 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6635 if ( zSig1 == 0 ) zSig1 = 1;
6636 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6637 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6638 mul64To128( zSig1, zSig1, &term2, &term3 );
6639 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6640 while ( (int64_t) rem1 < 0 ) {
6641 --zSig1;
6642 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6643 term3 |= 1;
6644 term2 |= doubleZSig0;
6645 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6647 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6649 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6650 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6654 /*----------------------------------------------------------------------------
6655 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6656 | the corresponding value `b', and 0 otherwise. The invalid exception is
6657 | raised if either operand is a NaN. Otherwise, the comparison is performed
6658 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6659 *----------------------------------------------------------------------------*/
6661 int float128_eq(float128 a, float128 b, float_status *status)
6664 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6665 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6666 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6667 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6669 float_raise(float_flag_invalid, status);
6670 return 0;
6672 return
6673 ( a.low == b.low )
6674 && ( ( a.high == b.high )
6675 || ( ( a.low == 0 )
6676 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6681 /*----------------------------------------------------------------------------
6682 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6683 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6684 | exception is raised if either operand is a NaN. The comparison is performed
6685 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6686 *----------------------------------------------------------------------------*/
6688 int float128_le(float128 a, float128 b, float_status *status)
6690 flag aSign, bSign;
6692 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6693 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6694 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6695 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6697 float_raise(float_flag_invalid, status);
6698 return 0;
6700 aSign = extractFloat128Sign( a );
6701 bSign = extractFloat128Sign( b );
6702 if ( aSign != bSign ) {
6703 return
6704 aSign
6705 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6706 == 0 );
6708 return
6709 aSign ? le128( b.high, b.low, a.high, a.low )
6710 : le128( a.high, a.low, b.high, b.low );
6714 /*----------------------------------------------------------------------------
6715 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6716 | the corresponding value `b', and 0 otherwise. The invalid exception is
6717 | raised if either operand is a NaN. The comparison is performed according
6718 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6719 *----------------------------------------------------------------------------*/
6721 int float128_lt(float128 a, float128 b, float_status *status)
6723 flag aSign, bSign;
6725 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6726 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6727 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6728 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6730 float_raise(float_flag_invalid, status);
6731 return 0;
6733 aSign = extractFloat128Sign( a );
6734 bSign = extractFloat128Sign( b );
6735 if ( aSign != bSign ) {
6736 return
6737 aSign
6738 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6739 != 0 );
6741 return
6742 aSign ? lt128( b.high, b.low, a.high, a.low )
6743 : lt128( a.high, a.low, b.high, b.low );
6747 /*----------------------------------------------------------------------------
6748 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6749 | be compared, and 0 otherwise. The invalid exception is raised if either
6750 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6751 | Standard for Binary Floating-Point Arithmetic.
6752 *----------------------------------------------------------------------------*/
6754 int float128_unordered(float128 a, float128 b, float_status *status)
6756 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6757 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6758 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6759 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6761 float_raise(float_flag_invalid, status);
6762 return 1;
6764 return 0;
6767 /*----------------------------------------------------------------------------
6768 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6769 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6770 | exception. The comparison is performed according to the IEC/IEEE Standard
6771 | for Binary Floating-Point Arithmetic.
6772 *----------------------------------------------------------------------------*/
6774 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6777 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6778 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6779 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6780 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6782 if (float128_is_signaling_nan(a, status)
6783 || float128_is_signaling_nan(b, status)) {
6784 float_raise(float_flag_invalid, status);
6786 return 0;
6788 return
6789 ( a.low == b.low )
6790 && ( ( a.high == b.high )
6791 || ( ( a.low == 0 )
6792 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6797 /*----------------------------------------------------------------------------
6798 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6799 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6800 | cause an exception. Otherwise, the comparison is performed according to the
6801 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6802 *----------------------------------------------------------------------------*/
6804 int float128_le_quiet(float128 a, float128 b, float_status *status)
6806 flag aSign, bSign;
6808 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6809 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6810 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6811 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6813 if (float128_is_signaling_nan(a, status)
6814 || float128_is_signaling_nan(b, status)) {
6815 float_raise(float_flag_invalid, status);
6817 return 0;
6819 aSign = extractFloat128Sign( a );
6820 bSign = extractFloat128Sign( b );
6821 if ( aSign != bSign ) {
6822 return
6823 aSign
6824 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6825 == 0 );
6827 return
6828 aSign ? le128( b.high, b.low, a.high, a.low )
6829 : le128( a.high, a.low, b.high, b.low );
6833 /*----------------------------------------------------------------------------
6834 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6835 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6836 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6837 | Standard for Binary Floating-Point Arithmetic.
6838 *----------------------------------------------------------------------------*/
6840 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6842 flag aSign, bSign;
6844 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6845 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6846 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6847 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6849 if (float128_is_signaling_nan(a, status)
6850 || float128_is_signaling_nan(b, status)) {
6851 float_raise(float_flag_invalid, status);
6853 return 0;
6855 aSign = extractFloat128Sign( a );
6856 bSign = extractFloat128Sign( b );
6857 if ( aSign != bSign ) {
6858 return
6859 aSign
6860 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6861 != 0 );
6863 return
6864 aSign ? lt128( b.high, b.low, a.high, a.low )
6865 : lt128( a.high, a.low, b.high, b.low );
6869 /*----------------------------------------------------------------------------
6870 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6871 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6872 | comparison is performed according to the IEC/IEEE Standard for Binary
6873 | Floating-Point Arithmetic.
6874 *----------------------------------------------------------------------------*/
6876 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6878 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6879 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6880 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6881 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6883 if (float128_is_signaling_nan(a, status)
6884 || float128_is_signaling_nan(b, status)) {
6885 float_raise(float_flag_invalid, status);
6887 return 1;
6889 return 0;
6892 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6893 int is_quiet, float_status *status)
6895 flag aSign, bSign;
6897 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6898 float_raise(float_flag_invalid, status);
6899 return float_relation_unordered;
6901 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6902 ( extractFloatx80Frac( a )<<1 ) ) ||
6903 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6904 ( extractFloatx80Frac( b )<<1 ) )) {
6905 if (!is_quiet ||
6906 floatx80_is_signaling_nan(a, status) ||
6907 floatx80_is_signaling_nan(b, status)) {
6908 float_raise(float_flag_invalid, status);
6910 return float_relation_unordered;
6912 aSign = extractFloatx80Sign( a );
6913 bSign = extractFloatx80Sign( b );
6914 if ( aSign != bSign ) {
6916 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6917 ( ( a.low | b.low ) == 0 ) ) {
6918 /* zero case */
6919 return float_relation_equal;
6920 } else {
6921 return 1 - (2 * aSign);
6923 } else {
6924 if (a.low == b.low && a.high == b.high) {
6925 return float_relation_equal;
6926 } else {
6927 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6932 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6934 return floatx80_compare_internal(a, b, 0, status);
6937 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6939 return floatx80_compare_internal(a, b, 1, status);
6942 static inline int float128_compare_internal(float128 a, float128 b,
6943 int is_quiet, float_status *status)
6945 flag aSign, bSign;
6947 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6948 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6949 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6950 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6951 if (!is_quiet ||
6952 float128_is_signaling_nan(a, status) ||
6953 float128_is_signaling_nan(b, status)) {
6954 float_raise(float_flag_invalid, status);
6956 return float_relation_unordered;
6958 aSign = extractFloat128Sign( a );
6959 bSign = extractFloat128Sign( b );
6960 if ( aSign != bSign ) {
6961 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6962 /* zero case */
6963 return float_relation_equal;
6964 } else {
6965 return 1 - (2 * aSign);
6967 } else {
6968 if (a.low == b.low && a.high == b.high) {
6969 return float_relation_equal;
6970 } else {
6971 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6976 int float128_compare(float128 a, float128 b, float_status *status)
6978 return float128_compare_internal(a, b, 0, status);
6981 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6983 return float128_compare_internal(a, b, 1, status);
6986 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6988 flag aSign;
6989 int32_t aExp;
6990 uint64_t aSig;
6992 if (floatx80_invalid_encoding(a)) {
6993 float_raise(float_flag_invalid, status);
6994 return floatx80_default_nan(status);
6996 aSig = extractFloatx80Frac( a );
6997 aExp = extractFloatx80Exp( a );
6998 aSign = extractFloatx80Sign( a );
7000 if ( aExp == 0x7FFF ) {
7001 if ( aSig<<1 ) {
7002 return propagateFloatx80NaN(a, a, status);
7004 return a;
7007 if (aExp == 0) {
7008 if (aSig == 0) {
7009 return a;
7011 aExp++;
7014 if (n > 0x10000) {
7015 n = 0x10000;
7016 } else if (n < -0x10000) {
7017 n = -0x10000;
7020 aExp += n;
7021 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7022 aSign, aExp, aSig, 0, status);
7025 float128 float128_scalbn(float128 a, int n, float_status *status)
7027 flag aSign;
7028 int32_t aExp;
7029 uint64_t aSig0, aSig1;
7031 aSig1 = extractFloat128Frac1( a );
7032 aSig0 = extractFloat128Frac0( a );
7033 aExp = extractFloat128Exp( a );
7034 aSign = extractFloat128Sign( a );
7035 if ( aExp == 0x7FFF ) {
7036 if ( aSig0 | aSig1 ) {
7037 return propagateFloat128NaN(a, a, status);
7039 return a;
7041 if (aExp != 0) {
7042 aSig0 |= LIT64( 0x0001000000000000 );
7043 } else if (aSig0 == 0 && aSig1 == 0) {
7044 return a;
7045 } else {
7046 aExp++;
7049 if (n > 0x10000) {
7050 n = 0x10000;
7051 } else if (n < -0x10000) {
7052 n = -0x10000;
7055 aExp += n - 1;
7056 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7057 , status);