fpu/softfloat: raise float_invalid for NaN/Inf in round_to_int_and_pack
[qemu/kevin.git] / fpu / softfloat.c
blobfb8663f59ea9122a78b71a210bff77a6dacff598
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a)
114 return float16_val(a) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int extractFloat16Exp(float16 a)
123 return (float16_val(a) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag extractFloat16Sign(float16 a)
132 return float16_val(a)>>15;
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
139 static inline uint32_t extractFloat32Frac(float32 a)
141 return float32_val(a) & 0x007FFFFF;
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
148 static inline int extractFloat32Exp(float32 a)
150 return (float32_val(a) >> 23) & 0xFF;
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
157 static inline flag extractFloat32Sign(float32 a)
159 return float32_val(a) >> 31;
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
166 static inline uint64_t extractFloat64Frac(float64 a)
168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 static inline int extractFloat64Exp(float64 a)
177 return (float64_val(a) >> 52) & 0x7FF;
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
184 static inline flag extractFloat64Sign(float64 a)
186 return float64_val(a) >> 63;
190 * Classify a floating point number. Everything above float_class_qnan
191 * is a NaN so cls >= float_class_qnan is any NaN.
194 typedef enum __attribute__ ((__packed__)) {
195 float_class_unclassified,
196 float_class_zero,
197 float_class_normal,
198 float_class_inf,
199 float_class_qnan, /* all NaNs from here */
200 float_class_snan,
201 float_class_dnan,
202 float_class_msnan, /* maybe silenced */
203 } FloatClass;
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
216 typedef struct {
217 uint64_t frac;
218 int32_t exp;
219 FloatClass cls;
220 bool sign;
221 } FloatParts;
223 #define DECOMPOSED_BINARY_POINT (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
227 /* Structure holding all of the relevant parameters for a format.
228 * exp_size: the size of the exponent field
229 * exp_bias: the offset applied to the exponent field
230 * exp_max: the maximum normalised exponent
231 * frac_size: the size of the fraction field
232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233 * The following are computed based the size of fraction
234 * frac_lsb: least significant bit of fraction
235 * fram_lsbm1: the bit bellow the least significant bit (for rounding)
236 * round_mask/roundeven_mask: masks used for rounding
238 typedef struct {
239 int exp_size;
240 int exp_bias;
241 int exp_max;
242 int frac_size;
243 int frac_shift;
244 uint64_t frac_lsb;
245 uint64_t frac_lsbm1;
246 uint64_t round_mask;
247 uint64_t roundeven_mask;
248 } FloatFmt;
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F) \
252 .exp_size = E, \
253 .exp_bias = ((1 << E) - 1) >> 1, \
254 .exp_max = (1 << E) - 1, \
255 .frac_size = F, \
256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
262 static const FloatFmt float16_params = {
263 FLOAT_PARAMS(5, 10)
266 static const FloatFmt float32_params = {
267 FLOAT_PARAMS(8, 23)
270 static const FloatFmt float64_params = {
271 FLOAT_PARAMS(11, 52)
274 /* Unpack a float to parts, but do not canonicalize. */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
277 const int sign_pos = fmt.frac_size + fmt.exp_size;
279 return (FloatParts) {
280 .cls = float_class_unclassified,
281 .sign = extract64(raw, sign_pos, 1),
282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283 .frac = extract64(raw, 0, fmt.frac_size),
287 static inline FloatParts float16_unpack_raw(float16 f)
289 return unpack_raw(float16_params, f);
292 static inline FloatParts float32_unpack_raw(float32 f)
294 return unpack_raw(float32_params, f);
297 static inline FloatParts float64_unpack_raw(float64 f)
299 return unpack_raw(float64_params, f);
302 /* Pack a float from parts, but do not canonicalize. */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
305 const int sign_pos = fmt.frac_size + fmt.exp_size;
306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307 return deposit64(ret, sign_pos, 1, p.sign);
310 static inline float16 float16_pack_raw(FloatParts p)
312 return make_float16(pack_raw(float16_params, p));
315 static inline float32 float32_pack_raw(FloatParts p)
317 return make_float32(pack_raw(float32_params, p));
320 static inline float64 float64_pack_raw(FloatParts p)
322 return make_float64(pack_raw(float64_params, p));
325 /* Canonicalize EXP and FRAC, setting CLS. */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327 float_status *status)
329 if (part.exp == parm->exp_max) {
330 if (part.frac == 0) {
331 part.cls = float_class_inf;
332 } else {
333 #ifdef NO_SIGNALING_NANS
334 part.cls = float_class_qnan;
335 #else
336 int64_t msb = part.frac << (parm->frac_shift + 2);
337 if ((msb < 0) == status->snan_bit_is_one) {
338 part.cls = float_class_snan;
339 } else {
340 part.cls = float_class_qnan;
342 #endif
344 } else if (part.exp == 0) {
345 if (likely(part.frac == 0)) {
346 part.cls = float_class_zero;
347 } else if (status->flush_inputs_to_zero) {
348 float_raise(float_flag_input_denormal, status);
349 part.cls = float_class_zero;
350 part.frac = 0;
351 } else {
352 int shift = clz64(part.frac) - 1;
353 part.cls = float_class_normal;
354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355 part.frac <<= shift;
357 } else {
358 part.cls = float_class_normal;
359 part.exp -= parm->exp_bias;
360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
362 return part;
365 /* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372 const FloatFmt *parm)
374 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375 const uint64_t round_mask = parm->round_mask;
376 const uint64_t roundeven_mask = parm->roundeven_mask;
377 const int exp_max = parm->exp_max;
378 const int frac_shift = parm->frac_shift;
379 uint64_t frac, inc;
380 int exp, flags = 0;
381 bool overflow_norm;
383 frac = p.frac;
384 exp = p.exp;
386 switch (p.cls) {
387 case float_class_normal:
388 switch (s->float_rounding_mode) {
389 case float_round_nearest_even:
390 overflow_norm = false;
391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392 break;
393 case float_round_ties_away:
394 overflow_norm = false;
395 inc = frac_lsbm1;
396 break;
397 case float_round_to_zero:
398 overflow_norm = true;
399 inc = 0;
400 break;
401 case float_round_up:
402 inc = p.sign ? 0 : round_mask;
403 overflow_norm = p.sign;
404 break;
405 case float_round_down:
406 inc = p.sign ? round_mask : 0;
407 overflow_norm = !p.sign;
408 break;
409 default:
410 g_assert_not_reached();
413 exp += parm->exp_bias;
414 if (likely(exp > 0)) {
415 if (frac & round_mask) {
416 flags |= float_flag_inexact;
417 frac += inc;
418 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419 frac >>= 1;
420 exp++;
423 frac >>= frac_shift;
425 if (unlikely(exp >= exp_max)) {
426 flags |= float_flag_overflow | float_flag_inexact;
427 if (overflow_norm) {
428 exp = exp_max - 1;
429 frac = -1;
430 } else {
431 p.cls = float_class_inf;
432 goto do_inf;
435 } else if (s->flush_to_zero) {
436 flags |= float_flag_output_denormal;
437 p.cls = float_class_zero;
438 goto do_zero;
439 } else {
440 bool is_tiny = (s->float_detect_tininess
441 == float_tininess_before_rounding)
442 || (exp < 0)
443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
445 shift64RightJamming(frac, 1 - exp, &frac);
446 if (frac & round_mask) {
447 /* Need to recompute round-to-even. */
448 if (s->float_rounding_mode == float_round_nearest_even) {
449 inc = ((frac & roundeven_mask) != frac_lsbm1
450 ? frac_lsbm1 : 0);
452 flags |= float_flag_inexact;
453 frac += inc;
456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457 frac >>= frac_shift;
459 if (is_tiny && (flags & float_flag_inexact)) {
460 flags |= float_flag_underflow;
462 if (exp == 0 && frac == 0) {
463 p.cls = float_class_zero;
466 break;
468 case float_class_zero:
469 do_zero:
470 exp = 0;
471 frac = 0;
472 break;
474 case float_class_inf:
475 do_inf:
476 exp = exp_max;
477 frac = 0;
478 break;
480 case float_class_qnan:
481 case float_class_snan:
482 exp = exp_max;
483 break;
485 default:
486 g_assert_not_reached();
489 float_raise(flags, s);
490 p.exp = exp;
491 p.frac = frac;
492 return p;
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
497 return canonicalize(float16_unpack_raw(f), &float16_params, s);
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
502 switch (p.cls) {
503 case float_class_dnan:
504 return float16_default_nan(s);
505 case float_class_msnan:
506 return float16_maybe_silence_nan(float16_pack_raw(p), s);
507 default:
508 p = round_canonical(p, s, &float16_params);
509 return float16_pack_raw(p);
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
515 return canonicalize(float32_unpack_raw(f), &float32_params, s);
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
520 switch (p.cls) {
521 case float_class_dnan:
522 return float32_default_nan(s);
523 case float_class_msnan:
524 return float32_maybe_silence_nan(float32_pack_raw(p), s);
525 default:
526 p = round_canonical(p, s, &float32_params);
527 return float32_pack_raw(p);
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
533 return canonicalize(float64_unpack_raw(f), &float64_params, s);
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
538 switch (p.cls) {
539 case float_class_dnan:
540 return float64_default_nan(s);
541 case float_class_msnan:
542 return float64_maybe_silence_nan(float64_pack_raw(p), s);
543 default:
544 p = round_canonical(p, s, &float64_params);
545 return float64_pack_raw(p);
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c)
552 return unlikely(c >= float_class_qnan);
554 static bool is_snan(FloatClass c)
556 return c == float_class_snan;
558 static bool is_qnan(FloatClass c)
560 return c == float_class_qnan;
563 static FloatParts return_nan(FloatParts a, float_status *s)
565 switch (a.cls) {
566 case float_class_snan:
567 s->float_exception_flags |= float_flag_invalid;
568 a.cls = float_class_msnan;
569 /* fall through */
570 case float_class_qnan:
571 if (s->default_nan_mode) {
572 a.cls = float_class_dnan;
574 break;
576 default:
577 g_assert_not_reached();
579 return a;
582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
584 if (is_snan(a.cls) || is_snan(b.cls)) {
585 s->float_exception_flags |= float_flag_invalid;
588 if (s->default_nan_mode) {
589 a.cls = float_class_dnan;
590 } else {
591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592 is_qnan(b.cls), is_snan(b.cls),
593 a.frac > b.frac ||
594 (a.frac == b.frac && a.sign < b.sign))) {
595 a = b;
597 a.cls = float_class_msnan;
599 return a;
602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603 bool inf_zero, float_status *s)
605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
606 s->float_exception_flags |= float_flag_invalid;
609 if (s->default_nan_mode) {
610 a.cls = float_class_dnan;
611 } else {
612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
613 is_qnan(b.cls), is_snan(b.cls),
614 is_qnan(c.cls), is_snan(c.cls),
615 inf_zero, s)) {
616 case 0:
617 break;
618 case 1:
619 a = b;
620 break;
621 case 2:
622 a = c;
623 break;
624 case 3:
625 a.cls = float_class_dnan;
626 return a;
627 default:
628 g_assert_not_reached();
631 a.cls = float_class_msnan;
633 return a;
637 * Returns the result of adding or subtracting the values of the
638 * floating-point values `a' and `b'. The operation is performed
639 * according to the IEC/IEEE Standard for Binary Floating-Point
640 * Arithmetic.
643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
644 float_status *s)
646 bool a_sign = a.sign;
647 bool b_sign = b.sign ^ subtract;
649 if (a_sign != b_sign) {
650 /* Subtraction */
652 if (a.cls == float_class_normal && b.cls == float_class_normal) {
653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655 a.frac = a.frac - b.frac;
656 } else {
657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658 a.frac = b.frac - a.frac;
659 a.exp = b.exp;
660 a_sign ^= 1;
663 if (a.frac == 0) {
664 a.cls = float_class_zero;
665 a.sign = s->float_rounding_mode == float_round_down;
666 } else {
667 int shift = clz64(a.frac) - 1;
668 a.frac = a.frac << shift;
669 a.exp = a.exp - shift;
670 a.sign = a_sign;
672 return a;
674 if (is_nan(a.cls) || is_nan(b.cls)) {
675 return pick_nan(a, b, s);
677 if (a.cls == float_class_inf) {
678 if (b.cls == float_class_inf) {
679 float_raise(float_flag_invalid, s);
680 a.cls = float_class_dnan;
682 return a;
684 if (a.cls == float_class_zero && b.cls == float_class_zero) {
685 a.sign = s->float_rounding_mode == float_round_down;
686 return a;
688 if (a.cls == float_class_zero || b.cls == float_class_inf) {
689 b.sign = a_sign ^ 1;
690 return b;
692 if (b.cls == float_class_zero) {
693 return a;
695 } else {
696 /* Addition */
697 if (a.cls == float_class_normal && b.cls == float_class_normal) {
698 if (a.exp > b.exp) {
699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700 } else if (a.exp < b.exp) {
701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
702 a.exp = b.exp;
704 a.frac += b.frac;
705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
706 a.frac >>= 1;
707 a.exp += 1;
709 return a;
711 if (is_nan(a.cls) || is_nan(b.cls)) {
712 return pick_nan(a, b, s);
714 if (a.cls == float_class_inf || b.cls == float_class_zero) {
715 return a;
717 if (b.cls == float_class_inf || a.cls == float_class_zero) {
718 b.sign = b_sign;
719 return b;
722 g_assert_not_reached();
726 * Returns the result of adding or subtracting the floating-point
727 * values `a' and `b'. The operation is performed according to the
728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
732 float_status *status)
734 FloatParts pa = float16_unpack_canonical(a, status);
735 FloatParts pb = float16_unpack_canonical(b, status);
736 FloatParts pr = addsub_floats(pa, pb, false, status);
738 return float16_round_pack_canonical(pr, status);
741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742 float_status *status)
744 FloatParts pa = float32_unpack_canonical(a, status);
745 FloatParts pb = float32_unpack_canonical(b, status);
746 FloatParts pr = addsub_floats(pa, pb, false, status);
748 return float32_round_pack_canonical(pr, status);
751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752 float_status *status)
754 FloatParts pa = float64_unpack_canonical(a, status);
755 FloatParts pb = float64_unpack_canonical(b, status);
756 FloatParts pr = addsub_floats(pa, pb, false, status);
758 return float64_round_pack_canonical(pr, status);
761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762 float_status *status)
764 FloatParts pa = float16_unpack_canonical(a, status);
765 FloatParts pb = float16_unpack_canonical(b, status);
766 FloatParts pr = addsub_floats(pa, pb, true, status);
768 return float16_round_pack_canonical(pr, status);
771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772 float_status *status)
774 FloatParts pa = float32_unpack_canonical(a, status);
775 FloatParts pb = float32_unpack_canonical(b, status);
776 FloatParts pr = addsub_floats(pa, pb, true, status);
778 return float32_round_pack_canonical(pr, status);
781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782 float_status *status)
784 FloatParts pa = float64_unpack_canonical(a, status);
785 FloatParts pb = float64_unpack_canonical(b, status);
786 FloatParts pr = addsub_floats(pa, pb, true, status);
788 return float64_round_pack_canonical(pr, status);
792 * Returns the result of multiplying the floating-point values `a' and
793 * `b'. The operation is performed according to the IEC/IEEE Standard
794 * for Binary Floating-Point Arithmetic.
797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
799 bool sign = a.sign ^ b.sign;
801 if (a.cls == float_class_normal && b.cls == float_class_normal) {
802 uint64_t hi, lo;
803 int exp = a.exp + b.exp;
805 mul64To128(a.frac, b.frac, &hi, &lo);
806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807 if (lo & DECOMPOSED_OVERFLOW_BIT) {
808 shift64RightJamming(lo, 1, &lo);
809 exp += 1;
812 /* Re-use a */
813 a.exp = exp;
814 a.sign = sign;
815 a.frac = lo;
816 return a;
818 /* handle all the NaN cases */
819 if (is_nan(a.cls) || is_nan(b.cls)) {
820 return pick_nan(a, b, s);
822 /* Inf * Zero == NaN */
823 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824 (a.cls == float_class_zero && b.cls == float_class_inf)) {
825 s->float_exception_flags |= float_flag_invalid;
826 a.cls = float_class_dnan;
827 a.sign = sign;
828 return a;
830 /* Multiply by 0 or Inf */
831 if (a.cls == float_class_inf || a.cls == float_class_zero) {
832 a.sign = sign;
833 return a;
835 if (b.cls == float_class_inf || b.cls == float_class_zero) {
836 b.sign = sign;
837 return b;
839 g_assert_not_reached();
842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
843 float_status *status)
845 FloatParts pa = float16_unpack_canonical(a, status);
846 FloatParts pb = float16_unpack_canonical(b, status);
847 FloatParts pr = mul_floats(pa, pb, status);
849 return float16_round_pack_canonical(pr, status);
852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
853 float_status *status)
855 FloatParts pa = float32_unpack_canonical(a, status);
856 FloatParts pb = float32_unpack_canonical(b, status);
857 FloatParts pr = mul_floats(pa, pb, status);
859 return float32_round_pack_canonical(pr, status);
862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
863 float_status *status)
865 FloatParts pa = float64_unpack_canonical(a, status);
866 FloatParts pb = float64_unpack_canonical(b, status);
867 FloatParts pr = mul_floats(pa, pb, status);
869 return float64_round_pack_canonical(pr, status);
873 * Returns the result of multiplying the floating-point values `a' and
874 * `b' then adding 'c', with no intermediate rounding step after the
875 * multiplication. The operation is performed according to the
876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877 * The flags argument allows the caller to select negation of the
878 * addend, the intermediate product, or the final result. (The
879 * difference between this and having the caller do a separate
880 * negation is that negating externally will flip the sign bit on
881 * NaNs.)
884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
885 int flags, float_status *s)
887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
888 ((1 << float_class_inf) | (1 << float_class_zero));
889 bool p_sign;
890 bool sign_flip = flags & float_muladd_negate_result;
891 FloatClass p_class;
892 uint64_t hi, lo;
893 int p_exp;
895 /* It is implementation-defined whether the cases of (0,inf,qnan)
896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897 * they return if they do), so we have to hand this information
898 * off to the target-specific pick-a-NaN routine.
900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
901 return pick_nan_muladd(a, b, c, inf_zero, s);
904 if (inf_zero) {
905 s->float_exception_flags |= float_flag_invalid;
906 a.cls = float_class_dnan;
907 return a;
910 if (flags & float_muladd_negate_c) {
911 c.sign ^= 1;
914 p_sign = a.sign ^ b.sign;
916 if (flags & float_muladd_negate_product) {
917 p_sign ^= 1;
920 if (a.cls == float_class_inf || b.cls == float_class_inf) {
921 p_class = float_class_inf;
922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
923 p_class = float_class_zero;
924 } else {
925 p_class = float_class_normal;
928 if (c.cls == float_class_inf) {
929 if (p_class == float_class_inf && p_sign != c.sign) {
930 s->float_exception_flags |= float_flag_invalid;
931 a.cls = float_class_dnan;
932 } else {
933 a.cls = float_class_inf;
934 a.sign = c.sign ^ sign_flip;
936 return a;
939 if (p_class == float_class_inf) {
940 a.cls = float_class_inf;
941 a.sign = p_sign ^ sign_flip;
942 return a;
945 if (p_class == float_class_zero) {
946 if (c.cls == float_class_zero) {
947 if (p_sign != c.sign) {
948 p_sign = s->float_rounding_mode == float_round_down;
950 c.sign = p_sign;
951 } else if (flags & float_muladd_halve_result) {
952 c.exp -= 1;
954 c.sign ^= sign_flip;
955 return c;
958 /* a & b should be normals now... */
959 assert(a.cls == float_class_normal &&
960 b.cls == float_class_normal);
962 p_exp = a.exp + b.exp;
964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
965 * result.
967 mul64To128(a.frac, b.frac, &hi, &lo);
968 /* binary point now at bit 124 */
970 /* check for overflow */
971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
972 shift128RightJamming(hi, lo, 1, &hi, &lo);
973 p_exp += 1;
976 /* + add/sub */
977 if (c.cls == float_class_zero) {
978 /* move binary point back to 62 */
979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
980 } else {
981 int exp_diff = p_exp - c.exp;
982 if (p_sign == c.sign) {
983 /* Addition */
984 if (exp_diff <= 0) {
985 shift128RightJamming(hi, lo,
986 DECOMPOSED_BINARY_POINT - exp_diff,
987 &hi, &lo);
988 lo += c.frac;
989 p_exp = c.exp;
990 } else {
991 uint64_t c_hi, c_lo;
992 /* shift c to the same binary point as the product (124) */
993 c_hi = c.frac >> 2;
994 c_lo = 0;
995 shift128RightJamming(c_hi, c_lo,
996 exp_diff,
997 &c_hi, &c_lo);
998 add128(hi, lo, c_hi, c_lo, &hi, &lo);
999 /* move binary point back to 62 */
1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1003 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004 shift64RightJamming(lo, 1, &lo);
1005 p_exp += 1;
1008 } else {
1009 /* Subtraction */
1010 uint64_t c_hi, c_lo;
1011 /* make C binary point match product at bit 124 */
1012 c_hi = c.frac >> 2;
1013 c_lo = 0;
1015 if (exp_diff <= 0) {
1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017 if (exp_diff == 0
1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021 } else {
1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023 p_sign ^= 1;
1024 p_exp = c.exp;
1026 } else {
1027 shift128RightJamming(c_hi, c_lo,
1028 exp_diff,
1029 &c_hi, &c_lo);
1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1033 if (hi == 0 && lo == 0) {
1034 a.cls = float_class_zero;
1035 a.sign = s->float_rounding_mode == float_round_down;
1036 a.sign ^= sign_flip;
1037 return a;
1038 } else {
1039 int shift;
1040 if (hi != 0) {
1041 shift = clz64(hi);
1042 } else {
1043 shift = clz64(lo) + 64;
1045 /* Normalizing to a binary point of 124 is the
1046 correct adjust for the exponent. However since we're
1047 shifting, we might as well put the binary point back
1048 at 62 where we really want it. Therefore shift as
1049 if we're leaving 1 bit at the top of the word, but
1050 adjust the exponent as if we're leaving 3 bits. */
1051 shift -= 1;
1052 if (shift >= 64) {
1053 lo = lo << (shift - 64);
1054 } else {
1055 hi = (hi << shift) | (lo >> (64 - shift));
1056 lo = hi | ((lo << shift) != 0);
1058 p_exp -= shift - 2;
1063 if (flags & float_muladd_halve_result) {
1064 p_exp -= 1;
1067 /* finally prepare our result */
1068 a.cls = float_class_normal;
1069 a.sign = p_sign ^ sign_flip;
1070 a.exp = p_exp;
1071 a.frac = lo;
1073 return a;
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077 int flags, float_status *status)
1079 FloatParts pa = float16_unpack_canonical(a, status);
1080 FloatParts pb = float16_unpack_canonical(b, status);
1081 FloatParts pc = float16_unpack_canonical(c, status);
1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1084 return float16_round_pack_canonical(pr, status);
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088 int flags, float_status *status)
1090 FloatParts pa = float32_unpack_canonical(a, status);
1091 FloatParts pb = float32_unpack_canonical(b, status);
1092 FloatParts pc = float32_unpack_canonical(c, status);
1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1095 return float32_round_pack_canonical(pr, status);
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099 int flags, float_status *status)
1101 FloatParts pa = float64_unpack_canonical(a, status);
1102 FloatParts pb = float64_unpack_canonical(b, status);
1103 FloatParts pc = float64_unpack_canonical(c, status);
1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1106 return float64_round_pack_canonical(pr, status);
1110 * Returns the result of dividing the floating-point value `a' by the
1111 * corresponding value `b'. The operation is performed according to
1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1117 bool sign = a.sign ^ b.sign;
1119 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120 uint64_t temp_lo, temp_hi;
1121 int exp = a.exp - b.exp;
1122 if (a.frac < b.frac) {
1123 exp -= 1;
1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125 &temp_hi, &temp_lo);
1126 } else {
1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128 &temp_hi, &temp_lo);
1130 /* LSB of quot is set if inexact which roundandpack will use
1131 * to set flags. Yet again we re-use a for the result */
1132 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133 a.sign = sign;
1134 a.exp = exp;
1135 return a;
1137 /* handle all the NaN cases */
1138 if (is_nan(a.cls) || is_nan(b.cls)) {
1139 return pick_nan(a, b, s);
1141 /* 0/0 or Inf/Inf */
1142 if (a.cls == b.cls
1144 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145 s->float_exception_flags |= float_flag_invalid;
1146 a.cls = float_class_dnan;
1147 return a;
1149 /* Div 0 => Inf */
1150 if (b.cls == float_class_zero) {
1151 s->float_exception_flags |= float_flag_divbyzero;
1152 a.cls = float_class_inf;
1153 a.sign = sign;
1154 return a;
1156 /* Inf / x or 0 / x */
1157 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158 a.sign = sign;
1159 return a;
1161 /* Div by Inf */
1162 if (b.cls == float_class_inf) {
1163 a.cls = float_class_zero;
1164 a.sign = sign;
1165 return a;
1167 g_assert_not_reached();
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1172 FloatParts pa = float16_unpack_canonical(a, status);
1173 FloatParts pb = float16_unpack_canonical(b, status);
1174 FloatParts pr = div_floats(pa, pb, status);
1176 return float16_round_pack_canonical(pr, status);
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1181 FloatParts pa = float32_unpack_canonical(a, status);
1182 FloatParts pb = float32_unpack_canonical(b, status);
1183 FloatParts pr = div_floats(pa, pb, status);
1185 return float32_round_pack_canonical(pr, status);
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1190 FloatParts pa = float64_unpack_canonical(a, status);
1191 FloatParts pb = float64_unpack_canonical(b, status);
1192 FloatParts pr = div_floats(pa, pb, status);
1194 return float64_round_pack_canonical(pr, status);
1198 * Rounds the floating-point value `a' to an integer, and returns the
1199 * result as a floating-point value. The operation is performed
1200 * according to the IEC/IEEE Standard for Binary Floating-Point
1201 * Arithmetic.
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1206 if (is_nan(a.cls)) {
1207 return return_nan(a, s);
1210 switch (a.cls) {
1211 case float_class_zero:
1212 case float_class_inf:
1213 case float_class_qnan:
1214 /* already "integral" */
1215 break;
1216 case float_class_normal:
1217 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218 /* already integral */
1219 break;
1221 if (a.exp < 0) {
1222 bool one;
1223 /* all fractional */
1224 s->float_exception_flags |= float_flag_inexact;
1225 switch (rounding_mode) {
1226 case float_round_nearest_even:
1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228 break;
1229 case float_round_ties_away:
1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231 break;
1232 case float_round_to_zero:
1233 one = false;
1234 break;
1235 case float_round_up:
1236 one = !a.sign;
1237 break;
1238 case float_round_down:
1239 one = a.sign;
1240 break;
1241 default:
1242 g_assert_not_reached();
1245 if (one) {
1246 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247 a.exp = 0;
1248 } else {
1249 a.cls = float_class_zero;
1251 } else {
1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253 uint64_t frac_lsbm1 = frac_lsb >> 1;
1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255 uint64_t rnd_mask = rnd_even_mask >> 1;
1256 uint64_t inc;
1258 switch (rounding_mode) {
1259 case float_round_nearest_even:
1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261 break;
1262 case float_round_ties_away:
1263 inc = frac_lsbm1;
1264 break;
1265 case float_round_to_zero:
1266 inc = 0;
1267 break;
1268 case float_round_up:
1269 inc = a.sign ? 0 : rnd_mask;
1270 break;
1271 case float_round_down:
1272 inc = a.sign ? rnd_mask : 0;
1273 break;
1274 default:
1275 g_assert_not_reached();
1278 if (a.frac & rnd_mask) {
1279 s->float_exception_flags |= float_flag_inexact;
1280 a.frac += inc;
1281 a.frac &= ~rnd_mask;
1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283 a.frac >>= 1;
1284 a.exp++;
1288 break;
1289 default:
1290 g_assert_not_reached();
1292 return a;
1295 float16 float16_round_to_int(float16 a, float_status *s)
1297 FloatParts pa = float16_unpack_canonical(a, s);
1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299 return float16_round_pack_canonical(pr, s);
1302 float32 float32_round_to_int(float32 a, float_status *s)
1304 FloatParts pa = float32_unpack_canonical(a, s);
1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306 return float32_round_pack_canonical(pr, s);
1309 float64 float64_round_to_int(float64 a, float_status *s)
1311 FloatParts pa = float64_unpack_canonical(a, s);
1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313 return float64_round_pack_canonical(pr, s);
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1318 FloatParts pa = float64_unpack_canonical(a, s);
1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320 return float64_round_pack_canonical(pr, s);
1324 * Returns the result of converting the floating-point value `a' to
1325 * the two's complement integer format. The conversion is performed
1326 * according to the IEC/IEEE Standard for Binary Floating-Point
1327 * Arithmetic---which means in particular that the conversion is
1328 * rounded according to the current rounding mode. If `a' is a NaN,
1329 * the largest positive integer is returned. Otherwise, if the
1330 * conversion overflows, the largest integer with the same sign as `a'
1331 * is returned.
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335 int64_t min, int64_t max,
1336 float_status *s)
1338 uint64_t r;
1339 int orig_flags = get_float_exception_flags(s);
1340 FloatParts p = round_to_int(in, rmode, s);
1342 switch (p.cls) {
1343 case float_class_snan:
1344 case float_class_qnan:
1345 case float_class_dnan:
1346 case float_class_msnan:
1347 s->float_exception_flags = orig_flags | float_flag_invalid;
1348 return max;
1349 case float_class_inf:
1350 s->float_exception_flags = orig_flags | float_flag_invalid;
1351 return p.sign ? min : max;
1352 case float_class_zero:
1353 return 0;
1354 case float_class_normal:
1355 if (p.exp < DECOMPOSED_BINARY_POINT) {
1356 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1357 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1358 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1359 } else {
1360 r = UINT64_MAX;
1362 if (p.sign) {
1363 if (r < -(uint64_t) min) {
1364 return -r;
1365 } else {
1366 s->float_exception_flags = orig_flags | float_flag_invalid;
1367 return min;
1369 } else {
1370 if (r < max) {
1371 return r;
1372 } else {
1373 s->float_exception_flags = orig_flags | float_flag_invalid;
1374 return max;
1377 default:
1378 g_assert_not_reached();
1382 #define FLOAT_TO_INT(fsz, isz) \
1383 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1384 float_status *s) \
1386 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1387 return round_to_int_and_pack(p, s->float_rounding_mode, \
1388 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1389 s); \
1392 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1393 (float ## fsz a, float_status *s) \
1395 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1396 return round_to_int_and_pack(p, float_round_to_zero, \
1397 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1398 s); \
1401 FLOAT_TO_INT(16, 16)
1402 FLOAT_TO_INT(16, 32)
1403 FLOAT_TO_INT(16, 64)
1405 FLOAT_TO_INT(32, 16)
1406 FLOAT_TO_INT(32, 32)
1407 FLOAT_TO_INT(32, 64)
1409 FLOAT_TO_INT(64, 16)
1410 FLOAT_TO_INT(64, 32)
1411 FLOAT_TO_INT(64, 64)
1413 #undef FLOAT_TO_INT
1416 * Returns the result of converting the floating-point value `a' to
1417 * the unsigned integer format. The conversion is performed according
1418 * to the IEC/IEEE Standard for Binary Floating-Point
1419 * Arithmetic---which means in particular that the conversion is
1420 * rounded according to the current rounding mode. If `a' is a NaN,
1421 * the largest unsigned integer is returned. Otherwise, if the
1422 * conversion overflows, the largest unsigned integer is returned. If
1423 * the 'a' is negative, the result is rounded and zero is returned;
1424 * values that do not round to zero will raise the inexact exception
1425 * flag.
1428 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1429 float_status *s)
1431 int orig_flags = get_float_exception_flags(s);
1432 FloatParts p = round_to_int(in, rmode, s);
1434 switch (p.cls) {
1435 case float_class_snan:
1436 case float_class_qnan:
1437 case float_class_dnan:
1438 case float_class_msnan:
1439 s->float_exception_flags = orig_flags | float_flag_invalid;
1440 return max;
1441 case float_class_inf:
1442 s->float_exception_flags = orig_flags | float_flag_invalid;
1443 return p.sign ? 0 : max;
1444 case float_class_zero:
1445 return 0;
1446 case float_class_normal:
1448 uint64_t r;
1449 if (p.sign) {
1450 s->float_exception_flags = orig_flags | float_flag_invalid;
1451 return 0;
1454 if (p.exp < DECOMPOSED_BINARY_POINT) {
1455 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1456 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1457 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1458 } else {
1459 s->float_exception_flags = orig_flags | float_flag_invalid;
1460 return max;
1463 /* For uint64 this will never trip, but if p.exp is too large
1464 * to shift a decomposed fraction we shall have exited via the
1465 * 3rd leg above.
1467 if (r > max) {
1468 s->float_exception_flags = orig_flags | float_flag_invalid;
1469 return max;
1470 } else {
1471 return r;
1474 default:
1475 g_assert_not_reached();
1479 #define FLOAT_TO_UINT(fsz, isz) \
1480 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1481 float_status *s) \
1483 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1484 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1485 UINT ## isz ## _MAX, s); \
1488 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1489 (float ## fsz a, float_status *s) \
1491 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1492 return round_to_uint_and_pack(p, float_round_to_zero, \
1493 UINT ## isz ## _MAX, s); \
1496 FLOAT_TO_UINT(16, 16)
1497 FLOAT_TO_UINT(16, 32)
1498 FLOAT_TO_UINT(16, 64)
1500 FLOAT_TO_UINT(32, 16)
1501 FLOAT_TO_UINT(32, 32)
1502 FLOAT_TO_UINT(32, 64)
1504 FLOAT_TO_UINT(64, 16)
1505 FLOAT_TO_UINT(64, 32)
1506 FLOAT_TO_UINT(64, 64)
1508 #undef FLOAT_TO_UINT
1511 * Integer to float conversions
1513 * Returns the result of converting the two's complement integer `a'
1514 * to the floating-point format. The conversion is performed according
1515 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1518 static FloatParts int_to_float(int64_t a, float_status *status)
1520 FloatParts r;
1521 if (a == 0) {
1522 r.cls = float_class_zero;
1523 r.sign = false;
1524 } else if (a == (1ULL << 63)) {
1525 r.cls = float_class_normal;
1526 r.sign = true;
1527 r.frac = DECOMPOSED_IMPLICIT_BIT;
1528 r.exp = 63;
1529 } else {
1530 uint64_t f;
1531 if (a < 0) {
1532 f = -a;
1533 r.sign = true;
1534 } else {
1535 f = a;
1536 r.sign = false;
1538 int shift = clz64(f) - 1;
1539 r.cls = float_class_normal;
1540 r.exp = (DECOMPOSED_BINARY_POINT - shift);
1541 r.frac = f << shift;
1544 return r;
1547 float16 int64_to_float16(int64_t a, float_status *status)
1549 FloatParts pa = int_to_float(a, status);
1550 return float16_round_pack_canonical(pa, status);
1553 float16 int32_to_float16(int32_t a, float_status *status)
1555 return int64_to_float16(a, status);
1558 float16 int16_to_float16(int16_t a, float_status *status)
1560 return int64_to_float16(a, status);
1563 float32 int64_to_float32(int64_t a, float_status *status)
1565 FloatParts pa = int_to_float(a, status);
1566 return float32_round_pack_canonical(pa, status);
1569 float32 int32_to_float32(int32_t a, float_status *status)
1571 return int64_to_float32(a, status);
1574 float32 int16_to_float32(int16_t a, float_status *status)
1576 return int64_to_float32(a, status);
1579 float64 int64_to_float64(int64_t a, float_status *status)
1581 FloatParts pa = int_to_float(a, status);
1582 return float64_round_pack_canonical(pa, status);
1585 float64 int32_to_float64(int32_t a, float_status *status)
1587 return int64_to_float64(a, status);
1590 float64 int16_to_float64(int16_t a, float_status *status)
1592 return int64_to_float64(a, status);
1597 * Unsigned Integer to float conversions
1599 * Returns the result of converting the unsigned integer `a' to the
1600 * floating-point format. The conversion is performed according to the
1601 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1604 static FloatParts uint_to_float(uint64_t a, float_status *status)
1606 FloatParts r = { .sign = false};
1608 if (a == 0) {
1609 r.cls = float_class_zero;
1610 } else {
1611 int spare_bits = clz64(a) - 1;
1612 r.cls = float_class_normal;
1613 r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1614 if (spare_bits < 0) {
1615 shift64RightJamming(a, -spare_bits, &a);
1616 r.frac = a;
1617 } else {
1618 r.frac = a << spare_bits;
1622 return r;
1625 float16 uint64_to_float16(uint64_t a, float_status *status)
1627 FloatParts pa = uint_to_float(a, status);
1628 return float16_round_pack_canonical(pa, status);
1631 float16 uint32_to_float16(uint32_t a, float_status *status)
1633 return uint64_to_float16(a, status);
1636 float16 uint16_to_float16(uint16_t a, float_status *status)
1638 return uint64_to_float16(a, status);
1641 float32 uint64_to_float32(uint64_t a, float_status *status)
1643 FloatParts pa = uint_to_float(a, status);
1644 return float32_round_pack_canonical(pa, status);
1647 float32 uint32_to_float32(uint32_t a, float_status *status)
1649 return uint64_to_float32(a, status);
1652 float32 uint16_to_float32(uint16_t a, float_status *status)
1654 return uint64_to_float32(a, status);
1657 float64 uint64_to_float64(uint64_t a, float_status *status)
1659 FloatParts pa = uint_to_float(a, status);
1660 return float64_round_pack_canonical(pa, status);
1663 float64 uint32_to_float64(uint32_t a, float_status *status)
1665 return uint64_to_float64(a, status);
1668 float64 uint16_to_float64(uint16_t a, float_status *status)
1670 return uint64_to_float64(a, status);
1673 /* Float Min/Max */
1674 /* min() and max() functions. These can't be implemented as
1675 * 'compare and pick one input' because that would mishandle
1676 * NaNs and +0 vs -0.
1678 * minnum() and maxnum() functions. These are similar to the min()
1679 * and max() functions but if one of the arguments is a QNaN and
1680 * the other is numerical then the numerical argument is returned.
1681 * SNaNs will get quietened before being returned.
1682 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1683 * and maxNum() operations. min() and max() are the typical min/max
1684 * semantics provided by many CPUs which predate that specification.
1686 * minnummag() and maxnummag() functions correspond to minNumMag()
1687 * and minNumMag() from the IEEE-754 2008.
1689 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1690 bool ieee, bool ismag, float_status *s)
1692 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1693 if (ieee) {
1694 /* Takes two floating-point values `a' and `b', one of
1695 * which is a NaN, and returns the appropriate NaN
1696 * result. If either `a' or `b' is a signaling NaN,
1697 * the invalid exception is raised.
1699 if (is_snan(a.cls) || is_snan(b.cls)) {
1700 return pick_nan(a, b, s);
1701 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1702 return b;
1703 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1704 return a;
1707 return pick_nan(a, b, s);
1708 } else {
1709 int a_exp, b_exp;
1711 switch (a.cls) {
1712 case float_class_normal:
1713 a_exp = a.exp;
1714 break;
1715 case float_class_inf:
1716 a_exp = INT_MAX;
1717 break;
1718 case float_class_zero:
1719 a_exp = INT_MIN;
1720 break;
1721 default:
1722 g_assert_not_reached();
1723 break;
1725 switch (b.cls) {
1726 case float_class_normal:
1727 b_exp = b.exp;
1728 break;
1729 case float_class_inf:
1730 b_exp = INT_MAX;
1731 break;
1732 case float_class_zero:
1733 b_exp = INT_MIN;
1734 break;
1735 default:
1736 g_assert_not_reached();
1737 break;
1740 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1741 bool a_less = a_exp < b_exp;
1742 if (a_exp == b_exp) {
1743 a_less = a.frac < b.frac;
1745 return a_less ^ ismin ? b : a;
1748 if (a.sign == b.sign) {
1749 bool a_less = a_exp < b_exp;
1750 if (a_exp == b_exp) {
1751 a_less = a.frac < b.frac;
1753 return a.sign ^ a_less ^ ismin ? b : a;
1754 } else {
1755 return a.sign ^ ismin ? b : a;
1760 #define MINMAX(sz, name, ismin, isiee, ismag) \
1761 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1762 float_status *s) \
1764 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1765 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1766 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1768 return float ## sz ## _round_pack_canonical(pr, s); \
1771 MINMAX(16, min, true, false, false)
1772 MINMAX(16, minnum, true, true, false)
1773 MINMAX(16, minnummag, true, true, true)
1774 MINMAX(16, max, false, false, false)
1775 MINMAX(16, maxnum, false, true, false)
1776 MINMAX(16, maxnummag, false, true, true)
1778 MINMAX(32, min, true, false, false)
1779 MINMAX(32, minnum, true, true, false)
1780 MINMAX(32, minnummag, true, true, true)
1781 MINMAX(32, max, false, false, false)
1782 MINMAX(32, maxnum, false, true, false)
1783 MINMAX(32, maxnummag, false, true, true)
1785 MINMAX(64, min, true, false, false)
1786 MINMAX(64, minnum, true, true, false)
1787 MINMAX(64, minnummag, true, true, true)
1788 MINMAX(64, max, false, false, false)
1789 MINMAX(64, maxnum, false, true, false)
1790 MINMAX(64, maxnummag, false, true, true)
1792 #undef MINMAX
1794 /* Floating point compare */
1795 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1796 float_status *s)
1798 if (is_nan(a.cls) || is_nan(b.cls)) {
1799 if (!is_quiet ||
1800 a.cls == float_class_snan ||
1801 b.cls == float_class_snan) {
1802 s->float_exception_flags |= float_flag_invalid;
1804 return float_relation_unordered;
1807 if (a.cls == float_class_zero) {
1808 if (b.cls == float_class_zero) {
1809 return float_relation_equal;
1811 return b.sign ? float_relation_greater : float_relation_less;
1812 } else if (b.cls == float_class_zero) {
1813 return a.sign ? float_relation_less : float_relation_greater;
1816 /* The only really important thing about infinity is its sign. If
1817 * both are infinities the sign marks the smallest of the two.
1819 if (a.cls == float_class_inf) {
1820 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1821 return float_relation_equal;
1823 return a.sign ? float_relation_less : float_relation_greater;
1824 } else if (b.cls == float_class_inf) {
1825 return b.sign ? float_relation_greater : float_relation_less;
1828 if (a.sign != b.sign) {
1829 return a.sign ? float_relation_less : float_relation_greater;
1832 if (a.exp == b.exp) {
1833 if (a.frac == b.frac) {
1834 return float_relation_equal;
1836 if (a.sign) {
1837 return a.frac > b.frac ?
1838 float_relation_less : float_relation_greater;
1839 } else {
1840 return a.frac > b.frac ?
1841 float_relation_greater : float_relation_less;
1843 } else {
1844 if (a.sign) {
1845 return a.exp > b.exp ? float_relation_less : float_relation_greater;
1846 } else {
1847 return a.exp > b.exp ? float_relation_greater : float_relation_less;
1852 #define COMPARE(sz) \
1853 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1854 float_status *s) \
1856 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1857 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1858 return compare_floats(pa, pb, false, s); \
1860 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1861 float_status *s) \
1863 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1864 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1865 return compare_floats(pa, pb, true, s); \
1868 COMPARE(16)
1869 COMPARE(32)
1870 COMPARE(64)
1872 #undef COMPARE
1874 /* Multiply A by 2 raised to the power N. */
1875 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1877 if (unlikely(is_nan(a.cls))) {
1878 return return_nan(a, s);
1880 if (a.cls == float_class_normal) {
1881 a.exp += n;
1883 return a;
1886 float16 float16_scalbn(float16 a, int n, float_status *status)
1888 FloatParts pa = float16_unpack_canonical(a, status);
1889 FloatParts pr = scalbn_decomposed(pa, n, status);
1890 return float16_round_pack_canonical(pr, status);
1893 float32 float32_scalbn(float32 a, int n, float_status *status)
1895 FloatParts pa = float32_unpack_canonical(a, status);
1896 FloatParts pr = scalbn_decomposed(pa, n, status);
1897 return float32_round_pack_canonical(pr, status);
1900 float64 float64_scalbn(float64 a, int n, float_status *status)
1902 FloatParts pa = float64_unpack_canonical(a, status);
1903 FloatParts pr = scalbn_decomposed(pa, n, status);
1904 return float64_round_pack_canonical(pr, status);
1908 * Square Root
1910 * The old softfloat code did an approximation step before zeroing in
1911 * on the final result. However for simpleness we just compute the
1912 * square root by iterating down from the implicit bit to enough extra
1913 * bits to ensure we get a correctly rounded result.
1915 * This does mean however the calculation is slower than before,
1916 * especially for 64 bit floats.
1919 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1921 uint64_t a_frac, r_frac, s_frac;
1922 int bit, last_bit;
1924 if (is_nan(a.cls)) {
1925 return return_nan(a, s);
1927 if (a.cls == float_class_zero) {
1928 return a; /* sqrt(+-0) = +-0 */
1930 if (a.sign) {
1931 s->float_exception_flags |= float_flag_invalid;
1932 a.cls = float_class_dnan;
1933 return a;
1935 if (a.cls == float_class_inf) {
1936 return a; /* sqrt(+inf) = +inf */
1939 assert(a.cls == float_class_normal);
1941 /* We need two overflow bits at the top. Adding room for that is a
1942 * right shift. If the exponent is odd, we can discard the low bit
1943 * by multiplying the fraction by 2; that's a left shift. Combine
1944 * those and we shift right if the exponent is even.
1946 a_frac = a.frac;
1947 if (!(a.exp & 1)) {
1948 a_frac >>= 1;
1950 a.exp >>= 1;
1952 /* Bit-by-bit computation of sqrt. */
1953 r_frac = 0;
1954 s_frac = 0;
1956 /* Iterate from implicit bit down to the 3 extra bits to compute a
1957 * properly rounded result. Remember we've inserted one more bit
1958 * at the top, so these positions are one less.
1960 bit = DECOMPOSED_BINARY_POINT - 1;
1961 last_bit = MAX(p->frac_shift - 4, 0);
1962 do {
1963 uint64_t q = 1ULL << bit;
1964 uint64_t t_frac = s_frac + q;
1965 if (t_frac <= a_frac) {
1966 s_frac = t_frac + q;
1967 a_frac -= t_frac;
1968 r_frac += q;
1970 a_frac <<= 1;
1971 } while (--bit >= last_bit);
1973 /* Undo the right shift done above. If there is any remaining
1974 * fraction, the result is inexact. Set the sticky bit.
1976 a.frac = (r_frac << 1) + (a_frac != 0);
1978 return a;
1981 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1983 FloatParts pa = float16_unpack_canonical(a, status);
1984 FloatParts pr = sqrt_float(pa, status, &float16_params);
1985 return float16_round_pack_canonical(pr, status);
1988 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1990 FloatParts pa = float32_unpack_canonical(a, status);
1991 FloatParts pr = sqrt_float(pa, status, &float32_params);
1992 return float32_round_pack_canonical(pr, status);
1995 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
1997 FloatParts pa = float64_unpack_canonical(a, status);
1998 FloatParts pr = sqrt_float(pa, status, &float64_params);
1999 return float64_round_pack_canonical(pr, status);
2003 /*----------------------------------------------------------------------------
2004 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2005 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2006 | input. If `zSign' is 1, the input is negated before being converted to an
2007 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2008 | is simply rounded to an integer, with the inexact exception raised if the
2009 | input cannot be represented exactly as an integer. However, if the fixed-
2010 | point input is too large, the invalid exception is raised and the largest
2011 | positive or negative integer is returned.
2012 *----------------------------------------------------------------------------*/
2014 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2016 int8_t roundingMode;
2017 flag roundNearestEven;
2018 int8_t roundIncrement, roundBits;
2019 int32_t z;
2021 roundingMode = status->float_rounding_mode;
2022 roundNearestEven = ( roundingMode == float_round_nearest_even );
2023 switch (roundingMode) {
2024 case float_round_nearest_even:
2025 case float_round_ties_away:
2026 roundIncrement = 0x40;
2027 break;
2028 case float_round_to_zero:
2029 roundIncrement = 0;
2030 break;
2031 case float_round_up:
2032 roundIncrement = zSign ? 0 : 0x7f;
2033 break;
2034 case float_round_down:
2035 roundIncrement = zSign ? 0x7f : 0;
2036 break;
2037 default:
2038 abort();
2040 roundBits = absZ & 0x7F;
2041 absZ = ( absZ + roundIncrement )>>7;
2042 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2043 z = absZ;
2044 if ( zSign ) z = - z;
2045 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2046 float_raise(float_flag_invalid, status);
2047 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2049 if (roundBits) {
2050 status->float_exception_flags |= float_flag_inexact;
2052 return z;
2056 /*----------------------------------------------------------------------------
2057 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2058 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2059 | and returns the properly rounded 64-bit integer corresponding to the input.
2060 | If `zSign' is 1, the input is negated before being converted to an integer.
2061 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2062 | the inexact exception raised if the input cannot be represented exactly as
2063 | an integer. However, if the fixed-point input is too large, the invalid
2064 | exception is raised and the largest positive or negative integer is
2065 | returned.
2066 *----------------------------------------------------------------------------*/
2068 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2069 float_status *status)
2071 int8_t roundingMode;
2072 flag roundNearestEven, increment;
2073 int64_t z;
2075 roundingMode = status->float_rounding_mode;
2076 roundNearestEven = ( roundingMode == float_round_nearest_even );
2077 switch (roundingMode) {
2078 case float_round_nearest_even:
2079 case float_round_ties_away:
2080 increment = ((int64_t) absZ1 < 0);
2081 break;
2082 case float_round_to_zero:
2083 increment = 0;
2084 break;
2085 case float_round_up:
2086 increment = !zSign && absZ1;
2087 break;
2088 case float_round_down:
2089 increment = zSign && absZ1;
2090 break;
2091 default:
2092 abort();
2094 if ( increment ) {
2095 ++absZ0;
2096 if ( absZ0 == 0 ) goto overflow;
2097 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2099 z = absZ0;
2100 if ( zSign ) z = - z;
2101 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2102 overflow:
2103 float_raise(float_flag_invalid, status);
2104 return
2105 zSign ? (int64_t) LIT64( 0x8000000000000000 )
2106 : LIT64( 0x7FFFFFFFFFFFFFFF );
2108 if (absZ1) {
2109 status->float_exception_flags |= float_flag_inexact;
2111 return z;
2115 /*----------------------------------------------------------------------------
2116 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2117 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2118 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2119 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2120 | with the inexact exception raised if the input cannot be represented exactly
2121 | as an integer. However, if the fixed-point input is too large, the invalid
2122 | exception is raised and the largest unsigned integer is returned.
2123 *----------------------------------------------------------------------------*/
2125 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2126 uint64_t absZ1, float_status *status)
2128 int8_t roundingMode;
2129 flag roundNearestEven, increment;
2131 roundingMode = status->float_rounding_mode;
2132 roundNearestEven = (roundingMode == float_round_nearest_even);
2133 switch (roundingMode) {
2134 case float_round_nearest_even:
2135 case float_round_ties_away:
2136 increment = ((int64_t)absZ1 < 0);
2137 break;
2138 case float_round_to_zero:
2139 increment = 0;
2140 break;
2141 case float_round_up:
2142 increment = !zSign && absZ1;
2143 break;
2144 case float_round_down:
2145 increment = zSign && absZ1;
2146 break;
2147 default:
2148 abort();
2150 if (increment) {
2151 ++absZ0;
2152 if (absZ0 == 0) {
2153 float_raise(float_flag_invalid, status);
2154 return LIT64(0xFFFFFFFFFFFFFFFF);
2156 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2159 if (zSign && absZ0) {
2160 float_raise(float_flag_invalid, status);
2161 return 0;
2164 if (absZ1) {
2165 status->float_exception_flags |= float_flag_inexact;
2167 return absZ0;
2170 /*----------------------------------------------------------------------------
2171 | If `a' is denormal and we are in flush-to-zero mode then set the
2172 | input-denormal exception and return zero. Otherwise just return the value.
2173 *----------------------------------------------------------------------------*/
2174 float32 float32_squash_input_denormal(float32 a, float_status *status)
2176 if (status->flush_inputs_to_zero) {
2177 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2178 float_raise(float_flag_input_denormal, status);
2179 return make_float32(float32_val(a) & 0x80000000);
2182 return a;
2185 /*----------------------------------------------------------------------------
2186 | Normalizes the subnormal single-precision floating-point value represented
2187 | by the denormalized significand `aSig'. The normalized exponent and
2188 | significand are stored at the locations pointed to by `zExpPtr' and
2189 | `zSigPtr', respectively.
2190 *----------------------------------------------------------------------------*/
2192 static void
2193 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2195 int8_t shiftCount;
2197 shiftCount = countLeadingZeros32( aSig ) - 8;
2198 *zSigPtr = aSig<<shiftCount;
2199 *zExpPtr = 1 - shiftCount;
2203 /*----------------------------------------------------------------------------
2204 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2205 | and significand `zSig', and returns the proper single-precision floating-
2206 | point value corresponding to the abstract input. Ordinarily, the abstract
2207 | value is simply rounded and packed into the single-precision format, with
2208 | the inexact exception raised if the abstract input cannot be represented
2209 | exactly. However, if the abstract value is too large, the overflow and
2210 | inexact exceptions are raised and an infinity or maximal finite value is
2211 | returned. If the abstract value is too small, the input value is rounded to
2212 | a subnormal number, and the underflow and inexact exceptions are raised if
2213 | the abstract input cannot be represented exactly as a subnormal single-
2214 | precision floating-point number.
2215 | The input significand `zSig' has its binary point between bits 30
2216 | and 29, which is 7 bits to the left of the usual location. This shifted
2217 | significand must be normalized or smaller. If `zSig' is not normalized,
2218 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2219 | and it must not require rounding. In the usual case that `zSig' is
2220 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2221 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2222 | Binary Floating-Point Arithmetic.
2223 *----------------------------------------------------------------------------*/
2225 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2226 float_status *status)
2228 int8_t roundingMode;
2229 flag roundNearestEven;
2230 int8_t roundIncrement, roundBits;
2231 flag isTiny;
2233 roundingMode = status->float_rounding_mode;
2234 roundNearestEven = ( roundingMode == float_round_nearest_even );
2235 switch (roundingMode) {
2236 case float_round_nearest_even:
2237 case float_round_ties_away:
2238 roundIncrement = 0x40;
2239 break;
2240 case float_round_to_zero:
2241 roundIncrement = 0;
2242 break;
2243 case float_round_up:
2244 roundIncrement = zSign ? 0 : 0x7f;
2245 break;
2246 case float_round_down:
2247 roundIncrement = zSign ? 0x7f : 0;
2248 break;
2249 default:
2250 abort();
2251 break;
2253 roundBits = zSig & 0x7F;
2254 if ( 0xFD <= (uint16_t) zExp ) {
2255 if ( ( 0xFD < zExp )
2256 || ( ( zExp == 0xFD )
2257 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2259 float_raise(float_flag_overflow | float_flag_inexact, status);
2260 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2262 if ( zExp < 0 ) {
2263 if (status->flush_to_zero) {
2264 float_raise(float_flag_output_denormal, status);
2265 return packFloat32(zSign, 0, 0);
2267 isTiny =
2268 (status->float_detect_tininess
2269 == float_tininess_before_rounding)
2270 || ( zExp < -1 )
2271 || ( zSig + roundIncrement < 0x80000000 );
2272 shift32RightJamming( zSig, - zExp, &zSig );
2273 zExp = 0;
2274 roundBits = zSig & 0x7F;
2275 if (isTiny && roundBits) {
2276 float_raise(float_flag_underflow, status);
2280 if (roundBits) {
2281 status->float_exception_flags |= float_flag_inexact;
2283 zSig = ( zSig + roundIncrement )>>7;
2284 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2285 if ( zSig == 0 ) zExp = 0;
2286 return packFloat32( zSign, zExp, zSig );
2290 /*----------------------------------------------------------------------------
2291 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2292 | and significand `zSig', and returns the proper single-precision floating-
2293 | point value corresponding to the abstract input. This routine is just like
2294 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2295 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2296 | floating-point exponent.
2297 *----------------------------------------------------------------------------*/
2299 static float32
2300 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2301 float_status *status)
2303 int8_t shiftCount;
2305 shiftCount = countLeadingZeros32( zSig ) - 1;
2306 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2307 status);
2311 /*----------------------------------------------------------------------------
2312 | If `a' is denormal and we are in flush-to-zero mode then set the
2313 | input-denormal exception and return zero. Otherwise just return the value.
2314 *----------------------------------------------------------------------------*/
2315 float64 float64_squash_input_denormal(float64 a, float_status *status)
2317 if (status->flush_inputs_to_zero) {
2318 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2319 float_raise(float_flag_input_denormal, status);
2320 return make_float64(float64_val(a) & (1ULL << 63));
2323 return a;
2326 /*----------------------------------------------------------------------------
2327 | Normalizes the subnormal double-precision floating-point value represented
2328 | by the denormalized significand `aSig'. The normalized exponent and
2329 | significand are stored at the locations pointed to by `zExpPtr' and
2330 | `zSigPtr', respectively.
2331 *----------------------------------------------------------------------------*/
2333 static void
2334 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2336 int8_t shiftCount;
2338 shiftCount = countLeadingZeros64( aSig ) - 11;
2339 *zSigPtr = aSig<<shiftCount;
2340 *zExpPtr = 1 - shiftCount;
2344 /*----------------------------------------------------------------------------
2345 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2346 | double-precision floating-point value, returning the result. After being
2347 | shifted into the proper positions, the three fields are simply added
2348 | together to form the result. This means that any integer portion of `zSig'
2349 | will be added into the exponent. Since a properly normalized significand
2350 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2351 | than the desired result exponent whenever `zSig' is a complete, normalized
2352 | significand.
2353 *----------------------------------------------------------------------------*/
2355 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2358 return make_float64(
2359 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2363 /*----------------------------------------------------------------------------
2364 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2365 | and significand `zSig', and returns the proper double-precision floating-
2366 | point value corresponding to the abstract input. Ordinarily, the abstract
2367 | value is simply rounded and packed into the double-precision format, with
2368 | the inexact exception raised if the abstract input cannot be represented
2369 | exactly. However, if the abstract value is too large, the overflow and
2370 | inexact exceptions are raised and an infinity or maximal finite value is
2371 | returned. If the abstract value is too small, the input value is rounded to
2372 | a subnormal number, and the underflow and inexact exceptions are raised if
2373 | the abstract input cannot be represented exactly as a subnormal double-
2374 | precision floating-point number.
2375 | The input significand `zSig' has its binary point between bits 62
2376 | and 61, which is 10 bits to the left of the usual location. This shifted
2377 | significand must be normalized or smaller. If `zSig' is not normalized,
2378 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2379 | and it must not require rounding. In the usual case that `zSig' is
2380 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2381 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2382 | Binary Floating-Point Arithmetic.
2383 *----------------------------------------------------------------------------*/
2385 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2386 float_status *status)
2388 int8_t roundingMode;
2389 flag roundNearestEven;
2390 int roundIncrement, roundBits;
2391 flag isTiny;
2393 roundingMode = status->float_rounding_mode;
2394 roundNearestEven = ( roundingMode == float_round_nearest_even );
2395 switch (roundingMode) {
2396 case float_round_nearest_even:
2397 case float_round_ties_away:
2398 roundIncrement = 0x200;
2399 break;
2400 case float_round_to_zero:
2401 roundIncrement = 0;
2402 break;
2403 case float_round_up:
2404 roundIncrement = zSign ? 0 : 0x3ff;
2405 break;
2406 case float_round_down:
2407 roundIncrement = zSign ? 0x3ff : 0;
2408 break;
2409 case float_round_to_odd:
2410 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2411 break;
2412 default:
2413 abort();
2415 roundBits = zSig & 0x3FF;
2416 if ( 0x7FD <= (uint16_t) zExp ) {
2417 if ( ( 0x7FD < zExp )
2418 || ( ( zExp == 0x7FD )
2419 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2421 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2422 roundIncrement != 0;
2423 float_raise(float_flag_overflow | float_flag_inexact, status);
2424 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2426 if ( zExp < 0 ) {
2427 if (status->flush_to_zero) {
2428 float_raise(float_flag_output_denormal, status);
2429 return packFloat64(zSign, 0, 0);
2431 isTiny =
2432 (status->float_detect_tininess
2433 == float_tininess_before_rounding)
2434 || ( zExp < -1 )
2435 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2436 shift64RightJamming( zSig, - zExp, &zSig );
2437 zExp = 0;
2438 roundBits = zSig & 0x3FF;
2439 if (isTiny && roundBits) {
2440 float_raise(float_flag_underflow, status);
2442 if (roundingMode == float_round_to_odd) {
2444 * For round-to-odd case, the roundIncrement depends on
2445 * zSig which just changed.
2447 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2451 if (roundBits) {
2452 status->float_exception_flags |= float_flag_inexact;
2454 zSig = ( zSig + roundIncrement )>>10;
2455 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2456 if ( zSig == 0 ) zExp = 0;
2457 return packFloat64( zSign, zExp, zSig );
2461 /*----------------------------------------------------------------------------
2462 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2463 | and significand `zSig', and returns the proper double-precision floating-
2464 | point value corresponding to the abstract input. This routine is just like
2465 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2466 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2467 | floating-point exponent.
2468 *----------------------------------------------------------------------------*/
2470 static float64
2471 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2472 float_status *status)
2474 int8_t shiftCount;
2476 shiftCount = countLeadingZeros64( zSig ) - 1;
2477 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2478 status);
2482 /*----------------------------------------------------------------------------
2483 | Normalizes the subnormal extended double-precision floating-point value
2484 | represented by the denormalized significand `aSig'. The normalized exponent
2485 | and significand are stored at the locations pointed to by `zExpPtr' and
2486 | `zSigPtr', respectively.
2487 *----------------------------------------------------------------------------*/
2489 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2490 uint64_t *zSigPtr)
2492 int8_t shiftCount;
2494 shiftCount = countLeadingZeros64( aSig );
2495 *zSigPtr = aSig<<shiftCount;
2496 *zExpPtr = 1 - shiftCount;
2499 /*----------------------------------------------------------------------------
2500 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2501 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2502 | and returns the proper extended double-precision floating-point value
2503 | corresponding to the abstract input. Ordinarily, the abstract value is
2504 | rounded and packed into the extended double-precision format, with the
2505 | inexact exception raised if the abstract input cannot be represented
2506 | exactly. However, if the abstract value is too large, the overflow and
2507 | inexact exceptions are raised and an infinity or maximal finite value is
2508 | returned. If the abstract value is too small, the input value is rounded to
2509 | a subnormal number, and the underflow and inexact exceptions are raised if
2510 | the abstract input cannot be represented exactly as a subnormal extended
2511 | double-precision floating-point number.
2512 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2513 | number of bits as single or double precision, respectively. Otherwise, the
2514 | result is rounded to the full precision of the extended double-precision
2515 | format.
2516 | The input significand must be normalized or smaller. If the input
2517 | significand is not normalized, `zExp' must be 0; in that case, the result
2518 | returned is a subnormal number, and it must not require rounding. The
2519 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2520 | Floating-Point Arithmetic.
2521 *----------------------------------------------------------------------------*/
2523 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2524 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2525 float_status *status)
2527 int8_t roundingMode;
2528 flag roundNearestEven, increment, isTiny;
2529 int64_t roundIncrement, roundMask, roundBits;
2531 roundingMode = status->float_rounding_mode;
2532 roundNearestEven = ( roundingMode == float_round_nearest_even );
2533 if ( roundingPrecision == 80 ) goto precision80;
2534 if ( roundingPrecision == 64 ) {
2535 roundIncrement = LIT64( 0x0000000000000400 );
2536 roundMask = LIT64( 0x00000000000007FF );
2538 else if ( roundingPrecision == 32 ) {
2539 roundIncrement = LIT64( 0x0000008000000000 );
2540 roundMask = LIT64( 0x000000FFFFFFFFFF );
2542 else {
2543 goto precision80;
2545 zSig0 |= ( zSig1 != 0 );
2546 switch (roundingMode) {
2547 case float_round_nearest_even:
2548 case float_round_ties_away:
2549 break;
2550 case float_round_to_zero:
2551 roundIncrement = 0;
2552 break;
2553 case float_round_up:
2554 roundIncrement = zSign ? 0 : roundMask;
2555 break;
2556 case float_round_down:
2557 roundIncrement = zSign ? roundMask : 0;
2558 break;
2559 default:
2560 abort();
2562 roundBits = zSig0 & roundMask;
2563 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2564 if ( ( 0x7FFE < zExp )
2565 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2567 goto overflow;
2569 if ( zExp <= 0 ) {
2570 if (status->flush_to_zero) {
2571 float_raise(float_flag_output_denormal, status);
2572 return packFloatx80(zSign, 0, 0);
2574 isTiny =
2575 (status->float_detect_tininess
2576 == float_tininess_before_rounding)
2577 || ( zExp < 0 )
2578 || ( zSig0 <= zSig0 + roundIncrement );
2579 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2580 zExp = 0;
2581 roundBits = zSig0 & roundMask;
2582 if (isTiny && roundBits) {
2583 float_raise(float_flag_underflow, status);
2585 if (roundBits) {
2586 status->float_exception_flags |= float_flag_inexact;
2588 zSig0 += roundIncrement;
2589 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2590 roundIncrement = roundMask + 1;
2591 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2592 roundMask |= roundIncrement;
2594 zSig0 &= ~ roundMask;
2595 return packFloatx80( zSign, zExp, zSig0 );
2598 if (roundBits) {
2599 status->float_exception_flags |= float_flag_inexact;
2601 zSig0 += roundIncrement;
2602 if ( zSig0 < roundIncrement ) {
2603 ++zExp;
2604 zSig0 = LIT64( 0x8000000000000000 );
2606 roundIncrement = roundMask + 1;
2607 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2608 roundMask |= roundIncrement;
2610 zSig0 &= ~ roundMask;
2611 if ( zSig0 == 0 ) zExp = 0;
2612 return packFloatx80( zSign, zExp, zSig0 );
2613 precision80:
2614 switch (roundingMode) {
2615 case float_round_nearest_even:
2616 case float_round_ties_away:
2617 increment = ((int64_t)zSig1 < 0);
2618 break;
2619 case float_round_to_zero:
2620 increment = 0;
2621 break;
2622 case float_round_up:
2623 increment = !zSign && zSig1;
2624 break;
2625 case float_round_down:
2626 increment = zSign && zSig1;
2627 break;
2628 default:
2629 abort();
2631 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2632 if ( ( 0x7FFE < zExp )
2633 || ( ( zExp == 0x7FFE )
2634 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2635 && increment
2638 roundMask = 0;
2639 overflow:
2640 float_raise(float_flag_overflow | float_flag_inexact, status);
2641 if ( ( roundingMode == float_round_to_zero )
2642 || ( zSign && ( roundingMode == float_round_up ) )
2643 || ( ! zSign && ( roundingMode == float_round_down ) )
2645 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2647 return packFloatx80(zSign,
2648 floatx80_infinity_high,
2649 floatx80_infinity_low);
2651 if ( zExp <= 0 ) {
2652 isTiny =
2653 (status->float_detect_tininess
2654 == float_tininess_before_rounding)
2655 || ( zExp < 0 )
2656 || ! increment
2657 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2658 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2659 zExp = 0;
2660 if (isTiny && zSig1) {
2661 float_raise(float_flag_underflow, status);
2663 if (zSig1) {
2664 status->float_exception_flags |= float_flag_inexact;
2666 switch (roundingMode) {
2667 case float_round_nearest_even:
2668 case float_round_ties_away:
2669 increment = ((int64_t)zSig1 < 0);
2670 break;
2671 case float_round_to_zero:
2672 increment = 0;
2673 break;
2674 case float_round_up:
2675 increment = !zSign && zSig1;
2676 break;
2677 case float_round_down:
2678 increment = zSign && zSig1;
2679 break;
2680 default:
2681 abort();
2683 if ( increment ) {
2684 ++zSig0;
2685 zSig0 &=
2686 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2687 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2689 return packFloatx80( zSign, zExp, zSig0 );
2692 if (zSig1) {
2693 status->float_exception_flags |= float_flag_inexact;
2695 if ( increment ) {
2696 ++zSig0;
2697 if ( zSig0 == 0 ) {
2698 ++zExp;
2699 zSig0 = LIT64( 0x8000000000000000 );
2701 else {
2702 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2705 else {
2706 if ( zSig0 == 0 ) zExp = 0;
2708 return packFloatx80( zSign, zExp, zSig0 );
2712 /*----------------------------------------------------------------------------
2713 | Takes an abstract floating-point value having sign `zSign', exponent
2714 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2715 | and returns the proper extended double-precision floating-point value
2716 | corresponding to the abstract input. This routine is just like
2717 | `roundAndPackFloatx80' except that the input significand does not have to be
2718 | normalized.
2719 *----------------------------------------------------------------------------*/
2721 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2722 flag zSign, int32_t zExp,
2723 uint64_t zSig0, uint64_t zSig1,
2724 float_status *status)
2726 int8_t shiftCount;
2728 if ( zSig0 == 0 ) {
2729 zSig0 = zSig1;
2730 zSig1 = 0;
2731 zExp -= 64;
2733 shiftCount = countLeadingZeros64( zSig0 );
2734 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2735 zExp -= shiftCount;
2736 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2737 zSig0, zSig1, status);
2741 /*----------------------------------------------------------------------------
2742 | Returns the least-significant 64 fraction bits of the quadruple-precision
2743 | floating-point value `a'.
2744 *----------------------------------------------------------------------------*/
2746 static inline uint64_t extractFloat128Frac1( float128 a )
2749 return a.low;
2753 /*----------------------------------------------------------------------------
2754 | Returns the most-significant 48 fraction bits of the quadruple-precision
2755 | floating-point value `a'.
2756 *----------------------------------------------------------------------------*/
2758 static inline uint64_t extractFloat128Frac0( float128 a )
2761 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2765 /*----------------------------------------------------------------------------
2766 | Returns the exponent bits of the quadruple-precision floating-point value
2767 | `a'.
2768 *----------------------------------------------------------------------------*/
2770 static inline int32_t extractFloat128Exp( float128 a )
2773 return ( a.high>>48 ) & 0x7FFF;
2777 /*----------------------------------------------------------------------------
2778 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2779 *----------------------------------------------------------------------------*/
2781 static inline flag extractFloat128Sign( float128 a )
2784 return a.high>>63;
2788 /*----------------------------------------------------------------------------
2789 | Normalizes the subnormal quadruple-precision floating-point value
2790 | represented by the denormalized significand formed by the concatenation of
2791 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2792 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2793 | significand are stored at the location pointed to by `zSig0Ptr', and the
2794 | least significant 64 bits of the normalized significand are stored at the
2795 | location pointed to by `zSig1Ptr'.
2796 *----------------------------------------------------------------------------*/
2798 static void
2799 normalizeFloat128Subnormal(
2800 uint64_t aSig0,
2801 uint64_t aSig1,
2802 int32_t *zExpPtr,
2803 uint64_t *zSig0Ptr,
2804 uint64_t *zSig1Ptr
2807 int8_t shiftCount;
2809 if ( aSig0 == 0 ) {
2810 shiftCount = countLeadingZeros64( aSig1 ) - 15;
2811 if ( shiftCount < 0 ) {
2812 *zSig0Ptr = aSig1>>( - shiftCount );
2813 *zSig1Ptr = aSig1<<( shiftCount & 63 );
2815 else {
2816 *zSig0Ptr = aSig1<<shiftCount;
2817 *zSig1Ptr = 0;
2819 *zExpPtr = - shiftCount - 63;
2821 else {
2822 shiftCount = countLeadingZeros64( aSig0 ) - 15;
2823 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2824 *zExpPtr = 1 - shiftCount;
2829 /*----------------------------------------------------------------------------
2830 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2831 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2832 | floating-point value, returning the result. After being shifted into the
2833 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2834 | added together to form the most significant 32 bits of the result. This
2835 | means that any integer portion of `zSig0' will be added into the exponent.
2836 | Since a properly normalized significand will have an integer portion equal
2837 | to 1, the `zExp' input should be 1 less than the desired result exponent
2838 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2839 | significand.
2840 *----------------------------------------------------------------------------*/
2842 static inline float128
2843 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2845 float128 z;
2847 z.low = zSig1;
2848 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2849 return z;
2853 /*----------------------------------------------------------------------------
2854 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2855 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2856 | and `zSig2', and returns the proper quadruple-precision floating-point value
2857 | corresponding to the abstract input. Ordinarily, the abstract value is
2858 | simply rounded and packed into the quadruple-precision format, with the
2859 | inexact exception raised if the abstract input cannot be represented
2860 | exactly. However, if the abstract value is too large, the overflow and
2861 | inexact exceptions are raised and an infinity or maximal finite value is
2862 | returned. If the abstract value is too small, the input value is rounded to
2863 | a subnormal number, and the underflow and inexact exceptions are raised if
2864 | the abstract input cannot be represented exactly as a subnormal quadruple-
2865 | precision floating-point number.
2866 | The input significand must be normalized or smaller. If the input
2867 | significand is not normalized, `zExp' must be 0; in that case, the result
2868 | returned is a subnormal number, and it must not require rounding. In the
2869 | usual case that the input significand is normalized, `zExp' must be 1 less
2870 | than the ``true'' floating-point exponent. The handling of underflow and
2871 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2872 *----------------------------------------------------------------------------*/
2874 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2875 uint64_t zSig0, uint64_t zSig1,
2876 uint64_t zSig2, float_status *status)
2878 int8_t roundingMode;
2879 flag roundNearestEven, increment, isTiny;
2881 roundingMode = status->float_rounding_mode;
2882 roundNearestEven = ( roundingMode == float_round_nearest_even );
2883 switch (roundingMode) {
2884 case float_round_nearest_even:
2885 case float_round_ties_away:
2886 increment = ((int64_t)zSig2 < 0);
2887 break;
2888 case float_round_to_zero:
2889 increment = 0;
2890 break;
2891 case float_round_up:
2892 increment = !zSign && zSig2;
2893 break;
2894 case float_round_down:
2895 increment = zSign && zSig2;
2896 break;
2897 case float_round_to_odd:
2898 increment = !(zSig1 & 0x1) && zSig2;
2899 break;
2900 default:
2901 abort();
2903 if ( 0x7FFD <= (uint32_t) zExp ) {
2904 if ( ( 0x7FFD < zExp )
2905 || ( ( zExp == 0x7FFD )
2906 && eq128(
2907 LIT64( 0x0001FFFFFFFFFFFF ),
2908 LIT64( 0xFFFFFFFFFFFFFFFF ),
2909 zSig0,
2910 zSig1
2912 && increment
2915 float_raise(float_flag_overflow | float_flag_inexact, status);
2916 if ( ( roundingMode == float_round_to_zero )
2917 || ( zSign && ( roundingMode == float_round_up ) )
2918 || ( ! zSign && ( roundingMode == float_round_down ) )
2919 || (roundingMode == float_round_to_odd)
2921 return
2922 packFloat128(
2923 zSign,
2924 0x7FFE,
2925 LIT64( 0x0000FFFFFFFFFFFF ),
2926 LIT64( 0xFFFFFFFFFFFFFFFF )
2929 return packFloat128( zSign, 0x7FFF, 0, 0 );
2931 if ( zExp < 0 ) {
2932 if (status->flush_to_zero) {
2933 float_raise(float_flag_output_denormal, status);
2934 return packFloat128(zSign, 0, 0, 0);
2936 isTiny =
2937 (status->float_detect_tininess
2938 == float_tininess_before_rounding)
2939 || ( zExp < -1 )
2940 || ! increment
2941 || lt128(
2942 zSig0,
2943 zSig1,
2944 LIT64( 0x0001FFFFFFFFFFFF ),
2945 LIT64( 0xFFFFFFFFFFFFFFFF )
2947 shift128ExtraRightJamming(
2948 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2949 zExp = 0;
2950 if (isTiny && zSig2) {
2951 float_raise(float_flag_underflow, status);
2953 switch (roundingMode) {
2954 case float_round_nearest_even:
2955 case float_round_ties_away:
2956 increment = ((int64_t)zSig2 < 0);
2957 break;
2958 case float_round_to_zero:
2959 increment = 0;
2960 break;
2961 case float_round_up:
2962 increment = !zSign && zSig2;
2963 break;
2964 case float_round_down:
2965 increment = zSign && zSig2;
2966 break;
2967 case float_round_to_odd:
2968 increment = !(zSig1 & 0x1) && zSig2;
2969 break;
2970 default:
2971 abort();
2975 if (zSig2) {
2976 status->float_exception_flags |= float_flag_inexact;
2978 if ( increment ) {
2979 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2980 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2982 else {
2983 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2985 return packFloat128( zSign, zExp, zSig0, zSig1 );
2989 /*----------------------------------------------------------------------------
2990 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2991 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2992 | returns the proper quadruple-precision floating-point value corresponding
2993 | to the abstract input. This routine is just like `roundAndPackFloat128'
2994 | except that the input significand has fewer bits and does not have to be
2995 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
2996 | point exponent.
2997 *----------------------------------------------------------------------------*/
2999 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3000 uint64_t zSig0, uint64_t zSig1,
3001 float_status *status)
3003 int8_t shiftCount;
3004 uint64_t zSig2;
3006 if ( zSig0 == 0 ) {
3007 zSig0 = zSig1;
3008 zSig1 = 0;
3009 zExp -= 64;
3011 shiftCount = countLeadingZeros64( zSig0 ) - 15;
3012 if ( 0 <= shiftCount ) {
3013 zSig2 = 0;
3014 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3016 else {
3017 shift128ExtraRightJamming(
3018 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3020 zExp -= shiftCount;
3021 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3026 /*----------------------------------------------------------------------------
3027 | Returns the result of converting the 32-bit two's complement integer `a'
3028 | to the extended double-precision floating-point format. The conversion
3029 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3030 | Arithmetic.
3031 *----------------------------------------------------------------------------*/
3033 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3035 flag zSign;
3036 uint32_t absA;
3037 int8_t shiftCount;
3038 uint64_t zSig;
3040 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3041 zSign = ( a < 0 );
3042 absA = zSign ? - a : a;
3043 shiftCount = countLeadingZeros32( absA ) + 32;
3044 zSig = absA;
3045 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3049 /*----------------------------------------------------------------------------
3050 | Returns the result of converting the 32-bit two's complement integer `a' to
3051 | the quadruple-precision floating-point format. The conversion is performed
3052 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3053 *----------------------------------------------------------------------------*/
3055 float128 int32_to_float128(int32_t a, float_status *status)
3057 flag zSign;
3058 uint32_t absA;
3059 int8_t shiftCount;
3060 uint64_t zSig0;
3062 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3063 zSign = ( a < 0 );
3064 absA = zSign ? - a : a;
3065 shiftCount = countLeadingZeros32( absA ) + 17;
3066 zSig0 = absA;
3067 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3071 /*----------------------------------------------------------------------------
3072 | Returns the result of converting the 64-bit two's complement integer `a'
3073 | to the extended double-precision floating-point format. The conversion
3074 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3075 | Arithmetic.
3076 *----------------------------------------------------------------------------*/
3078 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3080 flag zSign;
3081 uint64_t absA;
3082 int8_t shiftCount;
3084 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3085 zSign = ( a < 0 );
3086 absA = zSign ? - a : a;
3087 shiftCount = countLeadingZeros64( absA );
3088 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3092 /*----------------------------------------------------------------------------
3093 | Returns the result of converting the 64-bit two's complement integer `a' to
3094 | the quadruple-precision floating-point format. The conversion is performed
3095 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3096 *----------------------------------------------------------------------------*/
3098 float128 int64_to_float128(int64_t a, float_status *status)
3100 flag zSign;
3101 uint64_t absA;
3102 int8_t shiftCount;
3103 int32_t zExp;
3104 uint64_t zSig0, zSig1;
3106 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3107 zSign = ( a < 0 );
3108 absA = zSign ? - a : a;
3109 shiftCount = countLeadingZeros64( absA ) + 49;
3110 zExp = 0x406E - shiftCount;
3111 if ( 64 <= shiftCount ) {
3112 zSig1 = 0;
3113 zSig0 = absA;
3114 shiftCount -= 64;
3116 else {
3117 zSig1 = absA;
3118 zSig0 = 0;
3120 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3121 return packFloat128( zSign, zExp, zSig0, zSig1 );
3125 /*----------------------------------------------------------------------------
3126 | Returns the result of converting the 64-bit unsigned integer `a'
3127 | to the quadruple-precision floating-point format. The conversion is performed
3128 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3129 *----------------------------------------------------------------------------*/
3131 float128 uint64_to_float128(uint64_t a, float_status *status)
3133 if (a == 0) {
3134 return float128_zero;
3136 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3142 /*----------------------------------------------------------------------------
3143 | Returns the result of converting the single-precision floating-point value
3144 | `a' to the double-precision floating-point format. The conversion is
3145 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3146 | Arithmetic.
3147 *----------------------------------------------------------------------------*/
3149 float64 float32_to_float64(float32 a, float_status *status)
3151 flag aSign;
3152 int aExp;
3153 uint32_t aSig;
3154 a = float32_squash_input_denormal(a, status);
3156 aSig = extractFloat32Frac( a );
3157 aExp = extractFloat32Exp( a );
3158 aSign = extractFloat32Sign( a );
3159 if ( aExp == 0xFF ) {
3160 if (aSig) {
3161 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3163 return packFloat64( aSign, 0x7FF, 0 );
3165 if ( aExp == 0 ) {
3166 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3167 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3168 --aExp;
3170 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3174 /*----------------------------------------------------------------------------
3175 | Returns the result of converting the single-precision floating-point value
3176 | `a' to the extended double-precision floating-point format. The conversion
3177 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3178 | Arithmetic.
3179 *----------------------------------------------------------------------------*/
3181 floatx80 float32_to_floatx80(float32 a, float_status *status)
3183 flag aSign;
3184 int aExp;
3185 uint32_t aSig;
3187 a = float32_squash_input_denormal(a, status);
3188 aSig = extractFloat32Frac( a );
3189 aExp = extractFloat32Exp( a );
3190 aSign = extractFloat32Sign( a );
3191 if ( aExp == 0xFF ) {
3192 if (aSig) {
3193 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3195 return packFloatx80(aSign,
3196 floatx80_infinity_high,
3197 floatx80_infinity_low);
3199 if ( aExp == 0 ) {
3200 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3201 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3203 aSig |= 0x00800000;
3204 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3208 /*----------------------------------------------------------------------------
3209 | Returns the result of converting the single-precision floating-point value
3210 | `a' to the double-precision floating-point format. The conversion is
3211 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3212 | Arithmetic.
3213 *----------------------------------------------------------------------------*/
3215 float128 float32_to_float128(float32 a, float_status *status)
3217 flag aSign;
3218 int aExp;
3219 uint32_t aSig;
3221 a = float32_squash_input_denormal(a, status);
3222 aSig = extractFloat32Frac( a );
3223 aExp = extractFloat32Exp( a );
3224 aSign = extractFloat32Sign( a );
3225 if ( aExp == 0xFF ) {
3226 if (aSig) {
3227 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3229 return packFloat128( aSign, 0x7FFF, 0, 0 );
3231 if ( aExp == 0 ) {
3232 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3233 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3234 --aExp;
3236 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3240 /*----------------------------------------------------------------------------
3241 | Returns the remainder of the single-precision floating-point value `a'
3242 | with respect to the corresponding value `b'. The operation is performed
3243 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3244 *----------------------------------------------------------------------------*/
3246 float32 float32_rem(float32 a, float32 b, float_status *status)
3248 flag aSign, zSign;
3249 int aExp, bExp, expDiff;
3250 uint32_t aSig, bSig;
3251 uint32_t q;
3252 uint64_t aSig64, bSig64, q64;
3253 uint32_t alternateASig;
3254 int32_t sigMean;
3255 a = float32_squash_input_denormal(a, status);
3256 b = float32_squash_input_denormal(b, status);
3258 aSig = extractFloat32Frac( a );
3259 aExp = extractFloat32Exp( a );
3260 aSign = extractFloat32Sign( a );
3261 bSig = extractFloat32Frac( b );
3262 bExp = extractFloat32Exp( b );
3263 if ( aExp == 0xFF ) {
3264 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3265 return propagateFloat32NaN(a, b, status);
3267 float_raise(float_flag_invalid, status);
3268 return float32_default_nan(status);
3270 if ( bExp == 0xFF ) {
3271 if (bSig) {
3272 return propagateFloat32NaN(a, b, status);
3274 return a;
3276 if ( bExp == 0 ) {
3277 if ( bSig == 0 ) {
3278 float_raise(float_flag_invalid, status);
3279 return float32_default_nan(status);
3281 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3283 if ( aExp == 0 ) {
3284 if ( aSig == 0 ) return a;
3285 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3287 expDiff = aExp - bExp;
3288 aSig |= 0x00800000;
3289 bSig |= 0x00800000;
3290 if ( expDiff < 32 ) {
3291 aSig <<= 8;
3292 bSig <<= 8;
3293 if ( expDiff < 0 ) {
3294 if ( expDiff < -1 ) return a;
3295 aSig >>= 1;
3297 q = ( bSig <= aSig );
3298 if ( q ) aSig -= bSig;
3299 if ( 0 < expDiff ) {
3300 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3301 q >>= 32 - expDiff;
3302 bSig >>= 2;
3303 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3305 else {
3306 aSig >>= 2;
3307 bSig >>= 2;
3310 else {
3311 if ( bSig <= aSig ) aSig -= bSig;
3312 aSig64 = ( (uint64_t) aSig )<<40;
3313 bSig64 = ( (uint64_t) bSig )<<40;
3314 expDiff -= 64;
3315 while ( 0 < expDiff ) {
3316 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3317 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3318 aSig64 = - ( ( bSig * q64 )<<38 );
3319 expDiff -= 62;
3321 expDiff += 64;
3322 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3323 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3324 q = q64>>( 64 - expDiff );
3325 bSig <<= 6;
3326 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3328 do {
3329 alternateASig = aSig;
3330 ++q;
3331 aSig -= bSig;
3332 } while ( 0 <= (int32_t) aSig );
3333 sigMean = aSig + alternateASig;
3334 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3335 aSig = alternateASig;
3337 zSign = ( (int32_t) aSig < 0 );
3338 if ( zSign ) aSig = - aSig;
3339 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3344 /*----------------------------------------------------------------------------
3345 | Returns the binary exponential of the single-precision floating-point value
3346 | `a'. The operation is performed according to the IEC/IEEE Standard for
3347 | Binary Floating-Point Arithmetic.
3349 | Uses the following identities:
3351 | 1. -------------------------------------------------------------------------
3352 | x x*ln(2)
3353 | 2 = e
3355 | 2. -------------------------------------------------------------------------
3356 | 2 3 4 5 n
3357 | x x x x x x x
3358 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3359 | 1! 2! 3! 4! 5! n!
3360 *----------------------------------------------------------------------------*/
3362 static const float64 float32_exp2_coefficients[15] =
3364 const_float64( 0x3ff0000000000000ll ), /* 1 */
3365 const_float64( 0x3fe0000000000000ll ), /* 2 */
3366 const_float64( 0x3fc5555555555555ll ), /* 3 */
3367 const_float64( 0x3fa5555555555555ll ), /* 4 */
3368 const_float64( 0x3f81111111111111ll ), /* 5 */
3369 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3370 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3371 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3372 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3373 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3374 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3375 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3376 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3377 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3378 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3381 float32 float32_exp2(float32 a, float_status *status)
3383 flag aSign;
3384 int aExp;
3385 uint32_t aSig;
3386 float64 r, x, xn;
3387 int i;
3388 a = float32_squash_input_denormal(a, status);
3390 aSig = extractFloat32Frac( a );
3391 aExp = extractFloat32Exp( a );
3392 aSign = extractFloat32Sign( a );
3394 if ( aExp == 0xFF) {
3395 if (aSig) {
3396 return propagateFloat32NaN(a, float32_zero, status);
3398 return (aSign) ? float32_zero : a;
3400 if (aExp == 0) {
3401 if (aSig == 0) return float32_one;
3404 float_raise(float_flag_inexact, status);
3406 /* ******************************* */
3407 /* using float64 for approximation */
3408 /* ******************************* */
3409 x = float32_to_float64(a, status);
3410 x = float64_mul(x, float64_ln2, status);
3412 xn = x;
3413 r = float64_one;
3414 for (i = 0 ; i < 15 ; i++) {
3415 float64 f;
3417 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3418 r = float64_add(r, f, status);
3420 xn = float64_mul(xn, x, status);
3423 return float64_to_float32(r, status);
3426 /*----------------------------------------------------------------------------
3427 | Returns the binary log of the single-precision floating-point value `a'.
3428 | The operation is performed according to the IEC/IEEE Standard for Binary
3429 | Floating-Point Arithmetic.
3430 *----------------------------------------------------------------------------*/
3431 float32 float32_log2(float32 a, float_status *status)
3433 flag aSign, zSign;
3434 int aExp;
3435 uint32_t aSig, zSig, i;
3437 a = float32_squash_input_denormal(a, status);
3438 aSig = extractFloat32Frac( a );
3439 aExp = extractFloat32Exp( a );
3440 aSign = extractFloat32Sign( a );
3442 if ( aExp == 0 ) {
3443 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3444 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3446 if ( aSign ) {
3447 float_raise(float_flag_invalid, status);
3448 return float32_default_nan(status);
3450 if ( aExp == 0xFF ) {
3451 if (aSig) {
3452 return propagateFloat32NaN(a, float32_zero, status);
3454 return a;
3457 aExp -= 0x7F;
3458 aSig |= 0x00800000;
3459 zSign = aExp < 0;
3460 zSig = aExp << 23;
3462 for (i = 1 << 22; i > 0; i >>= 1) {
3463 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3464 if ( aSig & 0x01000000 ) {
3465 aSig >>= 1;
3466 zSig |= i;
3470 if ( zSign )
3471 zSig = -zSig;
3473 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3476 /*----------------------------------------------------------------------------
3477 | Returns 1 if the single-precision floating-point value `a' is equal to
3478 | the corresponding value `b', and 0 otherwise. The invalid exception is
3479 | raised if either operand is a NaN. Otherwise, the comparison is performed
3480 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3481 *----------------------------------------------------------------------------*/
3483 int float32_eq(float32 a, float32 b, float_status *status)
3485 uint32_t av, bv;
3486 a = float32_squash_input_denormal(a, status);
3487 b = float32_squash_input_denormal(b, status);
3489 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3490 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3492 float_raise(float_flag_invalid, status);
3493 return 0;
3495 av = float32_val(a);
3496 bv = float32_val(b);
3497 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3500 /*----------------------------------------------------------------------------
3501 | Returns 1 if the single-precision floating-point value `a' is less than
3502 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3503 | exception is raised if either operand is a NaN. The comparison is performed
3504 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3505 *----------------------------------------------------------------------------*/
3507 int float32_le(float32 a, float32 b, float_status *status)
3509 flag aSign, bSign;
3510 uint32_t av, bv;
3511 a = float32_squash_input_denormal(a, status);
3512 b = float32_squash_input_denormal(b, status);
3514 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3515 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3517 float_raise(float_flag_invalid, status);
3518 return 0;
3520 aSign = extractFloat32Sign( a );
3521 bSign = extractFloat32Sign( b );
3522 av = float32_val(a);
3523 bv = float32_val(b);
3524 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3525 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3529 /*----------------------------------------------------------------------------
3530 | Returns 1 if the single-precision floating-point value `a' is less than
3531 | the corresponding value `b', and 0 otherwise. The invalid exception is
3532 | raised if either operand is a NaN. The comparison is performed according
3533 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3534 *----------------------------------------------------------------------------*/
3536 int float32_lt(float32 a, float32 b, float_status *status)
3538 flag aSign, bSign;
3539 uint32_t av, bv;
3540 a = float32_squash_input_denormal(a, status);
3541 b = float32_squash_input_denormal(b, status);
3543 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3544 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3546 float_raise(float_flag_invalid, status);
3547 return 0;
3549 aSign = extractFloat32Sign( a );
3550 bSign = extractFloat32Sign( b );
3551 av = float32_val(a);
3552 bv = float32_val(b);
3553 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3554 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3558 /*----------------------------------------------------------------------------
3559 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3560 | be compared, and 0 otherwise. The invalid exception is raised if either
3561 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3562 | Standard for Binary Floating-Point Arithmetic.
3563 *----------------------------------------------------------------------------*/
3565 int float32_unordered(float32 a, float32 b, float_status *status)
3567 a = float32_squash_input_denormal(a, status);
3568 b = float32_squash_input_denormal(b, status);
3570 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3571 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3573 float_raise(float_flag_invalid, status);
3574 return 1;
3576 return 0;
3579 /*----------------------------------------------------------------------------
3580 | Returns 1 if the single-precision floating-point value `a' is equal to
3581 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3582 | exception. The comparison is performed according to the IEC/IEEE Standard
3583 | for Binary Floating-Point Arithmetic.
3584 *----------------------------------------------------------------------------*/
3586 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3588 a = float32_squash_input_denormal(a, status);
3589 b = float32_squash_input_denormal(b, status);
3591 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3592 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3594 if (float32_is_signaling_nan(a, status)
3595 || float32_is_signaling_nan(b, status)) {
3596 float_raise(float_flag_invalid, status);
3598 return 0;
3600 return ( float32_val(a) == float32_val(b) ) ||
3601 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3604 /*----------------------------------------------------------------------------
3605 | Returns 1 if the single-precision floating-point value `a' is less than or
3606 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3607 | cause an exception. Otherwise, the comparison is performed according to the
3608 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3609 *----------------------------------------------------------------------------*/
3611 int float32_le_quiet(float32 a, float32 b, float_status *status)
3613 flag aSign, bSign;
3614 uint32_t av, bv;
3615 a = float32_squash_input_denormal(a, status);
3616 b = float32_squash_input_denormal(b, status);
3618 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3619 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3621 if (float32_is_signaling_nan(a, status)
3622 || float32_is_signaling_nan(b, status)) {
3623 float_raise(float_flag_invalid, status);
3625 return 0;
3627 aSign = extractFloat32Sign( a );
3628 bSign = extractFloat32Sign( b );
3629 av = float32_val(a);
3630 bv = float32_val(b);
3631 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3632 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3636 /*----------------------------------------------------------------------------
3637 | Returns 1 if the single-precision floating-point value `a' is less than
3638 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3639 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3640 | Standard for Binary Floating-Point Arithmetic.
3641 *----------------------------------------------------------------------------*/
3643 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3645 flag aSign, bSign;
3646 uint32_t av, bv;
3647 a = float32_squash_input_denormal(a, status);
3648 b = float32_squash_input_denormal(b, status);
3650 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3651 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3653 if (float32_is_signaling_nan(a, status)
3654 || float32_is_signaling_nan(b, status)) {
3655 float_raise(float_flag_invalid, status);
3657 return 0;
3659 aSign = extractFloat32Sign( a );
3660 bSign = extractFloat32Sign( b );
3661 av = float32_val(a);
3662 bv = float32_val(b);
3663 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3664 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3668 /*----------------------------------------------------------------------------
3669 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3670 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3671 | comparison is performed according to the IEC/IEEE Standard for Binary
3672 | Floating-Point Arithmetic.
3673 *----------------------------------------------------------------------------*/
3675 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3677 a = float32_squash_input_denormal(a, status);
3678 b = float32_squash_input_denormal(b, status);
3680 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3681 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3683 if (float32_is_signaling_nan(a, status)
3684 || float32_is_signaling_nan(b, status)) {
3685 float_raise(float_flag_invalid, status);
3687 return 1;
3689 return 0;
3693 /*----------------------------------------------------------------------------
3694 | Returns the result of converting the double-precision floating-point value
3695 | `a' to the single-precision floating-point format. The conversion is
3696 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3697 | Arithmetic.
3698 *----------------------------------------------------------------------------*/
3700 float32 float64_to_float32(float64 a, float_status *status)
3702 flag aSign;
3703 int aExp;
3704 uint64_t aSig;
3705 uint32_t zSig;
3706 a = float64_squash_input_denormal(a, status);
3708 aSig = extractFloat64Frac( a );
3709 aExp = extractFloat64Exp( a );
3710 aSign = extractFloat64Sign( a );
3711 if ( aExp == 0x7FF ) {
3712 if (aSig) {
3713 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3715 return packFloat32( aSign, 0xFF, 0 );
3717 shift64RightJamming( aSig, 22, &aSig );
3718 zSig = aSig;
3719 if ( aExp || zSig ) {
3720 zSig |= 0x40000000;
3721 aExp -= 0x381;
3723 return roundAndPackFloat32(aSign, aExp, zSig, status);
3728 /*----------------------------------------------------------------------------
3729 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3730 | half-precision floating-point value, returning the result. After being
3731 | shifted into the proper positions, the three fields are simply added
3732 | together to form the result. This means that any integer portion of `zSig'
3733 | will be added into the exponent. Since a properly normalized significand
3734 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3735 | than the desired result exponent whenever `zSig' is a complete, normalized
3736 | significand.
3737 *----------------------------------------------------------------------------*/
3738 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3740 return make_float16(
3741 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3744 /*----------------------------------------------------------------------------
3745 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3746 | and significand `zSig', and returns the proper half-precision floating-
3747 | point value corresponding to the abstract input. Ordinarily, the abstract
3748 | value is simply rounded and packed into the half-precision format, with
3749 | the inexact exception raised if the abstract input cannot be represented
3750 | exactly. However, if the abstract value is too large, the overflow and
3751 | inexact exceptions are raised and an infinity or maximal finite value is
3752 | returned. If the abstract value is too small, the input value is rounded to
3753 | a subnormal number, and the underflow and inexact exceptions are raised if
3754 | the abstract input cannot be represented exactly as a subnormal half-
3755 | precision floating-point number.
3756 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3757 | ARM-style "alternative representation", which omits the NaN and Inf
3758 | encodings in order to raise the maximum representable exponent by one.
3759 | The input significand `zSig' has its binary point between bits 22
3760 | and 23, which is 13 bits to the left of the usual location. This shifted
3761 | significand must be normalized or smaller. If `zSig' is not normalized,
3762 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3763 | and it must not require rounding. In the usual case that `zSig' is
3764 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3765 | Note the slightly odd position of the binary point in zSig compared with the
3766 | other roundAndPackFloat functions. This should probably be fixed if we
3767 | need to implement more float16 routines than just conversion.
3768 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3769 | Binary Floating-Point Arithmetic.
3770 *----------------------------------------------------------------------------*/
3772 static float16 roundAndPackFloat16(flag zSign, int zExp,
3773 uint32_t zSig, flag ieee,
3774 float_status *status)
3776 int maxexp = ieee ? 29 : 30;
3777 uint32_t mask;
3778 uint32_t increment;
3779 bool rounding_bumps_exp;
3780 bool is_tiny = false;
3782 /* Calculate the mask of bits of the mantissa which are not
3783 * representable in half-precision and will be lost.
3785 if (zExp < 1) {
3786 /* Will be denormal in halfprec */
3787 mask = 0x00ffffff;
3788 if (zExp >= -11) {
3789 mask >>= 11 + zExp;
3791 } else {
3792 /* Normal number in halfprec */
3793 mask = 0x00001fff;
3796 switch (status->float_rounding_mode) {
3797 case float_round_nearest_even:
3798 increment = (mask + 1) >> 1;
3799 if ((zSig & mask) == increment) {
3800 increment = zSig & (increment << 1);
3802 break;
3803 case float_round_ties_away:
3804 increment = (mask + 1) >> 1;
3805 break;
3806 case float_round_up:
3807 increment = zSign ? 0 : mask;
3808 break;
3809 case float_round_down:
3810 increment = zSign ? mask : 0;
3811 break;
3812 default: /* round_to_zero */
3813 increment = 0;
3814 break;
3817 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3819 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3820 if (ieee) {
3821 float_raise(float_flag_overflow | float_flag_inexact, status);
3822 return packFloat16(zSign, 0x1f, 0);
3823 } else {
3824 float_raise(float_flag_invalid, status);
3825 return packFloat16(zSign, 0x1f, 0x3ff);
3829 if (zExp < 0) {
3830 /* Note that flush-to-zero does not affect half-precision results */
3831 is_tiny =
3832 (status->float_detect_tininess == float_tininess_before_rounding)
3833 || (zExp < -1)
3834 || (!rounding_bumps_exp);
3836 if (zSig & mask) {
3837 float_raise(float_flag_inexact, status);
3838 if (is_tiny) {
3839 float_raise(float_flag_underflow, status);
3843 zSig += increment;
3844 if (rounding_bumps_exp) {
3845 zSig >>= 1;
3846 zExp++;
3849 if (zExp < -10) {
3850 return packFloat16(zSign, 0, 0);
3852 if (zExp < 0) {
3853 zSig >>= -zExp;
3854 zExp = 0;
3856 return packFloat16(zSign, zExp, zSig >> 13);
3859 /*----------------------------------------------------------------------------
3860 | If `a' is denormal and we are in flush-to-zero mode then set the
3861 | input-denormal exception and return zero. Otherwise just return the value.
3862 *----------------------------------------------------------------------------*/
3863 float16 float16_squash_input_denormal(float16 a, float_status *status)
3865 if (status->flush_inputs_to_zero) {
3866 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3867 float_raise(float_flag_input_denormal, status);
3868 return make_float16(float16_val(a) & 0x8000);
3871 return a;
3874 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3875 uint32_t *zSigPtr)
3877 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3878 *zSigPtr = aSig << shiftCount;
3879 *zExpPtr = 1 - shiftCount;
3882 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3883 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3885 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3887 flag aSign;
3888 int aExp;
3889 uint32_t aSig;
3891 aSign = extractFloat16Sign(a);
3892 aExp = extractFloat16Exp(a);
3893 aSig = extractFloat16Frac(a);
3895 if (aExp == 0x1f && ieee) {
3896 if (aSig) {
3897 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3899 return packFloat32(aSign, 0xff, 0);
3901 if (aExp == 0) {
3902 if (aSig == 0) {
3903 return packFloat32(aSign, 0, 0);
3906 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3907 aExp--;
3909 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3912 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3914 flag aSign;
3915 int aExp;
3916 uint32_t aSig;
3918 a = float32_squash_input_denormal(a, status);
3920 aSig = extractFloat32Frac( a );
3921 aExp = extractFloat32Exp( a );
3922 aSign = extractFloat32Sign( a );
3923 if ( aExp == 0xFF ) {
3924 if (aSig) {
3925 /* Input is a NaN */
3926 if (!ieee) {
3927 float_raise(float_flag_invalid, status);
3928 return packFloat16(aSign, 0, 0);
3930 return commonNaNToFloat16(
3931 float32ToCommonNaN(a, status), status);
3933 /* Infinity */
3934 if (!ieee) {
3935 float_raise(float_flag_invalid, status);
3936 return packFloat16(aSign, 0x1f, 0x3ff);
3938 return packFloat16(aSign, 0x1f, 0);
3940 if (aExp == 0 && aSig == 0) {
3941 return packFloat16(aSign, 0, 0);
3943 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3944 * even if the input is denormal; however this is harmless because
3945 * the largest possible single-precision denormal is still smaller
3946 * than the smallest representable half-precision denormal, and so we
3947 * will end up ignoring aSig and returning via the "always return zero"
3948 * codepath.
3950 aSig |= 0x00800000;
3951 aExp -= 0x71;
3953 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3956 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3958 flag aSign;
3959 int aExp;
3960 uint32_t aSig;
3962 aSign = extractFloat16Sign(a);
3963 aExp = extractFloat16Exp(a);
3964 aSig = extractFloat16Frac(a);
3966 if (aExp == 0x1f && ieee) {
3967 if (aSig) {
3968 return commonNaNToFloat64(
3969 float16ToCommonNaN(a, status), status);
3971 return packFloat64(aSign, 0x7ff, 0);
3973 if (aExp == 0) {
3974 if (aSig == 0) {
3975 return packFloat64(aSign, 0, 0);
3978 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3979 aExp--;
3981 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3984 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3986 flag aSign;
3987 int aExp;
3988 uint64_t aSig;
3989 uint32_t zSig;
3991 a = float64_squash_input_denormal(a, status);
3993 aSig = extractFloat64Frac(a);
3994 aExp = extractFloat64Exp(a);
3995 aSign = extractFloat64Sign(a);
3996 if (aExp == 0x7FF) {
3997 if (aSig) {
3998 /* Input is a NaN */
3999 if (!ieee) {
4000 float_raise(float_flag_invalid, status);
4001 return packFloat16(aSign, 0, 0);
4003 return commonNaNToFloat16(
4004 float64ToCommonNaN(a, status), status);
4006 /* Infinity */
4007 if (!ieee) {
4008 float_raise(float_flag_invalid, status);
4009 return packFloat16(aSign, 0x1f, 0x3ff);
4011 return packFloat16(aSign, 0x1f, 0);
4013 shift64RightJamming(aSig, 29, &aSig);
4014 zSig = aSig;
4015 if (aExp == 0 && zSig == 0) {
4016 return packFloat16(aSign, 0, 0);
4018 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4019 * even if the input is denormal; however this is harmless because
4020 * the largest possible single-precision denormal is still smaller
4021 * than the smallest representable half-precision denormal, and so we
4022 * will end up ignoring aSig and returning via the "always return zero"
4023 * codepath.
4025 zSig |= 0x00800000;
4026 aExp -= 0x3F1;
4028 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4031 /*----------------------------------------------------------------------------
4032 | Returns the result of converting the double-precision floating-point value
4033 | `a' to the extended double-precision floating-point format. The conversion
4034 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4035 | Arithmetic.
4036 *----------------------------------------------------------------------------*/
4038 floatx80 float64_to_floatx80(float64 a, float_status *status)
4040 flag aSign;
4041 int aExp;
4042 uint64_t aSig;
4044 a = float64_squash_input_denormal(a, status);
4045 aSig = extractFloat64Frac( a );
4046 aExp = extractFloat64Exp( a );
4047 aSign = extractFloat64Sign( a );
4048 if ( aExp == 0x7FF ) {
4049 if (aSig) {
4050 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4052 return packFloatx80(aSign,
4053 floatx80_infinity_high,
4054 floatx80_infinity_low);
4056 if ( aExp == 0 ) {
4057 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4058 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4060 return
4061 packFloatx80(
4062 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4066 /*----------------------------------------------------------------------------
4067 | Returns the result of converting the double-precision floating-point value
4068 | `a' to the quadruple-precision floating-point format. The conversion is
4069 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4070 | Arithmetic.
4071 *----------------------------------------------------------------------------*/
4073 float128 float64_to_float128(float64 a, float_status *status)
4075 flag aSign;
4076 int aExp;
4077 uint64_t aSig, zSig0, zSig1;
4079 a = float64_squash_input_denormal(a, status);
4080 aSig = extractFloat64Frac( a );
4081 aExp = extractFloat64Exp( a );
4082 aSign = extractFloat64Sign( a );
4083 if ( aExp == 0x7FF ) {
4084 if (aSig) {
4085 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4087 return packFloat128( aSign, 0x7FFF, 0, 0 );
4089 if ( aExp == 0 ) {
4090 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4091 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4092 --aExp;
4094 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4095 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4100 /*----------------------------------------------------------------------------
4101 | Returns the remainder of the double-precision floating-point value `a'
4102 | with respect to the corresponding value `b'. The operation is performed
4103 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4104 *----------------------------------------------------------------------------*/
4106 float64 float64_rem(float64 a, float64 b, float_status *status)
4108 flag aSign, zSign;
4109 int aExp, bExp, expDiff;
4110 uint64_t aSig, bSig;
4111 uint64_t q, alternateASig;
4112 int64_t sigMean;
4114 a = float64_squash_input_denormal(a, status);
4115 b = float64_squash_input_denormal(b, status);
4116 aSig = extractFloat64Frac( a );
4117 aExp = extractFloat64Exp( a );
4118 aSign = extractFloat64Sign( a );
4119 bSig = extractFloat64Frac( b );
4120 bExp = extractFloat64Exp( b );
4121 if ( aExp == 0x7FF ) {
4122 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4123 return propagateFloat64NaN(a, b, status);
4125 float_raise(float_flag_invalid, status);
4126 return float64_default_nan(status);
4128 if ( bExp == 0x7FF ) {
4129 if (bSig) {
4130 return propagateFloat64NaN(a, b, status);
4132 return a;
4134 if ( bExp == 0 ) {
4135 if ( bSig == 0 ) {
4136 float_raise(float_flag_invalid, status);
4137 return float64_default_nan(status);
4139 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4141 if ( aExp == 0 ) {
4142 if ( aSig == 0 ) return a;
4143 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4145 expDiff = aExp - bExp;
4146 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4147 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4148 if ( expDiff < 0 ) {
4149 if ( expDiff < -1 ) return a;
4150 aSig >>= 1;
4152 q = ( bSig <= aSig );
4153 if ( q ) aSig -= bSig;
4154 expDiff -= 64;
4155 while ( 0 < expDiff ) {
4156 q = estimateDiv128To64( aSig, 0, bSig );
4157 q = ( 2 < q ) ? q - 2 : 0;
4158 aSig = - ( ( bSig>>2 ) * q );
4159 expDiff -= 62;
4161 expDiff += 64;
4162 if ( 0 < expDiff ) {
4163 q = estimateDiv128To64( aSig, 0, bSig );
4164 q = ( 2 < q ) ? q - 2 : 0;
4165 q >>= 64 - expDiff;
4166 bSig >>= 2;
4167 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4169 else {
4170 aSig >>= 2;
4171 bSig >>= 2;
4173 do {
4174 alternateASig = aSig;
4175 ++q;
4176 aSig -= bSig;
4177 } while ( 0 <= (int64_t) aSig );
4178 sigMean = aSig + alternateASig;
4179 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4180 aSig = alternateASig;
4182 zSign = ( (int64_t) aSig < 0 );
4183 if ( zSign ) aSig = - aSig;
4184 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4188 /*----------------------------------------------------------------------------
4189 | Returns the binary log of the double-precision floating-point value `a'.
4190 | The operation is performed according to the IEC/IEEE Standard for Binary
4191 | Floating-Point Arithmetic.
4192 *----------------------------------------------------------------------------*/
4193 float64 float64_log2(float64 a, float_status *status)
4195 flag aSign, zSign;
4196 int aExp;
4197 uint64_t aSig, aSig0, aSig1, zSig, i;
4198 a = float64_squash_input_denormal(a, status);
4200 aSig = extractFloat64Frac( a );
4201 aExp = extractFloat64Exp( a );
4202 aSign = extractFloat64Sign( a );
4204 if ( aExp == 0 ) {
4205 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4206 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4208 if ( aSign ) {
4209 float_raise(float_flag_invalid, status);
4210 return float64_default_nan(status);
4212 if ( aExp == 0x7FF ) {
4213 if (aSig) {
4214 return propagateFloat64NaN(a, float64_zero, status);
4216 return a;
4219 aExp -= 0x3FF;
4220 aSig |= LIT64( 0x0010000000000000 );
4221 zSign = aExp < 0;
4222 zSig = (uint64_t)aExp << 52;
4223 for (i = 1LL << 51; i > 0; i >>= 1) {
4224 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4225 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4226 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4227 aSig >>= 1;
4228 zSig |= i;
4232 if ( zSign )
4233 zSig = -zSig;
4234 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4237 /*----------------------------------------------------------------------------
4238 | Returns 1 if the double-precision floating-point value `a' is equal to the
4239 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4240 | if either operand is a NaN. Otherwise, the comparison is performed
4241 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4242 *----------------------------------------------------------------------------*/
4244 int float64_eq(float64 a, float64 b, float_status *status)
4246 uint64_t av, bv;
4247 a = float64_squash_input_denormal(a, status);
4248 b = float64_squash_input_denormal(b, status);
4250 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4251 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4253 float_raise(float_flag_invalid, status);
4254 return 0;
4256 av = float64_val(a);
4257 bv = float64_val(b);
4258 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4262 /*----------------------------------------------------------------------------
4263 | Returns 1 if the double-precision floating-point value `a' is less than or
4264 | equal to the corresponding value `b', and 0 otherwise. The invalid
4265 | exception is raised if either operand is a NaN. The comparison is performed
4266 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4267 *----------------------------------------------------------------------------*/
4269 int float64_le(float64 a, float64 b, float_status *status)
4271 flag aSign, bSign;
4272 uint64_t av, bv;
4273 a = float64_squash_input_denormal(a, status);
4274 b = float64_squash_input_denormal(b, status);
4276 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4277 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4279 float_raise(float_flag_invalid, status);
4280 return 0;
4282 aSign = extractFloat64Sign( a );
4283 bSign = extractFloat64Sign( b );
4284 av = float64_val(a);
4285 bv = float64_val(b);
4286 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4287 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4291 /*----------------------------------------------------------------------------
4292 | Returns 1 if the double-precision floating-point value `a' is less than
4293 | the corresponding value `b', and 0 otherwise. The invalid exception is
4294 | raised if either operand is a NaN. The comparison is performed according
4295 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4296 *----------------------------------------------------------------------------*/
4298 int float64_lt(float64 a, float64 b, float_status *status)
4300 flag aSign, bSign;
4301 uint64_t av, bv;
4303 a = float64_squash_input_denormal(a, status);
4304 b = float64_squash_input_denormal(b, status);
4305 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4306 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4308 float_raise(float_flag_invalid, status);
4309 return 0;
4311 aSign = extractFloat64Sign( a );
4312 bSign = extractFloat64Sign( b );
4313 av = float64_val(a);
4314 bv = float64_val(b);
4315 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4316 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4320 /*----------------------------------------------------------------------------
4321 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4322 | be compared, and 0 otherwise. The invalid exception is raised if either
4323 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4324 | Standard for Binary Floating-Point Arithmetic.
4325 *----------------------------------------------------------------------------*/
4327 int float64_unordered(float64 a, float64 b, float_status *status)
4329 a = float64_squash_input_denormal(a, status);
4330 b = float64_squash_input_denormal(b, status);
4332 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4333 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4335 float_raise(float_flag_invalid, status);
4336 return 1;
4338 return 0;
4341 /*----------------------------------------------------------------------------
4342 | Returns 1 if the double-precision floating-point value `a' is equal to the
4343 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4344 | exception.The comparison is performed according to the IEC/IEEE Standard
4345 | for Binary Floating-Point Arithmetic.
4346 *----------------------------------------------------------------------------*/
4348 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4350 uint64_t av, bv;
4351 a = float64_squash_input_denormal(a, status);
4352 b = float64_squash_input_denormal(b, status);
4354 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4355 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4357 if (float64_is_signaling_nan(a, status)
4358 || float64_is_signaling_nan(b, status)) {
4359 float_raise(float_flag_invalid, status);
4361 return 0;
4363 av = float64_val(a);
4364 bv = float64_val(b);
4365 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4369 /*----------------------------------------------------------------------------
4370 | Returns 1 if the double-precision floating-point value `a' is less than or
4371 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4372 | cause an exception. Otherwise, the comparison is performed according to the
4373 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4374 *----------------------------------------------------------------------------*/
4376 int float64_le_quiet(float64 a, float64 b, float_status *status)
4378 flag aSign, bSign;
4379 uint64_t av, bv;
4380 a = float64_squash_input_denormal(a, status);
4381 b = float64_squash_input_denormal(b, status);
4383 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4384 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4386 if (float64_is_signaling_nan(a, status)
4387 || float64_is_signaling_nan(b, status)) {
4388 float_raise(float_flag_invalid, status);
4390 return 0;
4392 aSign = extractFloat64Sign( a );
4393 bSign = extractFloat64Sign( b );
4394 av = float64_val(a);
4395 bv = float64_val(b);
4396 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4397 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4401 /*----------------------------------------------------------------------------
4402 | Returns 1 if the double-precision floating-point value `a' is less than
4403 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4404 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4405 | Standard for Binary Floating-Point Arithmetic.
4406 *----------------------------------------------------------------------------*/
4408 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4410 flag aSign, bSign;
4411 uint64_t av, bv;
4412 a = float64_squash_input_denormal(a, status);
4413 b = float64_squash_input_denormal(b, status);
4415 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4416 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4418 if (float64_is_signaling_nan(a, status)
4419 || float64_is_signaling_nan(b, status)) {
4420 float_raise(float_flag_invalid, status);
4422 return 0;
4424 aSign = extractFloat64Sign( a );
4425 bSign = extractFloat64Sign( b );
4426 av = float64_val(a);
4427 bv = float64_val(b);
4428 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4429 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4433 /*----------------------------------------------------------------------------
4434 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4435 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4436 | comparison is performed according to the IEC/IEEE Standard for Binary
4437 | Floating-Point Arithmetic.
4438 *----------------------------------------------------------------------------*/
4440 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4442 a = float64_squash_input_denormal(a, status);
4443 b = float64_squash_input_denormal(b, status);
4445 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4446 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4448 if (float64_is_signaling_nan(a, status)
4449 || float64_is_signaling_nan(b, status)) {
4450 float_raise(float_flag_invalid, status);
4452 return 1;
4454 return 0;
4457 /*----------------------------------------------------------------------------
4458 | Returns the result of converting the extended double-precision floating-
4459 | point value `a' to the 32-bit two's complement integer format. The
4460 | conversion is performed according to the IEC/IEEE Standard for Binary
4461 | Floating-Point Arithmetic---which means in particular that the conversion
4462 | is rounded according to the current rounding mode. If `a' is a NaN, the
4463 | largest positive integer is returned. Otherwise, if the conversion
4464 | overflows, the largest integer with the same sign as `a' is returned.
4465 *----------------------------------------------------------------------------*/
4467 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4469 flag aSign;
4470 int32_t aExp, shiftCount;
4471 uint64_t aSig;
4473 if (floatx80_invalid_encoding(a)) {
4474 float_raise(float_flag_invalid, status);
4475 return 1 << 31;
4477 aSig = extractFloatx80Frac( a );
4478 aExp = extractFloatx80Exp( a );
4479 aSign = extractFloatx80Sign( a );
4480 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4481 shiftCount = 0x4037 - aExp;
4482 if ( shiftCount <= 0 ) shiftCount = 1;
4483 shift64RightJamming( aSig, shiftCount, &aSig );
4484 return roundAndPackInt32(aSign, aSig, status);
4488 /*----------------------------------------------------------------------------
4489 | Returns the result of converting the extended double-precision floating-
4490 | point value `a' to the 32-bit two's complement integer format. The
4491 | conversion is performed according to the IEC/IEEE Standard for Binary
4492 | Floating-Point Arithmetic, except that the conversion is always rounded
4493 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4494 | Otherwise, if the conversion overflows, the largest integer with the same
4495 | sign as `a' is returned.
4496 *----------------------------------------------------------------------------*/
4498 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4500 flag aSign;
4501 int32_t aExp, shiftCount;
4502 uint64_t aSig, savedASig;
4503 int32_t z;
4505 if (floatx80_invalid_encoding(a)) {
4506 float_raise(float_flag_invalid, status);
4507 return 1 << 31;
4509 aSig = extractFloatx80Frac( a );
4510 aExp = extractFloatx80Exp( a );
4511 aSign = extractFloatx80Sign( a );
4512 if ( 0x401E < aExp ) {
4513 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4514 goto invalid;
4516 else if ( aExp < 0x3FFF ) {
4517 if (aExp || aSig) {
4518 status->float_exception_flags |= float_flag_inexact;
4520 return 0;
4522 shiftCount = 0x403E - aExp;
4523 savedASig = aSig;
4524 aSig >>= shiftCount;
4525 z = aSig;
4526 if ( aSign ) z = - z;
4527 if ( ( z < 0 ) ^ aSign ) {
4528 invalid:
4529 float_raise(float_flag_invalid, status);
4530 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4532 if ( ( aSig<<shiftCount ) != savedASig ) {
4533 status->float_exception_flags |= float_flag_inexact;
4535 return z;
4539 /*----------------------------------------------------------------------------
4540 | Returns the result of converting the extended double-precision floating-
4541 | point value `a' to the 64-bit two's complement integer format. The
4542 | conversion is performed according to the IEC/IEEE Standard for Binary
4543 | Floating-Point Arithmetic---which means in particular that the conversion
4544 | is rounded according to the current rounding mode. If `a' is a NaN,
4545 | the largest positive integer is returned. Otherwise, if the conversion
4546 | overflows, the largest integer with the same sign as `a' is returned.
4547 *----------------------------------------------------------------------------*/
4549 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4551 flag aSign;
4552 int32_t aExp, shiftCount;
4553 uint64_t aSig, aSigExtra;
4555 if (floatx80_invalid_encoding(a)) {
4556 float_raise(float_flag_invalid, status);
4557 return 1ULL << 63;
4559 aSig = extractFloatx80Frac( a );
4560 aExp = extractFloatx80Exp( a );
4561 aSign = extractFloatx80Sign( a );
4562 shiftCount = 0x403E - aExp;
4563 if ( shiftCount <= 0 ) {
4564 if ( shiftCount ) {
4565 float_raise(float_flag_invalid, status);
4566 if (!aSign || floatx80_is_any_nan(a)) {
4567 return LIT64( 0x7FFFFFFFFFFFFFFF );
4569 return (int64_t) LIT64( 0x8000000000000000 );
4571 aSigExtra = 0;
4573 else {
4574 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4576 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4580 /*----------------------------------------------------------------------------
4581 | Returns the result of converting the extended double-precision floating-
4582 | point value `a' to the 64-bit two's complement integer format. The
4583 | conversion is performed according to the IEC/IEEE Standard for Binary
4584 | Floating-Point Arithmetic, except that the conversion is always rounded
4585 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4586 | Otherwise, if the conversion overflows, the largest integer with the same
4587 | sign as `a' is returned.
4588 *----------------------------------------------------------------------------*/
4590 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4592 flag aSign;
4593 int32_t aExp, shiftCount;
4594 uint64_t aSig;
4595 int64_t z;
4597 if (floatx80_invalid_encoding(a)) {
4598 float_raise(float_flag_invalid, status);
4599 return 1ULL << 63;
4601 aSig = extractFloatx80Frac( a );
4602 aExp = extractFloatx80Exp( a );
4603 aSign = extractFloatx80Sign( a );
4604 shiftCount = aExp - 0x403E;
4605 if ( 0 <= shiftCount ) {
4606 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4607 if ( ( a.high != 0xC03E ) || aSig ) {
4608 float_raise(float_flag_invalid, status);
4609 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4610 return LIT64( 0x7FFFFFFFFFFFFFFF );
4613 return (int64_t) LIT64( 0x8000000000000000 );
4615 else if ( aExp < 0x3FFF ) {
4616 if (aExp | aSig) {
4617 status->float_exception_flags |= float_flag_inexact;
4619 return 0;
4621 z = aSig>>( - shiftCount );
4622 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4623 status->float_exception_flags |= float_flag_inexact;
4625 if ( aSign ) z = - z;
4626 return z;
4630 /*----------------------------------------------------------------------------
4631 | Returns the result of converting the extended double-precision floating-
4632 | point value `a' to the single-precision floating-point format. The
4633 | conversion is performed according to the IEC/IEEE Standard for Binary
4634 | Floating-Point Arithmetic.
4635 *----------------------------------------------------------------------------*/
4637 float32 floatx80_to_float32(floatx80 a, float_status *status)
4639 flag aSign;
4640 int32_t aExp;
4641 uint64_t aSig;
4643 if (floatx80_invalid_encoding(a)) {
4644 float_raise(float_flag_invalid, status);
4645 return float32_default_nan(status);
4647 aSig = extractFloatx80Frac( a );
4648 aExp = extractFloatx80Exp( a );
4649 aSign = extractFloatx80Sign( a );
4650 if ( aExp == 0x7FFF ) {
4651 if ( (uint64_t) ( aSig<<1 ) ) {
4652 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4654 return packFloat32( aSign, 0xFF, 0 );
4656 shift64RightJamming( aSig, 33, &aSig );
4657 if ( aExp || aSig ) aExp -= 0x3F81;
4658 return roundAndPackFloat32(aSign, aExp, aSig, status);
4662 /*----------------------------------------------------------------------------
4663 | Returns the result of converting the extended double-precision floating-
4664 | point value `a' to the double-precision floating-point format. The
4665 | conversion is performed according to the IEC/IEEE Standard for Binary
4666 | Floating-Point Arithmetic.
4667 *----------------------------------------------------------------------------*/
4669 float64 floatx80_to_float64(floatx80 a, float_status *status)
4671 flag aSign;
4672 int32_t aExp;
4673 uint64_t aSig, zSig;
4675 if (floatx80_invalid_encoding(a)) {
4676 float_raise(float_flag_invalid, status);
4677 return float64_default_nan(status);
4679 aSig = extractFloatx80Frac( a );
4680 aExp = extractFloatx80Exp( a );
4681 aSign = extractFloatx80Sign( a );
4682 if ( aExp == 0x7FFF ) {
4683 if ( (uint64_t) ( aSig<<1 ) ) {
4684 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4686 return packFloat64( aSign, 0x7FF, 0 );
4688 shift64RightJamming( aSig, 1, &zSig );
4689 if ( aExp || aSig ) aExp -= 0x3C01;
4690 return roundAndPackFloat64(aSign, aExp, zSig, status);
4694 /*----------------------------------------------------------------------------
4695 | Returns the result of converting the extended double-precision floating-
4696 | point value `a' to the quadruple-precision floating-point format. The
4697 | conversion is performed according to the IEC/IEEE Standard for Binary
4698 | Floating-Point Arithmetic.
4699 *----------------------------------------------------------------------------*/
4701 float128 floatx80_to_float128(floatx80 a, float_status *status)
4703 flag aSign;
4704 int aExp;
4705 uint64_t aSig, zSig0, zSig1;
4707 if (floatx80_invalid_encoding(a)) {
4708 float_raise(float_flag_invalid, status);
4709 return float128_default_nan(status);
4711 aSig = extractFloatx80Frac( a );
4712 aExp = extractFloatx80Exp( a );
4713 aSign = extractFloatx80Sign( a );
4714 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4715 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4717 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4718 return packFloat128( aSign, aExp, zSig0, zSig1 );
4722 /*----------------------------------------------------------------------------
4723 | Rounds the extended double-precision floating-point value `a'
4724 | to the precision provided by floatx80_rounding_precision and returns the
4725 | result as an extended double-precision floating-point value.
4726 | The operation is performed according to the IEC/IEEE Standard for Binary
4727 | Floating-Point Arithmetic.
4728 *----------------------------------------------------------------------------*/
4730 floatx80 floatx80_round(floatx80 a, float_status *status)
4732 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4733 extractFloatx80Sign(a),
4734 extractFloatx80Exp(a),
4735 extractFloatx80Frac(a), 0, status);
4738 /*----------------------------------------------------------------------------
4739 | Rounds the extended double-precision floating-point value `a' to an integer,
4740 | and returns the result as an extended quadruple-precision floating-point
4741 | value. The operation is performed according to the IEC/IEEE Standard for
4742 | Binary Floating-Point Arithmetic.
4743 *----------------------------------------------------------------------------*/
4745 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4747 flag aSign;
4748 int32_t aExp;
4749 uint64_t lastBitMask, roundBitsMask;
4750 floatx80 z;
4752 if (floatx80_invalid_encoding(a)) {
4753 float_raise(float_flag_invalid, status);
4754 return floatx80_default_nan(status);
4756 aExp = extractFloatx80Exp( a );
4757 if ( 0x403E <= aExp ) {
4758 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4759 return propagateFloatx80NaN(a, a, status);
4761 return a;
4763 if ( aExp < 0x3FFF ) {
4764 if ( ( aExp == 0 )
4765 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4766 return a;
4768 status->float_exception_flags |= float_flag_inexact;
4769 aSign = extractFloatx80Sign( a );
4770 switch (status->float_rounding_mode) {
4771 case float_round_nearest_even:
4772 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4774 return
4775 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4777 break;
4778 case float_round_ties_away:
4779 if (aExp == 0x3FFE) {
4780 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4782 break;
4783 case float_round_down:
4784 return
4785 aSign ?
4786 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4787 : packFloatx80( 0, 0, 0 );
4788 case float_round_up:
4789 return
4790 aSign ? packFloatx80( 1, 0, 0 )
4791 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4793 return packFloatx80( aSign, 0, 0 );
4795 lastBitMask = 1;
4796 lastBitMask <<= 0x403E - aExp;
4797 roundBitsMask = lastBitMask - 1;
4798 z = a;
4799 switch (status->float_rounding_mode) {
4800 case float_round_nearest_even:
4801 z.low += lastBitMask>>1;
4802 if ((z.low & roundBitsMask) == 0) {
4803 z.low &= ~lastBitMask;
4805 break;
4806 case float_round_ties_away:
4807 z.low += lastBitMask >> 1;
4808 break;
4809 case float_round_to_zero:
4810 break;
4811 case float_round_up:
4812 if (!extractFloatx80Sign(z)) {
4813 z.low += roundBitsMask;
4815 break;
4816 case float_round_down:
4817 if (extractFloatx80Sign(z)) {
4818 z.low += roundBitsMask;
4820 break;
4821 default:
4822 abort();
4824 z.low &= ~ roundBitsMask;
4825 if ( z.low == 0 ) {
4826 ++z.high;
4827 z.low = LIT64( 0x8000000000000000 );
4829 if (z.low != a.low) {
4830 status->float_exception_flags |= float_flag_inexact;
4832 return z;
4836 /*----------------------------------------------------------------------------
4837 | Returns the result of adding the absolute values of the extended double-
4838 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4839 | negated before being returned. `zSign' is ignored if the result is a NaN.
4840 | The addition is performed according to the IEC/IEEE Standard for Binary
4841 | Floating-Point Arithmetic.
4842 *----------------------------------------------------------------------------*/
4844 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4845 float_status *status)
4847 int32_t aExp, bExp, zExp;
4848 uint64_t aSig, bSig, zSig0, zSig1;
4849 int32_t expDiff;
4851 aSig = extractFloatx80Frac( a );
4852 aExp = extractFloatx80Exp( a );
4853 bSig = extractFloatx80Frac( b );
4854 bExp = extractFloatx80Exp( b );
4855 expDiff = aExp - bExp;
4856 if ( 0 < expDiff ) {
4857 if ( aExp == 0x7FFF ) {
4858 if ((uint64_t)(aSig << 1)) {
4859 return propagateFloatx80NaN(a, b, status);
4861 return a;
4863 if ( bExp == 0 ) --expDiff;
4864 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4865 zExp = aExp;
4867 else if ( expDiff < 0 ) {
4868 if ( bExp == 0x7FFF ) {
4869 if ((uint64_t)(bSig << 1)) {
4870 return propagateFloatx80NaN(a, b, status);
4872 return packFloatx80(zSign,
4873 floatx80_infinity_high,
4874 floatx80_infinity_low);
4876 if ( aExp == 0 ) ++expDiff;
4877 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4878 zExp = bExp;
4880 else {
4881 if ( aExp == 0x7FFF ) {
4882 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4883 return propagateFloatx80NaN(a, b, status);
4885 return a;
4887 zSig1 = 0;
4888 zSig0 = aSig + bSig;
4889 if ( aExp == 0 ) {
4890 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4891 goto roundAndPack;
4893 zExp = aExp;
4894 goto shiftRight1;
4896 zSig0 = aSig + bSig;
4897 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4898 shiftRight1:
4899 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4900 zSig0 |= LIT64( 0x8000000000000000 );
4901 ++zExp;
4902 roundAndPack:
4903 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4904 zSign, zExp, zSig0, zSig1, status);
4907 /*----------------------------------------------------------------------------
4908 | Returns the result of subtracting the absolute values of the extended
4909 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4910 | difference is negated before being returned. `zSign' is ignored if the
4911 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4912 | Standard for Binary Floating-Point Arithmetic.
4913 *----------------------------------------------------------------------------*/
4915 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4916 float_status *status)
4918 int32_t aExp, bExp, zExp;
4919 uint64_t aSig, bSig, zSig0, zSig1;
4920 int32_t expDiff;
4922 aSig = extractFloatx80Frac( a );
4923 aExp = extractFloatx80Exp( a );
4924 bSig = extractFloatx80Frac( b );
4925 bExp = extractFloatx80Exp( b );
4926 expDiff = aExp - bExp;
4927 if ( 0 < expDiff ) goto aExpBigger;
4928 if ( expDiff < 0 ) goto bExpBigger;
4929 if ( aExp == 0x7FFF ) {
4930 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4931 return propagateFloatx80NaN(a, b, status);
4933 float_raise(float_flag_invalid, status);
4934 return floatx80_default_nan(status);
4936 if ( aExp == 0 ) {
4937 aExp = 1;
4938 bExp = 1;
4940 zSig1 = 0;
4941 if ( bSig < aSig ) goto aBigger;
4942 if ( aSig < bSig ) goto bBigger;
4943 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4944 bExpBigger:
4945 if ( bExp == 0x7FFF ) {
4946 if ((uint64_t)(bSig << 1)) {
4947 return propagateFloatx80NaN(a, b, status);
4949 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4950 floatx80_infinity_low);
4952 if ( aExp == 0 ) ++expDiff;
4953 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4954 bBigger:
4955 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4956 zExp = bExp;
4957 zSign ^= 1;
4958 goto normalizeRoundAndPack;
4959 aExpBigger:
4960 if ( aExp == 0x7FFF ) {
4961 if ((uint64_t)(aSig << 1)) {
4962 return propagateFloatx80NaN(a, b, status);
4964 return a;
4966 if ( bExp == 0 ) --expDiff;
4967 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4968 aBigger:
4969 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4970 zExp = aExp;
4971 normalizeRoundAndPack:
4972 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4973 zSign, zExp, zSig0, zSig1, status);
4976 /*----------------------------------------------------------------------------
4977 | Returns the result of adding the extended double-precision floating-point
4978 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4979 | Standard for Binary Floating-Point Arithmetic.
4980 *----------------------------------------------------------------------------*/
4982 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4984 flag aSign, bSign;
4986 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4987 float_raise(float_flag_invalid, status);
4988 return floatx80_default_nan(status);
4990 aSign = extractFloatx80Sign( a );
4991 bSign = extractFloatx80Sign( b );
4992 if ( aSign == bSign ) {
4993 return addFloatx80Sigs(a, b, aSign, status);
4995 else {
4996 return subFloatx80Sigs(a, b, aSign, status);
5001 /*----------------------------------------------------------------------------
5002 | Returns the result of subtracting the extended double-precision floating-
5003 | point values `a' and `b'. The operation is performed according to the
5004 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5005 *----------------------------------------------------------------------------*/
5007 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5009 flag aSign, bSign;
5011 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5012 float_raise(float_flag_invalid, status);
5013 return floatx80_default_nan(status);
5015 aSign = extractFloatx80Sign( a );
5016 bSign = extractFloatx80Sign( b );
5017 if ( aSign == bSign ) {
5018 return subFloatx80Sigs(a, b, aSign, status);
5020 else {
5021 return addFloatx80Sigs(a, b, aSign, status);
5026 /*----------------------------------------------------------------------------
5027 | Returns the result of multiplying the extended double-precision floating-
5028 | point values `a' and `b'. The operation is performed according to the
5029 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5030 *----------------------------------------------------------------------------*/
5032 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5034 flag aSign, bSign, zSign;
5035 int32_t aExp, bExp, zExp;
5036 uint64_t aSig, bSig, zSig0, zSig1;
5038 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5039 float_raise(float_flag_invalid, status);
5040 return floatx80_default_nan(status);
5042 aSig = extractFloatx80Frac( a );
5043 aExp = extractFloatx80Exp( a );
5044 aSign = extractFloatx80Sign( a );
5045 bSig = extractFloatx80Frac( b );
5046 bExp = extractFloatx80Exp( b );
5047 bSign = extractFloatx80Sign( b );
5048 zSign = aSign ^ bSign;
5049 if ( aExp == 0x7FFF ) {
5050 if ( (uint64_t) ( aSig<<1 )
5051 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5052 return propagateFloatx80NaN(a, b, status);
5054 if ( ( bExp | bSig ) == 0 ) goto invalid;
5055 return packFloatx80(zSign, floatx80_infinity_high,
5056 floatx80_infinity_low);
5058 if ( bExp == 0x7FFF ) {
5059 if ((uint64_t)(bSig << 1)) {
5060 return propagateFloatx80NaN(a, b, status);
5062 if ( ( aExp | aSig ) == 0 ) {
5063 invalid:
5064 float_raise(float_flag_invalid, status);
5065 return floatx80_default_nan(status);
5067 return packFloatx80(zSign, floatx80_infinity_high,
5068 floatx80_infinity_low);
5070 if ( aExp == 0 ) {
5071 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5072 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5074 if ( bExp == 0 ) {
5075 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5076 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5078 zExp = aExp + bExp - 0x3FFE;
5079 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5080 if ( 0 < (int64_t) zSig0 ) {
5081 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5082 --zExp;
5084 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5085 zSign, zExp, zSig0, zSig1, status);
5088 /*----------------------------------------------------------------------------
5089 | Returns the result of dividing the extended double-precision floating-point
5090 | value `a' by the corresponding value `b'. The operation is performed
5091 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5092 *----------------------------------------------------------------------------*/
5094 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5096 flag aSign, bSign, zSign;
5097 int32_t aExp, bExp, zExp;
5098 uint64_t aSig, bSig, zSig0, zSig1;
5099 uint64_t rem0, rem1, rem2, term0, term1, term2;
5101 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5102 float_raise(float_flag_invalid, status);
5103 return floatx80_default_nan(status);
5105 aSig = extractFloatx80Frac( a );
5106 aExp = extractFloatx80Exp( a );
5107 aSign = extractFloatx80Sign( a );
5108 bSig = extractFloatx80Frac( b );
5109 bExp = extractFloatx80Exp( b );
5110 bSign = extractFloatx80Sign( b );
5111 zSign = aSign ^ bSign;
5112 if ( aExp == 0x7FFF ) {
5113 if ((uint64_t)(aSig << 1)) {
5114 return propagateFloatx80NaN(a, b, status);
5116 if ( bExp == 0x7FFF ) {
5117 if ((uint64_t)(bSig << 1)) {
5118 return propagateFloatx80NaN(a, b, status);
5120 goto invalid;
5122 return packFloatx80(zSign, floatx80_infinity_high,
5123 floatx80_infinity_low);
5125 if ( bExp == 0x7FFF ) {
5126 if ((uint64_t)(bSig << 1)) {
5127 return propagateFloatx80NaN(a, b, status);
5129 return packFloatx80( zSign, 0, 0 );
5131 if ( bExp == 0 ) {
5132 if ( bSig == 0 ) {
5133 if ( ( aExp | aSig ) == 0 ) {
5134 invalid:
5135 float_raise(float_flag_invalid, status);
5136 return floatx80_default_nan(status);
5138 float_raise(float_flag_divbyzero, status);
5139 return packFloatx80(zSign, floatx80_infinity_high,
5140 floatx80_infinity_low);
5142 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5144 if ( aExp == 0 ) {
5145 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5146 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5148 zExp = aExp - bExp + 0x3FFE;
5149 rem1 = 0;
5150 if ( bSig <= aSig ) {
5151 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5152 ++zExp;
5154 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5155 mul64To128( bSig, zSig0, &term0, &term1 );
5156 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5157 while ( (int64_t) rem0 < 0 ) {
5158 --zSig0;
5159 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5161 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5162 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5163 mul64To128( bSig, zSig1, &term1, &term2 );
5164 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5165 while ( (int64_t) rem1 < 0 ) {
5166 --zSig1;
5167 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5169 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5171 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5172 zSign, zExp, zSig0, zSig1, status);
5175 /*----------------------------------------------------------------------------
5176 | Returns the remainder of the extended double-precision floating-point value
5177 | `a' with respect to the corresponding value `b'. The operation is performed
5178 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5179 *----------------------------------------------------------------------------*/
5181 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5183 flag aSign, zSign;
5184 int32_t aExp, bExp, expDiff;
5185 uint64_t aSig0, aSig1, bSig;
5186 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5188 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5189 float_raise(float_flag_invalid, status);
5190 return floatx80_default_nan(status);
5192 aSig0 = extractFloatx80Frac( a );
5193 aExp = extractFloatx80Exp( a );
5194 aSign = extractFloatx80Sign( a );
5195 bSig = extractFloatx80Frac( b );
5196 bExp = extractFloatx80Exp( b );
5197 if ( aExp == 0x7FFF ) {
5198 if ( (uint64_t) ( aSig0<<1 )
5199 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5200 return propagateFloatx80NaN(a, b, status);
5202 goto invalid;
5204 if ( bExp == 0x7FFF ) {
5205 if ((uint64_t)(bSig << 1)) {
5206 return propagateFloatx80NaN(a, b, status);
5208 return a;
5210 if ( bExp == 0 ) {
5211 if ( bSig == 0 ) {
5212 invalid:
5213 float_raise(float_flag_invalid, status);
5214 return floatx80_default_nan(status);
5216 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5218 if ( aExp == 0 ) {
5219 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5220 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5222 bSig |= LIT64( 0x8000000000000000 );
5223 zSign = aSign;
5224 expDiff = aExp - bExp;
5225 aSig1 = 0;
5226 if ( expDiff < 0 ) {
5227 if ( expDiff < -1 ) return a;
5228 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5229 expDiff = 0;
5231 q = ( bSig <= aSig0 );
5232 if ( q ) aSig0 -= bSig;
5233 expDiff -= 64;
5234 while ( 0 < expDiff ) {
5235 q = estimateDiv128To64( aSig0, aSig1, bSig );
5236 q = ( 2 < q ) ? q - 2 : 0;
5237 mul64To128( bSig, q, &term0, &term1 );
5238 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5239 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5240 expDiff -= 62;
5242 expDiff += 64;
5243 if ( 0 < expDiff ) {
5244 q = estimateDiv128To64( aSig0, aSig1, bSig );
5245 q = ( 2 < q ) ? q - 2 : 0;
5246 q >>= 64 - expDiff;
5247 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5248 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5249 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5250 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5251 ++q;
5252 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5255 else {
5256 term1 = 0;
5257 term0 = bSig;
5259 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5260 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5261 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5262 && ( q & 1 ) )
5264 aSig0 = alternateASig0;
5265 aSig1 = alternateASig1;
5266 zSign = ! zSign;
5268 return
5269 normalizeRoundAndPackFloatx80(
5270 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5274 /*----------------------------------------------------------------------------
5275 | Returns the square root of the extended double-precision floating-point
5276 | value `a'. The operation is performed according to the IEC/IEEE Standard
5277 | for Binary Floating-Point Arithmetic.
5278 *----------------------------------------------------------------------------*/
5280 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5282 flag aSign;
5283 int32_t aExp, zExp;
5284 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5285 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5287 if (floatx80_invalid_encoding(a)) {
5288 float_raise(float_flag_invalid, status);
5289 return floatx80_default_nan(status);
5291 aSig0 = extractFloatx80Frac( a );
5292 aExp = extractFloatx80Exp( a );
5293 aSign = extractFloatx80Sign( a );
5294 if ( aExp == 0x7FFF ) {
5295 if ((uint64_t)(aSig0 << 1)) {
5296 return propagateFloatx80NaN(a, a, status);
5298 if ( ! aSign ) return a;
5299 goto invalid;
5301 if ( aSign ) {
5302 if ( ( aExp | aSig0 ) == 0 ) return a;
5303 invalid:
5304 float_raise(float_flag_invalid, status);
5305 return floatx80_default_nan(status);
5307 if ( aExp == 0 ) {
5308 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5309 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5311 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5312 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5313 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5314 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5315 doubleZSig0 = zSig0<<1;
5316 mul64To128( zSig0, zSig0, &term0, &term1 );
5317 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5318 while ( (int64_t) rem0 < 0 ) {
5319 --zSig0;
5320 doubleZSig0 -= 2;
5321 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5323 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5324 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5325 if ( zSig1 == 0 ) zSig1 = 1;
5326 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5327 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5328 mul64To128( zSig1, zSig1, &term2, &term3 );
5329 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5330 while ( (int64_t) rem1 < 0 ) {
5331 --zSig1;
5332 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5333 term3 |= 1;
5334 term2 |= doubleZSig0;
5335 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5337 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5339 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5340 zSig0 |= doubleZSig0;
5341 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5342 0, zExp, zSig0, zSig1, status);
5345 /*----------------------------------------------------------------------------
5346 | Returns 1 if the extended double-precision floating-point value `a' is equal
5347 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5348 | raised if either operand is a NaN. Otherwise, the comparison is performed
5349 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5350 *----------------------------------------------------------------------------*/
5352 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5355 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5356 || (extractFloatx80Exp(a) == 0x7FFF
5357 && (uint64_t) (extractFloatx80Frac(a) << 1))
5358 || (extractFloatx80Exp(b) == 0x7FFF
5359 && (uint64_t) (extractFloatx80Frac(b) << 1))
5361 float_raise(float_flag_invalid, status);
5362 return 0;
5364 return
5365 ( a.low == b.low )
5366 && ( ( a.high == b.high )
5367 || ( ( a.low == 0 )
5368 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5373 /*----------------------------------------------------------------------------
5374 | Returns 1 if the extended double-precision floating-point value `a' is
5375 | less than or equal to the corresponding value `b', and 0 otherwise. The
5376 | invalid exception is raised if either operand is a NaN. The comparison is
5377 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5378 | Arithmetic.
5379 *----------------------------------------------------------------------------*/
5381 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5383 flag aSign, bSign;
5385 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5386 || (extractFloatx80Exp(a) == 0x7FFF
5387 && (uint64_t) (extractFloatx80Frac(a) << 1))
5388 || (extractFloatx80Exp(b) == 0x7FFF
5389 && (uint64_t) (extractFloatx80Frac(b) << 1))
5391 float_raise(float_flag_invalid, status);
5392 return 0;
5394 aSign = extractFloatx80Sign( a );
5395 bSign = extractFloatx80Sign( b );
5396 if ( aSign != bSign ) {
5397 return
5398 aSign
5399 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5400 == 0 );
5402 return
5403 aSign ? le128( b.high, b.low, a.high, a.low )
5404 : le128( a.high, a.low, b.high, b.low );
5408 /*----------------------------------------------------------------------------
5409 | Returns 1 if the extended double-precision floating-point value `a' is
5410 | less than the corresponding value `b', and 0 otherwise. The invalid
5411 | exception is raised if either operand is a NaN. The comparison is performed
5412 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5413 *----------------------------------------------------------------------------*/
5415 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5417 flag aSign, bSign;
5419 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5420 || (extractFloatx80Exp(a) == 0x7FFF
5421 && (uint64_t) (extractFloatx80Frac(a) << 1))
5422 || (extractFloatx80Exp(b) == 0x7FFF
5423 && (uint64_t) (extractFloatx80Frac(b) << 1))
5425 float_raise(float_flag_invalid, status);
5426 return 0;
5428 aSign = extractFloatx80Sign( a );
5429 bSign = extractFloatx80Sign( b );
5430 if ( aSign != bSign ) {
5431 return
5432 aSign
5433 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5434 != 0 );
5436 return
5437 aSign ? lt128( b.high, b.low, a.high, a.low )
5438 : lt128( a.high, a.low, b.high, b.low );
5442 /*----------------------------------------------------------------------------
5443 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5444 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5445 | either operand is a NaN. The comparison is performed according to the
5446 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5447 *----------------------------------------------------------------------------*/
5448 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5450 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5451 || (extractFloatx80Exp(a) == 0x7FFF
5452 && (uint64_t) (extractFloatx80Frac(a) << 1))
5453 || (extractFloatx80Exp(b) == 0x7FFF
5454 && (uint64_t) (extractFloatx80Frac(b) << 1))
5456 float_raise(float_flag_invalid, status);
5457 return 1;
5459 return 0;
5462 /*----------------------------------------------------------------------------
5463 | Returns 1 if the extended double-precision floating-point value `a' is
5464 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5465 | cause an exception. The comparison is performed according to the IEC/IEEE
5466 | Standard for Binary Floating-Point Arithmetic.
5467 *----------------------------------------------------------------------------*/
5469 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5472 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5473 float_raise(float_flag_invalid, status);
5474 return 0;
5476 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5477 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5478 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5479 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5481 if (floatx80_is_signaling_nan(a, status)
5482 || floatx80_is_signaling_nan(b, status)) {
5483 float_raise(float_flag_invalid, status);
5485 return 0;
5487 return
5488 ( a.low == b.low )
5489 && ( ( a.high == b.high )
5490 || ( ( a.low == 0 )
5491 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5496 /*----------------------------------------------------------------------------
5497 | Returns 1 if the extended double-precision floating-point value `a' is less
5498 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5499 | do not cause an exception. Otherwise, the comparison is performed according
5500 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5501 *----------------------------------------------------------------------------*/
5503 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5505 flag aSign, bSign;
5507 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5508 float_raise(float_flag_invalid, status);
5509 return 0;
5511 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5512 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5513 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5514 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5516 if (floatx80_is_signaling_nan(a, status)
5517 || floatx80_is_signaling_nan(b, status)) {
5518 float_raise(float_flag_invalid, status);
5520 return 0;
5522 aSign = extractFloatx80Sign( a );
5523 bSign = extractFloatx80Sign( b );
5524 if ( aSign != bSign ) {
5525 return
5526 aSign
5527 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5528 == 0 );
5530 return
5531 aSign ? le128( b.high, b.low, a.high, a.low )
5532 : le128( a.high, a.low, b.high, b.low );
5536 /*----------------------------------------------------------------------------
5537 | Returns 1 if the extended double-precision floating-point value `a' is less
5538 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5539 | an exception. Otherwise, the comparison is performed according to the
5540 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5541 *----------------------------------------------------------------------------*/
5543 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5545 flag aSign, bSign;
5547 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5548 float_raise(float_flag_invalid, status);
5549 return 0;
5551 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5552 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5553 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5554 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5556 if (floatx80_is_signaling_nan(a, status)
5557 || floatx80_is_signaling_nan(b, status)) {
5558 float_raise(float_flag_invalid, status);
5560 return 0;
5562 aSign = extractFloatx80Sign( a );
5563 bSign = extractFloatx80Sign( b );
5564 if ( aSign != bSign ) {
5565 return
5566 aSign
5567 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5568 != 0 );
5570 return
5571 aSign ? lt128( b.high, b.low, a.high, a.low )
5572 : lt128( a.high, a.low, b.high, b.low );
5576 /*----------------------------------------------------------------------------
5577 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5578 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5579 | The comparison is performed according to the IEC/IEEE Standard for Binary
5580 | Floating-Point Arithmetic.
5581 *----------------------------------------------------------------------------*/
5582 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5584 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5585 float_raise(float_flag_invalid, status);
5586 return 1;
5588 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5589 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5590 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5591 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5593 if (floatx80_is_signaling_nan(a, status)
5594 || floatx80_is_signaling_nan(b, status)) {
5595 float_raise(float_flag_invalid, status);
5597 return 1;
5599 return 0;
5602 /*----------------------------------------------------------------------------
5603 | Returns the result of converting the quadruple-precision floating-point
5604 | value `a' to the 32-bit two's complement integer format. The conversion
5605 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5606 | Arithmetic---which means in particular that the conversion is rounded
5607 | according to the current rounding mode. If `a' is a NaN, the largest
5608 | positive integer is returned. Otherwise, if the conversion overflows, the
5609 | largest integer with the same sign as `a' is returned.
5610 *----------------------------------------------------------------------------*/
5612 int32_t float128_to_int32(float128 a, float_status *status)
5614 flag aSign;
5615 int32_t aExp, shiftCount;
5616 uint64_t aSig0, aSig1;
5618 aSig1 = extractFloat128Frac1( a );
5619 aSig0 = extractFloat128Frac0( a );
5620 aExp = extractFloat128Exp( a );
5621 aSign = extractFloat128Sign( a );
5622 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5623 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5624 aSig0 |= ( aSig1 != 0 );
5625 shiftCount = 0x4028 - aExp;
5626 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5627 return roundAndPackInt32(aSign, aSig0, status);
5631 /*----------------------------------------------------------------------------
5632 | Returns the result of converting the quadruple-precision floating-point
5633 | value `a' to the 32-bit two's complement integer format. The conversion
5634 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5635 | Arithmetic, except that the conversion is always rounded toward zero. If
5636 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5637 | conversion overflows, the largest integer with the same sign as `a' is
5638 | returned.
5639 *----------------------------------------------------------------------------*/
5641 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5643 flag aSign;
5644 int32_t aExp, shiftCount;
5645 uint64_t aSig0, aSig1, savedASig;
5646 int32_t z;
5648 aSig1 = extractFloat128Frac1( a );
5649 aSig0 = extractFloat128Frac0( a );
5650 aExp = extractFloat128Exp( a );
5651 aSign = extractFloat128Sign( a );
5652 aSig0 |= ( aSig1 != 0 );
5653 if ( 0x401E < aExp ) {
5654 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5655 goto invalid;
5657 else if ( aExp < 0x3FFF ) {
5658 if (aExp || aSig0) {
5659 status->float_exception_flags |= float_flag_inexact;
5661 return 0;
5663 aSig0 |= LIT64( 0x0001000000000000 );
5664 shiftCount = 0x402F - aExp;
5665 savedASig = aSig0;
5666 aSig0 >>= shiftCount;
5667 z = aSig0;
5668 if ( aSign ) z = - z;
5669 if ( ( z < 0 ) ^ aSign ) {
5670 invalid:
5671 float_raise(float_flag_invalid, status);
5672 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5674 if ( ( aSig0<<shiftCount ) != savedASig ) {
5675 status->float_exception_flags |= float_flag_inexact;
5677 return z;
5681 /*----------------------------------------------------------------------------
5682 | Returns the result of converting the quadruple-precision floating-point
5683 | value `a' to the 64-bit two's complement integer format. The conversion
5684 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5685 | Arithmetic---which means in particular that the conversion is rounded
5686 | according to the current rounding mode. If `a' is a NaN, the largest
5687 | positive integer is returned. Otherwise, if the conversion overflows, the
5688 | largest integer with the same sign as `a' is returned.
5689 *----------------------------------------------------------------------------*/
5691 int64_t float128_to_int64(float128 a, float_status *status)
5693 flag aSign;
5694 int32_t aExp, shiftCount;
5695 uint64_t aSig0, aSig1;
5697 aSig1 = extractFloat128Frac1( a );
5698 aSig0 = extractFloat128Frac0( a );
5699 aExp = extractFloat128Exp( a );
5700 aSign = extractFloat128Sign( a );
5701 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5702 shiftCount = 0x402F - aExp;
5703 if ( shiftCount <= 0 ) {
5704 if ( 0x403E < aExp ) {
5705 float_raise(float_flag_invalid, status);
5706 if ( ! aSign
5707 || ( ( aExp == 0x7FFF )
5708 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5711 return LIT64( 0x7FFFFFFFFFFFFFFF );
5713 return (int64_t) LIT64( 0x8000000000000000 );
5715 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5717 else {
5718 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5720 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5724 /*----------------------------------------------------------------------------
5725 | Returns the result of converting the quadruple-precision floating-point
5726 | value `a' to the 64-bit two's complement integer format. The conversion
5727 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5728 | Arithmetic, except that the conversion is always rounded toward zero.
5729 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5730 | the conversion overflows, the largest integer with the same sign as `a' is
5731 | returned.
5732 *----------------------------------------------------------------------------*/
5734 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5736 flag aSign;
5737 int32_t aExp, shiftCount;
5738 uint64_t aSig0, aSig1;
5739 int64_t z;
5741 aSig1 = extractFloat128Frac1( a );
5742 aSig0 = extractFloat128Frac0( a );
5743 aExp = extractFloat128Exp( a );
5744 aSign = extractFloat128Sign( a );
5745 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5746 shiftCount = aExp - 0x402F;
5747 if ( 0 < shiftCount ) {
5748 if ( 0x403E <= aExp ) {
5749 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5750 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5751 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5752 if (aSig1) {
5753 status->float_exception_flags |= float_flag_inexact;
5756 else {
5757 float_raise(float_flag_invalid, status);
5758 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5759 return LIT64( 0x7FFFFFFFFFFFFFFF );
5762 return (int64_t) LIT64( 0x8000000000000000 );
5764 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5765 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5766 status->float_exception_flags |= float_flag_inexact;
5769 else {
5770 if ( aExp < 0x3FFF ) {
5771 if ( aExp | aSig0 | aSig1 ) {
5772 status->float_exception_flags |= float_flag_inexact;
5774 return 0;
5776 z = aSig0>>( - shiftCount );
5777 if ( aSig1
5778 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5779 status->float_exception_flags |= float_flag_inexact;
5782 if ( aSign ) z = - z;
5783 return z;
5787 /*----------------------------------------------------------------------------
5788 | Returns the result of converting the quadruple-precision floating-point value
5789 | `a' to the 64-bit unsigned integer format. The conversion is
5790 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5791 | Arithmetic---which means in particular that the conversion is rounded
5792 | according to the current rounding mode. If `a' is a NaN, the largest
5793 | positive integer is returned. If the conversion overflows, the
5794 | largest unsigned integer is returned. If 'a' is negative, the value is
5795 | rounded and zero is returned; negative values that do not round to zero
5796 | will raise the inexact exception.
5797 *----------------------------------------------------------------------------*/
5799 uint64_t float128_to_uint64(float128 a, float_status *status)
5801 flag aSign;
5802 int aExp;
5803 int shiftCount;
5804 uint64_t aSig0, aSig1;
5806 aSig0 = extractFloat128Frac0(a);
5807 aSig1 = extractFloat128Frac1(a);
5808 aExp = extractFloat128Exp(a);
5809 aSign = extractFloat128Sign(a);
5810 if (aSign && (aExp > 0x3FFE)) {
5811 float_raise(float_flag_invalid, status);
5812 if (float128_is_any_nan(a)) {
5813 return LIT64(0xFFFFFFFFFFFFFFFF);
5814 } else {
5815 return 0;
5818 if (aExp) {
5819 aSig0 |= LIT64(0x0001000000000000);
5821 shiftCount = 0x402F - aExp;
5822 if (shiftCount <= 0) {
5823 if (0x403E < aExp) {
5824 float_raise(float_flag_invalid, status);
5825 return LIT64(0xFFFFFFFFFFFFFFFF);
5827 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5828 } else {
5829 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5831 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5834 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5836 uint64_t v;
5837 signed char current_rounding_mode = status->float_rounding_mode;
5839 set_float_rounding_mode(float_round_to_zero, status);
5840 v = float128_to_uint64(a, status);
5841 set_float_rounding_mode(current_rounding_mode, status);
5843 return v;
5846 /*----------------------------------------------------------------------------
5847 | Returns the result of converting the quadruple-precision floating-point
5848 | value `a' to the 32-bit unsigned integer format. The conversion
5849 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5850 | Arithmetic except that the conversion is always rounded toward zero.
5851 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5852 | if the conversion overflows, the largest unsigned integer is returned.
5853 | If 'a' is negative, the value is rounded and zero is returned; negative
5854 | values that do not round to zero will raise the inexact exception.
5855 *----------------------------------------------------------------------------*/
5857 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5859 uint64_t v;
5860 uint32_t res;
5861 int old_exc_flags = get_float_exception_flags(status);
5863 v = float128_to_uint64_round_to_zero(a, status);
5864 if (v > 0xffffffff) {
5865 res = 0xffffffff;
5866 } else {
5867 return v;
5869 set_float_exception_flags(old_exc_flags, status);
5870 float_raise(float_flag_invalid, status);
5871 return res;
5874 /*----------------------------------------------------------------------------
5875 | Returns the result of converting the quadruple-precision floating-point
5876 | value `a' to the single-precision floating-point format. The conversion
5877 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5878 | Arithmetic.
5879 *----------------------------------------------------------------------------*/
5881 float32 float128_to_float32(float128 a, float_status *status)
5883 flag aSign;
5884 int32_t aExp;
5885 uint64_t aSig0, aSig1;
5886 uint32_t zSig;
5888 aSig1 = extractFloat128Frac1( a );
5889 aSig0 = extractFloat128Frac0( a );
5890 aExp = extractFloat128Exp( a );
5891 aSign = extractFloat128Sign( a );
5892 if ( aExp == 0x7FFF ) {
5893 if ( aSig0 | aSig1 ) {
5894 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5896 return packFloat32( aSign, 0xFF, 0 );
5898 aSig0 |= ( aSig1 != 0 );
5899 shift64RightJamming( aSig0, 18, &aSig0 );
5900 zSig = aSig0;
5901 if ( aExp || zSig ) {
5902 zSig |= 0x40000000;
5903 aExp -= 0x3F81;
5905 return roundAndPackFloat32(aSign, aExp, zSig, status);
5909 /*----------------------------------------------------------------------------
5910 | Returns the result of converting the quadruple-precision floating-point
5911 | value `a' to the double-precision floating-point format. The conversion
5912 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5913 | Arithmetic.
5914 *----------------------------------------------------------------------------*/
5916 float64 float128_to_float64(float128 a, float_status *status)
5918 flag aSign;
5919 int32_t aExp;
5920 uint64_t aSig0, aSig1;
5922 aSig1 = extractFloat128Frac1( a );
5923 aSig0 = extractFloat128Frac0( a );
5924 aExp = extractFloat128Exp( a );
5925 aSign = extractFloat128Sign( a );
5926 if ( aExp == 0x7FFF ) {
5927 if ( aSig0 | aSig1 ) {
5928 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5930 return packFloat64( aSign, 0x7FF, 0 );
5932 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5933 aSig0 |= ( aSig1 != 0 );
5934 if ( aExp || aSig0 ) {
5935 aSig0 |= LIT64( 0x4000000000000000 );
5936 aExp -= 0x3C01;
5938 return roundAndPackFloat64(aSign, aExp, aSig0, status);
5942 /*----------------------------------------------------------------------------
5943 | Returns the result of converting the quadruple-precision floating-point
5944 | value `a' to the extended double-precision floating-point format. The
5945 | conversion is performed according to the IEC/IEEE Standard for Binary
5946 | Floating-Point Arithmetic.
5947 *----------------------------------------------------------------------------*/
5949 floatx80 float128_to_floatx80(float128 a, float_status *status)
5951 flag aSign;
5952 int32_t aExp;
5953 uint64_t aSig0, aSig1;
5955 aSig1 = extractFloat128Frac1( a );
5956 aSig0 = extractFloat128Frac0( a );
5957 aExp = extractFloat128Exp( a );
5958 aSign = extractFloat128Sign( a );
5959 if ( aExp == 0x7FFF ) {
5960 if ( aSig0 | aSig1 ) {
5961 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5963 return packFloatx80(aSign, floatx80_infinity_high,
5964 floatx80_infinity_low);
5966 if ( aExp == 0 ) {
5967 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5968 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5970 else {
5971 aSig0 |= LIT64( 0x0001000000000000 );
5973 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5974 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5978 /*----------------------------------------------------------------------------
5979 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5980 | returns the result as a quadruple-precision floating-point value. The
5981 | operation is performed according to the IEC/IEEE Standard for Binary
5982 | Floating-Point Arithmetic.
5983 *----------------------------------------------------------------------------*/
5985 float128 float128_round_to_int(float128 a, float_status *status)
5987 flag aSign;
5988 int32_t aExp;
5989 uint64_t lastBitMask, roundBitsMask;
5990 float128 z;
5992 aExp = extractFloat128Exp( a );
5993 if ( 0x402F <= aExp ) {
5994 if ( 0x406F <= aExp ) {
5995 if ( ( aExp == 0x7FFF )
5996 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5998 return propagateFloat128NaN(a, a, status);
6000 return a;
6002 lastBitMask = 1;
6003 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6004 roundBitsMask = lastBitMask - 1;
6005 z = a;
6006 switch (status->float_rounding_mode) {
6007 case float_round_nearest_even:
6008 if ( lastBitMask ) {
6009 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6010 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6012 else {
6013 if ( (int64_t) z.low < 0 ) {
6014 ++z.high;
6015 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6018 break;
6019 case float_round_ties_away:
6020 if (lastBitMask) {
6021 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6022 } else {
6023 if ((int64_t) z.low < 0) {
6024 ++z.high;
6027 break;
6028 case float_round_to_zero:
6029 break;
6030 case float_round_up:
6031 if (!extractFloat128Sign(z)) {
6032 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6034 break;
6035 case float_round_down:
6036 if (extractFloat128Sign(z)) {
6037 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6039 break;
6040 default:
6041 abort();
6043 z.low &= ~ roundBitsMask;
6045 else {
6046 if ( aExp < 0x3FFF ) {
6047 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6048 status->float_exception_flags |= float_flag_inexact;
6049 aSign = extractFloat128Sign( a );
6050 switch (status->float_rounding_mode) {
6051 case float_round_nearest_even:
6052 if ( ( aExp == 0x3FFE )
6053 && ( extractFloat128Frac0( a )
6054 | extractFloat128Frac1( a ) )
6056 return packFloat128( aSign, 0x3FFF, 0, 0 );
6058 break;
6059 case float_round_ties_away:
6060 if (aExp == 0x3FFE) {
6061 return packFloat128(aSign, 0x3FFF, 0, 0);
6063 break;
6064 case float_round_down:
6065 return
6066 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6067 : packFloat128( 0, 0, 0, 0 );
6068 case float_round_up:
6069 return
6070 aSign ? packFloat128( 1, 0, 0, 0 )
6071 : packFloat128( 0, 0x3FFF, 0, 0 );
6073 return packFloat128( aSign, 0, 0, 0 );
6075 lastBitMask = 1;
6076 lastBitMask <<= 0x402F - aExp;
6077 roundBitsMask = lastBitMask - 1;
6078 z.low = 0;
6079 z.high = a.high;
6080 switch (status->float_rounding_mode) {
6081 case float_round_nearest_even:
6082 z.high += lastBitMask>>1;
6083 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6084 z.high &= ~ lastBitMask;
6086 break;
6087 case float_round_ties_away:
6088 z.high += lastBitMask>>1;
6089 break;
6090 case float_round_to_zero:
6091 break;
6092 case float_round_up:
6093 if (!extractFloat128Sign(z)) {
6094 z.high |= ( a.low != 0 );
6095 z.high += roundBitsMask;
6097 break;
6098 case float_round_down:
6099 if (extractFloat128Sign(z)) {
6100 z.high |= (a.low != 0);
6101 z.high += roundBitsMask;
6103 break;
6104 default:
6105 abort();
6107 z.high &= ~ roundBitsMask;
6109 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6110 status->float_exception_flags |= float_flag_inexact;
6112 return z;
6116 /*----------------------------------------------------------------------------
6117 | Returns the result of adding the absolute values of the quadruple-precision
6118 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6119 | before being returned. `zSign' is ignored if the result is a NaN.
6120 | The addition is performed according to the IEC/IEEE Standard for Binary
6121 | Floating-Point Arithmetic.
6122 *----------------------------------------------------------------------------*/
6124 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6125 float_status *status)
6127 int32_t aExp, bExp, zExp;
6128 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6129 int32_t expDiff;
6131 aSig1 = extractFloat128Frac1( a );
6132 aSig0 = extractFloat128Frac0( a );
6133 aExp = extractFloat128Exp( a );
6134 bSig1 = extractFloat128Frac1( b );
6135 bSig0 = extractFloat128Frac0( b );
6136 bExp = extractFloat128Exp( b );
6137 expDiff = aExp - bExp;
6138 if ( 0 < expDiff ) {
6139 if ( aExp == 0x7FFF ) {
6140 if (aSig0 | aSig1) {
6141 return propagateFloat128NaN(a, b, status);
6143 return a;
6145 if ( bExp == 0 ) {
6146 --expDiff;
6148 else {
6149 bSig0 |= LIT64( 0x0001000000000000 );
6151 shift128ExtraRightJamming(
6152 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6153 zExp = aExp;
6155 else if ( expDiff < 0 ) {
6156 if ( bExp == 0x7FFF ) {
6157 if (bSig0 | bSig1) {
6158 return propagateFloat128NaN(a, b, status);
6160 return packFloat128( zSign, 0x7FFF, 0, 0 );
6162 if ( aExp == 0 ) {
6163 ++expDiff;
6165 else {
6166 aSig0 |= LIT64( 0x0001000000000000 );
6168 shift128ExtraRightJamming(
6169 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6170 zExp = bExp;
6172 else {
6173 if ( aExp == 0x7FFF ) {
6174 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6175 return propagateFloat128NaN(a, b, status);
6177 return a;
6179 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6180 if ( aExp == 0 ) {
6181 if (status->flush_to_zero) {
6182 if (zSig0 | zSig1) {
6183 float_raise(float_flag_output_denormal, status);
6185 return packFloat128(zSign, 0, 0, 0);
6187 return packFloat128( zSign, 0, zSig0, zSig1 );
6189 zSig2 = 0;
6190 zSig0 |= LIT64( 0x0002000000000000 );
6191 zExp = aExp;
6192 goto shiftRight1;
6194 aSig0 |= LIT64( 0x0001000000000000 );
6195 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6196 --zExp;
6197 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6198 ++zExp;
6199 shiftRight1:
6200 shift128ExtraRightJamming(
6201 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6202 roundAndPack:
6203 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6207 /*----------------------------------------------------------------------------
6208 | Returns the result of subtracting the absolute values of the quadruple-
6209 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6210 | difference is negated before being returned. `zSign' is ignored if the
6211 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6212 | Standard for Binary Floating-Point Arithmetic.
6213 *----------------------------------------------------------------------------*/
6215 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6216 float_status *status)
6218 int32_t aExp, bExp, zExp;
6219 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6220 int32_t expDiff;
6222 aSig1 = extractFloat128Frac1( a );
6223 aSig0 = extractFloat128Frac0( a );
6224 aExp = extractFloat128Exp( a );
6225 bSig1 = extractFloat128Frac1( b );
6226 bSig0 = extractFloat128Frac0( b );
6227 bExp = extractFloat128Exp( b );
6228 expDiff = aExp - bExp;
6229 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6230 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6231 if ( 0 < expDiff ) goto aExpBigger;
6232 if ( expDiff < 0 ) goto bExpBigger;
6233 if ( aExp == 0x7FFF ) {
6234 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6235 return propagateFloat128NaN(a, b, status);
6237 float_raise(float_flag_invalid, status);
6238 return float128_default_nan(status);
6240 if ( aExp == 0 ) {
6241 aExp = 1;
6242 bExp = 1;
6244 if ( bSig0 < aSig0 ) goto aBigger;
6245 if ( aSig0 < bSig0 ) goto bBigger;
6246 if ( bSig1 < aSig1 ) goto aBigger;
6247 if ( aSig1 < bSig1 ) goto bBigger;
6248 return packFloat128(status->float_rounding_mode == float_round_down,
6249 0, 0, 0);
6250 bExpBigger:
6251 if ( bExp == 0x7FFF ) {
6252 if (bSig0 | bSig1) {
6253 return propagateFloat128NaN(a, b, status);
6255 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6257 if ( aExp == 0 ) {
6258 ++expDiff;
6260 else {
6261 aSig0 |= LIT64( 0x4000000000000000 );
6263 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6264 bSig0 |= LIT64( 0x4000000000000000 );
6265 bBigger:
6266 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6267 zExp = bExp;
6268 zSign ^= 1;
6269 goto normalizeRoundAndPack;
6270 aExpBigger:
6271 if ( aExp == 0x7FFF ) {
6272 if (aSig0 | aSig1) {
6273 return propagateFloat128NaN(a, b, status);
6275 return a;
6277 if ( bExp == 0 ) {
6278 --expDiff;
6280 else {
6281 bSig0 |= LIT64( 0x4000000000000000 );
6283 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6284 aSig0 |= LIT64( 0x4000000000000000 );
6285 aBigger:
6286 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6287 zExp = aExp;
6288 normalizeRoundAndPack:
6289 --zExp;
6290 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6291 status);
6295 /*----------------------------------------------------------------------------
6296 | Returns the result of adding the quadruple-precision floating-point values
6297 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6298 | for Binary Floating-Point Arithmetic.
6299 *----------------------------------------------------------------------------*/
6301 float128 float128_add(float128 a, float128 b, float_status *status)
6303 flag aSign, bSign;
6305 aSign = extractFloat128Sign( a );
6306 bSign = extractFloat128Sign( b );
6307 if ( aSign == bSign ) {
6308 return addFloat128Sigs(a, b, aSign, status);
6310 else {
6311 return subFloat128Sigs(a, b, aSign, status);
6316 /*----------------------------------------------------------------------------
6317 | Returns the result of subtracting the quadruple-precision floating-point
6318 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6319 | Standard for Binary Floating-Point Arithmetic.
6320 *----------------------------------------------------------------------------*/
6322 float128 float128_sub(float128 a, float128 b, float_status *status)
6324 flag aSign, bSign;
6326 aSign = extractFloat128Sign( a );
6327 bSign = extractFloat128Sign( b );
6328 if ( aSign == bSign ) {
6329 return subFloat128Sigs(a, b, aSign, status);
6331 else {
6332 return addFloat128Sigs(a, b, aSign, status);
6337 /*----------------------------------------------------------------------------
6338 | Returns the result of multiplying the quadruple-precision floating-point
6339 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6340 | Standard for Binary Floating-Point Arithmetic.
6341 *----------------------------------------------------------------------------*/
6343 float128 float128_mul(float128 a, float128 b, float_status *status)
6345 flag aSign, bSign, zSign;
6346 int32_t aExp, bExp, zExp;
6347 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6349 aSig1 = extractFloat128Frac1( a );
6350 aSig0 = extractFloat128Frac0( a );
6351 aExp = extractFloat128Exp( a );
6352 aSign = extractFloat128Sign( a );
6353 bSig1 = extractFloat128Frac1( b );
6354 bSig0 = extractFloat128Frac0( b );
6355 bExp = extractFloat128Exp( b );
6356 bSign = extractFloat128Sign( b );
6357 zSign = aSign ^ bSign;
6358 if ( aExp == 0x7FFF ) {
6359 if ( ( aSig0 | aSig1 )
6360 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6361 return propagateFloat128NaN(a, b, status);
6363 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6364 return packFloat128( zSign, 0x7FFF, 0, 0 );
6366 if ( bExp == 0x7FFF ) {
6367 if (bSig0 | bSig1) {
6368 return propagateFloat128NaN(a, b, status);
6370 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6371 invalid:
6372 float_raise(float_flag_invalid, status);
6373 return float128_default_nan(status);
6375 return packFloat128( zSign, 0x7FFF, 0, 0 );
6377 if ( aExp == 0 ) {
6378 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6379 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6381 if ( bExp == 0 ) {
6382 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6383 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6385 zExp = aExp + bExp - 0x4000;
6386 aSig0 |= LIT64( 0x0001000000000000 );
6387 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6388 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6389 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6390 zSig2 |= ( zSig3 != 0 );
6391 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6392 shift128ExtraRightJamming(
6393 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6394 ++zExp;
6396 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6400 /*----------------------------------------------------------------------------
6401 | Returns the result of dividing the quadruple-precision floating-point value
6402 | `a' by the corresponding value `b'. The operation is performed according to
6403 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6404 *----------------------------------------------------------------------------*/
6406 float128 float128_div(float128 a, float128 b, float_status *status)
6408 flag aSign, bSign, zSign;
6409 int32_t aExp, bExp, zExp;
6410 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6411 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6413 aSig1 = extractFloat128Frac1( a );
6414 aSig0 = extractFloat128Frac0( a );
6415 aExp = extractFloat128Exp( a );
6416 aSign = extractFloat128Sign( a );
6417 bSig1 = extractFloat128Frac1( b );
6418 bSig0 = extractFloat128Frac0( b );
6419 bExp = extractFloat128Exp( b );
6420 bSign = extractFloat128Sign( b );
6421 zSign = aSign ^ bSign;
6422 if ( aExp == 0x7FFF ) {
6423 if (aSig0 | aSig1) {
6424 return propagateFloat128NaN(a, b, status);
6426 if ( bExp == 0x7FFF ) {
6427 if (bSig0 | bSig1) {
6428 return propagateFloat128NaN(a, b, status);
6430 goto invalid;
6432 return packFloat128( zSign, 0x7FFF, 0, 0 );
6434 if ( bExp == 0x7FFF ) {
6435 if (bSig0 | bSig1) {
6436 return propagateFloat128NaN(a, b, status);
6438 return packFloat128( zSign, 0, 0, 0 );
6440 if ( bExp == 0 ) {
6441 if ( ( bSig0 | bSig1 ) == 0 ) {
6442 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6443 invalid:
6444 float_raise(float_flag_invalid, status);
6445 return float128_default_nan(status);
6447 float_raise(float_flag_divbyzero, status);
6448 return packFloat128( zSign, 0x7FFF, 0, 0 );
6450 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6452 if ( aExp == 0 ) {
6453 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6454 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6456 zExp = aExp - bExp + 0x3FFD;
6457 shortShift128Left(
6458 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6459 shortShift128Left(
6460 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6461 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6462 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6463 ++zExp;
6465 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6466 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6467 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6468 while ( (int64_t) rem0 < 0 ) {
6469 --zSig0;
6470 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6472 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6473 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6474 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6475 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6476 while ( (int64_t) rem1 < 0 ) {
6477 --zSig1;
6478 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6480 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6482 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6483 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6487 /*----------------------------------------------------------------------------
6488 | Returns the remainder of the quadruple-precision floating-point value `a'
6489 | with respect to the corresponding value `b'. The operation is performed
6490 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6491 *----------------------------------------------------------------------------*/
6493 float128 float128_rem(float128 a, float128 b, float_status *status)
6495 flag aSign, zSign;
6496 int32_t aExp, bExp, expDiff;
6497 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6498 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6499 int64_t sigMean0;
6501 aSig1 = extractFloat128Frac1( a );
6502 aSig0 = extractFloat128Frac0( a );
6503 aExp = extractFloat128Exp( a );
6504 aSign = extractFloat128Sign( a );
6505 bSig1 = extractFloat128Frac1( b );
6506 bSig0 = extractFloat128Frac0( b );
6507 bExp = extractFloat128Exp( b );
6508 if ( aExp == 0x7FFF ) {
6509 if ( ( aSig0 | aSig1 )
6510 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6511 return propagateFloat128NaN(a, b, status);
6513 goto invalid;
6515 if ( bExp == 0x7FFF ) {
6516 if (bSig0 | bSig1) {
6517 return propagateFloat128NaN(a, b, status);
6519 return a;
6521 if ( bExp == 0 ) {
6522 if ( ( bSig0 | bSig1 ) == 0 ) {
6523 invalid:
6524 float_raise(float_flag_invalid, status);
6525 return float128_default_nan(status);
6527 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6529 if ( aExp == 0 ) {
6530 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6531 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6533 expDiff = aExp - bExp;
6534 if ( expDiff < -1 ) return a;
6535 shortShift128Left(
6536 aSig0 | LIT64( 0x0001000000000000 ),
6537 aSig1,
6538 15 - ( expDiff < 0 ),
6539 &aSig0,
6540 &aSig1
6542 shortShift128Left(
6543 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6544 q = le128( bSig0, bSig1, aSig0, aSig1 );
6545 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6546 expDiff -= 64;
6547 while ( 0 < expDiff ) {
6548 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6549 q = ( 4 < q ) ? q - 4 : 0;
6550 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6551 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6552 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6553 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6554 expDiff -= 61;
6556 if ( -64 < expDiff ) {
6557 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6558 q = ( 4 < q ) ? q - 4 : 0;
6559 q >>= - expDiff;
6560 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6561 expDiff += 52;
6562 if ( expDiff < 0 ) {
6563 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6565 else {
6566 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6568 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6569 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6571 else {
6572 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6573 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6575 do {
6576 alternateASig0 = aSig0;
6577 alternateASig1 = aSig1;
6578 ++q;
6579 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6580 } while ( 0 <= (int64_t) aSig0 );
6581 add128(
6582 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6583 if ( ( sigMean0 < 0 )
6584 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6585 aSig0 = alternateASig0;
6586 aSig1 = alternateASig1;
6588 zSign = ( (int64_t) aSig0 < 0 );
6589 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6590 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6591 status);
6594 /*----------------------------------------------------------------------------
6595 | Returns the square root of the quadruple-precision floating-point value `a'.
6596 | The operation is performed according to the IEC/IEEE Standard for Binary
6597 | Floating-Point Arithmetic.
6598 *----------------------------------------------------------------------------*/
6600 float128 float128_sqrt(float128 a, float_status *status)
6602 flag aSign;
6603 int32_t aExp, zExp;
6604 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6605 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6607 aSig1 = extractFloat128Frac1( a );
6608 aSig0 = extractFloat128Frac0( a );
6609 aExp = extractFloat128Exp( a );
6610 aSign = extractFloat128Sign( a );
6611 if ( aExp == 0x7FFF ) {
6612 if (aSig0 | aSig1) {
6613 return propagateFloat128NaN(a, a, status);
6615 if ( ! aSign ) return a;
6616 goto invalid;
6618 if ( aSign ) {
6619 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6620 invalid:
6621 float_raise(float_flag_invalid, status);
6622 return float128_default_nan(status);
6624 if ( aExp == 0 ) {
6625 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6626 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6628 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6629 aSig0 |= LIT64( 0x0001000000000000 );
6630 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6631 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6632 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6633 doubleZSig0 = zSig0<<1;
6634 mul64To128( zSig0, zSig0, &term0, &term1 );
6635 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6636 while ( (int64_t) rem0 < 0 ) {
6637 --zSig0;
6638 doubleZSig0 -= 2;
6639 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6641 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6642 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6643 if ( zSig1 == 0 ) zSig1 = 1;
6644 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6645 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6646 mul64To128( zSig1, zSig1, &term2, &term3 );
6647 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6648 while ( (int64_t) rem1 < 0 ) {
6649 --zSig1;
6650 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6651 term3 |= 1;
6652 term2 |= doubleZSig0;
6653 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6655 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6657 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6658 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6662 /*----------------------------------------------------------------------------
6663 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6664 | the corresponding value `b', and 0 otherwise. The invalid exception is
6665 | raised if either operand is a NaN. Otherwise, the comparison is performed
6666 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6667 *----------------------------------------------------------------------------*/
6669 int float128_eq(float128 a, float128 b, float_status *status)
6672 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6673 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6674 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6675 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6677 float_raise(float_flag_invalid, status);
6678 return 0;
6680 return
6681 ( a.low == b.low )
6682 && ( ( a.high == b.high )
6683 || ( ( a.low == 0 )
6684 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6689 /*----------------------------------------------------------------------------
6690 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6691 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6692 | exception is raised if either operand is a NaN. The comparison is performed
6693 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6694 *----------------------------------------------------------------------------*/
6696 int float128_le(float128 a, float128 b, float_status *status)
6698 flag aSign, bSign;
6700 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6701 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6702 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6703 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6705 float_raise(float_flag_invalid, status);
6706 return 0;
6708 aSign = extractFloat128Sign( a );
6709 bSign = extractFloat128Sign( b );
6710 if ( aSign != bSign ) {
6711 return
6712 aSign
6713 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6714 == 0 );
6716 return
6717 aSign ? le128( b.high, b.low, a.high, a.low )
6718 : le128( a.high, a.low, b.high, b.low );
6722 /*----------------------------------------------------------------------------
6723 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6724 | the corresponding value `b', and 0 otherwise. The invalid exception is
6725 | raised if either operand is a NaN. The comparison is performed according
6726 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6727 *----------------------------------------------------------------------------*/
6729 int float128_lt(float128 a, float128 b, float_status *status)
6731 flag aSign, bSign;
6733 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6734 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6735 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6736 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6738 float_raise(float_flag_invalid, status);
6739 return 0;
6741 aSign = extractFloat128Sign( a );
6742 bSign = extractFloat128Sign( b );
6743 if ( aSign != bSign ) {
6744 return
6745 aSign
6746 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6747 != 0 );
6749 return
6750 aSign ? lt128( b.high, b.low, a.high, a.low )
6751 : lt128( a.high, a.low, b.high, b.low );
6755 /*----------------------------------------------------------------------------
6756 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6757 | be compared, and 0 otherwise. The invalid exception is raised if either
6758 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6759 | Standard for Binary Floating-Point Arithmetic.
6760 *----------------------------------------------------------------------------*/
6762 int float128_unordered(float128 a, float128 b, float_status *status)
6764 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6765 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6766 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6767 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6769 float_raise(float_flag_invalid, status);
6770 return 1;
6772 return 0;
6775 /*----------------------------------------------------------------------------
6776 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6777 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6778 | exception. The comparison is performed according to the IEC/IEEE Standard
6779 | for Binary Floating-Point Arithmetic.
6780 *----------------------------------------------------------------------------*/
6782 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6785 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6786 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6787 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6788 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6790 if (float128_is_signaling_nan(a, status)
6791 || float128_is_signaling_nan(b, status)) {
6792 float_raise(float_flag_invalid, status);
6794 return 0;
6796 return
6797 ( a.low == b.low )
6798 && ( ( a.high == b.high )
6799 || ( ( a.low == 0 )
6800 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6805 /*----------------------------------------------------------------------------
6806 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6807 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6808 | cause an exception. Otherwise, the comparison is performed according to the
6809 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6810 *----------------------------------------------------------------------------*/
6812 int float128_le_quiet(float128 a, float128 b, float_status *status)
6814 flag aSign, bSign;
6816 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6817 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6818 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6819 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6821 if (float128_is_signaling_nan(a, status)
6822 || float128_is_signaling_nan(b, status)) {
6823 float_raise(float_flag_invalid, status);
6825 return 0;
6827 aSign = extractFloat128Sign( a );
6828 bSign = extractFloat128Sign( b );
6829 if ( aSign != bSign ) {
6830 return
6831 aSign
6832 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6833 == 0 );
6835 return
6836 aSign ? le128( b.high, b.low, a.high, a.low )
6837 : le128( a.high, a.low, b.high, b.low );
6841 /*----------------------------------------------------------------------------
6842 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6843 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6844 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6845 | Standard for Binary Floating-Point Arithmetic.
6846 *----------------------------------------------------------------------------*/
6848 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6850 flag aSign, bSign;
6852 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6853 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6854 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6855 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6857 if (float128_is_signaling_nan(a, status)
6858 || float128_is_signaling_nan(b, status)) {
6859 float_raise(float_flag_invalid, status);
6861 return 0;
6863 aSign = extractFloat128Sign( a );
6864 bSign = extractFloat128Sign( b );
6865 if ( aSign != bSign ) {
6866 return
6867 aSign
6868 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6869 != 0 );
6871 return
6872 aSign ? lt128( b.high, b.low, a.high, a.low )
6873 : lt128( a.high, a.low, b.high, b.low );
6877 /*----------------------------------------------------------------------------
6878 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6879 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6880 | comparison is performed according to the IEC/IEEE Standard for Binary
6881 | Floating-Point Arithmetic.
6882 *----------------------------------------------------------------------------*/
6884 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6886 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6887 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6888 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6889 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6891 if (float128_is_signaling_nan(a, status)
6892 || float128_is_signaling_nan(b, status)) {
6893 float_raise(float_flag_invalid, status);
6895 return 1;
6897 return 0;
6900 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6901 int is_quiet, float_status *status)
6903 flag aSign, bSign;
6905 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6906 float_raise(float_flag_invalid, status);
6907 return float_relation_unordered;
6909 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6910 ( extractFloatx80Frac( a )<<1 ) ) ||
6911 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6912 ( extractFloatx80Frac( b )<<1 ) )) {
6913 if (!is_quiet ||
6914 floatx80_is_signaling_nan(a, status) ||
6915 floatx80_is_signaling_nan(b, status)) {
6916 float_raise(float_flag_invalid, status);
6918 return float_relation_unordered;
6920 aSign = extractFloatx80Sign( a );
6921 bSign = extractFloatx80Sign( b );
6922 if ( aSign != bSign ) {
6924 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6925 ( ( a.low | b.low ) == 0 ) ) {
6926 /* zero case */
6927 return float_relation_equal;
6928 } else {
6929 return 1 - (2 * aSign);
6931 } else {
6932 if (a.low == b.low && a.high == b.high) {
6933 return float_relation_equal;
6934 } else {
6935 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6940 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6942 return floatx80_compare_internal(a, b, 0, status);
6945 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6947 return floatx80_compare_internal(a, b, 1, status);
6950 static inline int float128_compare_internal(float128 a, float128 b,
6951 int is_quiet, float_status *status)
6953 flag aSign, bSign;
6955 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6956 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6957 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6958 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6959 if (!is_quiet ||
6960 float128_is_signaling_nan(a, status) ||
6961 float128_is_signaling_nan(b, status)) {
6962 float_raise(float_flag_invalid, status);
6964 return float_relation_unordered;
6966 aSign = extractFloat128Sign( a );
6967 bSign = extractFloat128Sign( b );
6968 if ( aSign != bSign ) {
6969 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6970 /* zero case */
6971 return float_relation_equal;
6972 } else {
6973 return 1 - (2 * aSign);
6975 } else {
6976 if (a.low == b.low && a.high == b.high) {
6977 return float_relation_equal;
6978 } else {
6979 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6984 int float128_compare(float128 a, float128 b, float_status *status)
6986 return float128_compare_internal(a, b, 0, status);
6989 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6991 return float128_compare_internal(a, b, 1, status);
6994 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6996 flag aSign;
6997 int32_t aExp;
6998 uint64_t aSig;
7000 if (floatx80_invalid_encoding(a)) {
7001 float_raise(float_flag_invalid, status);
7002 return floatx80_default_nan(status);
7004 aSig = extractFloatx80Frac( a );
7005 aExp = extractFloatx80Exp( a );
7006 aSign = extractFloatx80Sign( a );
7008 if ( aExp == 0x7FFF ) {
7009 if ( aSig<<1 ) {
7010 return propagateFloatx80NaN(a, a, status);
7012 return a;
7015 if (aExp == 0) {
7016 if (aSig == 0) {
7017 return a;
7019 aExp++;
7022 if (n > 0x10000) {
7023 n = 0x10000;
7024 } else if (n < -0x10000) {
7025 n = -0x10000;
7028 aExp += n;
7029 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7030 aSign, aExp, aSig, 0, status);
7033 float128 float128_scalbn(float128 a, int n, float_status *status)
7035 flag aSign;
7036 int32_t aExp;
7037 uint64_t aSig0, aSig1;
7039 aSig1 = extractFloat128Frac1( a );
7040 aSig0 = extractFloat128Frac0( a );
7041 aExp = extractFloat128Exp( a );
7042 aSign = extractFloat128Sign( a );
7043 if ( aExp == 0x7FFF ) {
7044 if ( aSig0 | aSig1 ) {
7045 return propagateFloat128NaN(a, a, status);
7047 return a;
7049 if (aExp != 0) {
7050 aSig0 |= LIT64( 0x0001000000000000 );
7051 } else if (aSig0 == 0 && aSig1 == 0) {
7052 return a;
7053 } else {
7054 aExp++;
7057 if (n > 0x10000) {
7058 n = 0x10000;
7059 } else if (n < -0x10000) {
7060 n = -0x10000;
7063 aExp += n - 1;
7064 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7065 , status);