4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a
)
114 return float16_val(a
) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int extractFloat16Exp(float16 a
)
123 return (float16_val(a
) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag
extractFloat16Sign(float16 a
)
132 return float16_val(a
)>>15;
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
139 static inline uint32_t extractFloat32Frac(float32 a
)
141 return float32_val(a
) & 0x007FFFFF;
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
148 static inline int extractFloat32Exp(float32 a
)
150 return (float32_val(a
) >> 23) & 0xFF;
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
157 static inline flag
extractFloat32Sign(float32 a
)
159 return float32_val(a
) >> 31;
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
166 static inline uint64_t extractFloat64Frac(float64 a
)
168 return float64_val(a
) & LIT64(0x000FFFFFFFFFFFFF);
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 static inline int extractFloat64Exp(float64 a
)
177 return (float64_val(a
) >> 52) & 0x7FF;
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
184 static inline flag
extractFloat64Sign(float64 a
)
186 return float64_val(a
) >> 63;
190 * Classify a floating point number. Everything above float_class_qnan
191 * is a NaN so cls >= float_class_qnan is any NaN.
194 typedef enum __attribute__ ((__packed__
)) {
195 float_class_unclassified
,
199 float_class_qnan
, /* all NaNs from here */
202 float_class_msnan
, /* maybe silenced */
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
223 #define DECOMPOSED_BINARY_POINT (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
227 /* Structure holding all of the relevant parameters for a format.
228 * exp_size: the size of the exponent field
229 * exp_bias: the offset applied to the exponent field
230 * exp_max: the maximum normalised exponent
231 * frac_size: the size of the fraction field
232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233 * The following are computed based the size of fraction
234 * frac_lsb: least significant bit of fraction
235 * fram_lsbm1: the bit bellow the least significant bit (for rounding)
236 * round_mask/roundeven_mask: masks used for rounding
247 uint64_t roundeven_mask
;
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F) \
253 .exp_bias = ((1 << E) - 1) >> 1, \
254 .exp_max = (1 << E) - 1, \
256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
262 static const FloatFmt float16_params
= {
266 static const FloatFmt float32_params
= {
270 static const FloatFmt float64_params
= {
274 /* Unpack a float to parts, but do not canonicalize. */
275 static inline FloatParts
unpack_raw(FloatFmt fmt
, uint64_t raw
)
277 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
279 return (FloatParts
) {
280 .cls
= float_class_unclassified
,
281 .sign
= extract64(raw
, sign_pos
, 1),
282 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
283 .frac
= extract64(raw
, 0, fmt
.frac_size
),
287 static inline FloatParts
float16_unpack_raw(float16 f
)
289 return unpack_raw(float16_params
, f
);
292 static inline FloatParts
float32_unpack_raw(float32 f
)
294 return unpack_raw(float32_params
, f
);
297 static inline FloatParts
float64_unpack_raw(float64 f
)
299 return unpack_raw(float64_params
, f
);
302 /* Pack a float from parts, but do not canonicalize. */
303 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts p
)
305 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
306 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
307 return deposit64(ret
, sign_pos
, 1, p
.sign
);
310 static inline float16
float16_pack_raw(FloatParts p
)
312 return make_float16(pack_raw(float16_params
, p
));
315 static inline float32
float32_pack_raw(FloatParts p
)
317 return make_float32(pack_raw(float32_params
, p
));
320 static inline float64
float64_pack_raw(FloatParts p
)
322 return make_float64(pack_raw(float64_params
, p
));
325 /* Canonicalize EXP and FRAC, setting CLS. */
326 static FloatParts
canonicalize(FloatParts part
, const FloatFmt
*parm
,
327 float_status
*status
)
329 if (part
.exp
== parm
->exp_max
) {
330 if (part
.frac
== 0) {
331 part
.cls
= float_class_inf
;
333 #ifdef NO_SIGNALING_NANS
334 part
.cls
= float_class_qnan
;
336 int64_t msb
= part
.frac
<< (parm
->frac_shift
+ 2);
337 if ((msb
< 0) == status
->snan_bit_is_one
) {
338 part
.cls
= float_class_snan
;
340 part
.cls
= float_class_qnan
;
344 } else if (part
.exp
== 0) {
345 if (likely(part
.frac
== 0)) {
346 part
.cls
= float_class_zero
;
347 } else if (status
->flush_inputs_to_zero
) {
348 float_raise(float_flag_input_denormal
, status
);
349 part
.cls
= float_class_zero
;
352 int shift
= clz64(part
.frac
) - 1;
353 part
.cls
= float_class_normal
;
354 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
358 part
.cls
= float_class_normal
;
359 part
.exp
-= parm
->exp_bias
;
360 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
365 /* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
371 static FloatParts
round_canonical(FloatParts p
, float_status
*s
,
372 const FloatFmt
*parm
)
374 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
375 const uint64_t round_mask
= parm
->round_mask
;
376 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
377 const int exp_max
= parm
->exp_max
;
378 const int frac_shift
= parm
->frac_shift
;
387 case float_class_normal
:
388 switch (s
->float_rounding_mode
) {
389 case float_round_nearest_even
:
390 overflow_norm
= false;
391 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
393 case float_round_ties_away
:
394 overflow_norm
= false;
397 case float_round_to_zero
:
398 overflow_norm
= true;
402 inc
= p
.sign
? 0 : round_mask
;
403 overflow_norm
= p
.sign
;
405 case float_round_down
:
406 inc
= p
.sign
? round_mask
: 0;
407 overflow_norm
= !p
.sign
;
410 g_assert_not_reached();
413 exp
+= parm
->exp_bias
;
414 if (likely(exp
> 0)) {
415 if (frac
& round_mask
) {
416 flags
|= float_flag_inexact
;
418 if (frac
& DECOMPOSED_OVERFLOW_BIT
) {
425 if (unlikely(exp
>= exp_max
)) {
426 flags
|= float_flag_overflow
| float_flag_inexact
;
431 p
.cls
= float_class_inf
;
435 } else if (s
->flush_to_zero
) {
436 flags
|= float_flag_output_denormal
;
437 p
.cls
= float_class_zero
;
440 bool is_tiny
= (s
->float_detect_tininess
441 == float_tininess_before_rounding
)
443 || !((frac
+ inc
) & DECOMPOSED_OVERFLOW_BIT
);
445 shift64RightJamming(frac
, 1 - exp
, &frac
);
446 if (frac
& round_mask
) {
447 /* Need to recompute round-to-even. */
448 if (s
->float_rounding_mode
== float_round_nearest_even
) {
449 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
452 flags
|= float_flag_inexact
;
456 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
459 if (is_tiny
&& (flags
& float_flag_inexact
)) {
460 flags
|= float_flag_underflow
;
462 if (exp
== 0 && frac
== 0) {
463 p
.cls
= float_class_zero
;
468 case float_class_zero
:
474 case float_class_inf
:
480 case float_class_qnan
:
481 case float_class_snan
:
486 g_assert_not_reached();
489 float_raise(flags
, s
);
495 static FloatParts
float16_unpack_canonical(float16 f
, float_status
*s
)
497 return canonicalize(float16_unpack_raw(f
), &float16_params
, s
);
500 static float16
float16_round_pack_canonical(FloatParts p
, float_status
*s
)
503 case float_class_dnan
:
504 return float16_default_nan(s
);
505 case float_class_msnan
:
506 return float16_maybe_silence_nan(float16_pack_raw(p
), s
);
508 p
= round_canonical(p
, s
, &float16_params
);
509 return float16_pack_raw(p
);
513 static FloatParts
float32_unpack_canonical(float32 f
, float_status
*s
)
515 return canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
518 static float32
float32_round_pack_canonical(FloatParts p
, float_status
*s
)
521 case float_class_dnan
:
522 return float32_default_nan(s
);
523 case float_class_msnan
:
524 return float32_maybe_silence_nan(float32_pack_raw(p
), s
);
526 p
= round_canonical(p
, s
, &float32_params
);
527 return float32_pack_raw(p
);
531 static FloatParts
float64_unpack_canonical(float64 f
, float_status
*s
)
533 return canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
536 static float64
float64_round_pack_canonical(FloatParts p
, float_status
*s
)
539 case float_class_dnan
:
540 return float64_default_nan(s
);
541 case float_class_msnan
:
542 return float64_maybe_silence_nan(float64_pack_raw(p
), s
);
544 p
= round_canonical(p
, s
, &float64_params
);
545 return float64_pack_raw(p
);
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c
)
552 return unlikely(c
>= float_class_qnan
);
554 static bool is_snan(FloatClass c
)
556 return c
== float_class_snan
;
558 static bool is_qnan(FloatClass c
)
560 return c
== float_class_qnan
;
563 static FloatParts
return_nan(FloatParts a
, float_status
*s
)
566 case float_class_snan
:
567 s
->float_exception_flags
|= float_flag_invalid
;
568 a
.cls
= float_class_msnan
;
570 case float_class_qnan
:
571 if (s
->default_nan_mode
) {
572 a
.cls
= float_class_dnan
;
577 g_assert_not_reached();
582 static FloatParts
pick_nan(FloatParts a
, FloatParts b
, float_status
*s
)
584 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
585 s
->float_exception_flags
|= float_flag_invalid
;
588 if (s
->default_nan_mode
) {
589 a
.cls
= float_class_dnan
;
591 if (pickNaN(is_qnan(a
.cls
), is_snan(a
.cls
),
592 is_qnan(b
.cls
), is_snan(b
.cls
),
594 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
))) {
597 a
.cls
= float_class_msnan
;
602 static FloatParts
pick_nan_muladd(FloatParts a
, FloatParts b
, FloatParts c
,
603 bool inf_zero
, float_status
*s
)
605 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
606 s
->float_exception_flags
|= float_flag_invalid
;
609 if (s
->default_nan_mode
) {
610 a
.cls
= float_class_dnan
;
612 switch (pickNaNMulAdd(is_qnan(a
.cls
), is_snan(a
.cls
),
613 is_qnan(b
.cls
), is_snan(b
.cls
),
614 is_qnan(c
.cls
), is_snan(c
.cls
),
625 a
.cls
= float_class_dnan
;
628 g_assert_not_reached();
631 a
.cls
= float_class_msnan
;
637 * Returns the result of adding or subtracting the values of the
638 * floating-point values `a' and `b'. The operation is performed
639 * according to the IEC/IEEE Standard for Binary Floating-Point
643 static FloatParts
addsub_floats(FloatParts a
, FloatParts b
, bool subtract
,
646 bool a_sign
= a
.sign
;
647 bool b_sign
= b
.sign
^ subtract
;
649 if (a_sign
!= b_sign
) {
652 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
653 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
654 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
655 a
.frac
= a
.frac
- b
.frac
;
657 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
658 a
.frac
= b
.frac
- a
.frac
;
664 a
.cls
= float_class_zero
;
665 a
.sign
= s
->float_rounding_mode
== float_round_down
;
667 int shift
= clz64(a
.frac
) - 1;
668 a
.frac
= a
.frac
<< shift
;
669 a
.exp
= a
.exp
- shift
;
674 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
675 return pick_nan(a
, b
, s
);
677 if (a
.cls
== float_class_inf
) {
678 if (b
.cls
== float_class_inf
) {
679 float_raise(float_flag_invalid
, s
);
680 a
.cls
= float_class_dnan
;
684 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
685 a
.sign
= s
->float_rounding_mode
== float_round_down
;
688 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
692 if (b
.cls
== float_class_zero
) {
697 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
699 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
700 } else if (a
.exp
< b
.exp
) {
701 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
705 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
711 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
712 return pick_nan(a
, b
, s
);
714 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
717 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
722 g_assert_not_reached();
726 * Returns the result of adding or subtracting the floating-point
727 * values `a' and `b'. The operation is performed according to the
728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
731 float16
__attribute__((flatten
)) float16_add(float16 a
, float16 b
,
732 float_status
*status
)
734 FloatParts pa
= float16_unpack_canonical(a
, status
);
735 FloatParts pb
= float16_unpack_canonical(b
, status
);
736 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
738 return float16_round_pack_canonical(pr
, status
);
741 float32
__attribute__((flatten
)) float32_add(float32 a
, float32 b
,
742 float_status
*status
)
744 FloatParts pa
= float32_unpack_canonical(a
, status
);
745 FloatParts pb
= float32_unpack_canonical(b
, status
);
746 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
748 return float32_round_pack_canonical(pr
, status
);
751 float64
__attribute__((flatten
)) float64_add(float64 a
, float64 b
,
752 float_status
*status
)
754 FloatParts pa
= float64_unpack_canonical(a
, status
);
755 FloatParts pb
= float64_unpack_canonical(b
, status
);
756 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
758 return float64_round_pack_canonical(pr
, status
);
761 float16
__attribute__((flatten
)) float16_sub(float16 a
, float16 b
,
762 float_status
*status
)
764 FloatParts pa
= float16_unpack_canonical(a
, status
);
765 FloatParts pb
= float16_unpack_canonical(b
, status
);
766 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
768 return float16_round_pack_canonical(pr
, status
);
771 float32
__attribute__((flatten
)) float32_sub(float32 a
, float32 b
,
772 float_status
*status
)
774 FloatParts pa
= float32_unpack_canonical(a
, status
);
775 FloatParts pb
= float32_unpack_canonical(b
, status
);
776 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
778 return float32_round_pack_canonical(pr
, status
);
781 float64
__attribute__((flatten
)) float64_sub(float64 a
, float64 b
,
782 float_status
*status
)
784 FloatParts pa
= float64_unpack_canonical(a
, status
);
785 FloatParts pb
= float64_unpack_canonical(b
, status
);
786 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
788 return float64_round_pack_canonical(pr
, status
);
792 * Returns the result of multiplying the floating-point values `a' and
793 * `b'. The operation is performed according to the IEC/IEEE Standard
794 * for Binary Floating-Point Arithmetic.
797 static FloatParts
mul_floats(FloatParts a
, FloatParts b
, float_status
*s
)
799 bool sign
= a
.sign
^ b
.sign
;
801 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
803 int exp
= a
.exp
+ b
.exp
;
805 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
806 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
807 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
808 shift64RightJamming(lo
, 1, &lo
);
818 /* handle all the NaN cases */
819 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
820 return pick_nan(a
, b
, s
);
822 /* Inf * Zero == NaN */
823 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
824 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
825 s
->float_exception_flags
|= float_flag_invalid
;
826 a
.cls
= float_class_dnan
;
830 /* Multiply by 0 or Inf */
831 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
835 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
839 g_assert_not_reached();
842 float16
__attribute__((flatten
)) float16_mul(float16 a
, float16 b
,
843 float_status
*status
)
845 FloatParts pa
= float16_unpack_canonical(a
, status
);
846 FloatParts pb
= float16_unpack_canonical(b
, status
);
847 FloatParts pr
= mul_floats(pa
, pb
, status
);
849 return float16_round_pack_canonical(pr
, status
);
852 float32
__attribute__((flatten
)) float32_mul(float32 a
, float32 b
,
853 float_status
*status
)
855 FloatParts pa
= float32_unpack_canonical(a
, status
);
856 FloatParts pb
= float32_unpack_canonical(b
, status
);
857 FloatParts pr
= mul_floats(pa
, pb
, status
);
859 return float32_round_pack_canonical(pr
, status
);
862 float64
__attribute__((flatten
)) float64_mul(float64 a
, float64 b
,
863 float_status
*status
)
865 FloatParts pa
= float64_unpack_canonical(a
, status
);
866 FloatParts pb
= float64_unpack_canonical(b
, status
);
867 FloatParts pr
= mul_floats(pa
, pb
, status
);
869 return float64_round_pack_canonical(pr
, status
);
873 * Returns the result of multiplying the floating-point values `a' and
874 * `b' then adding 'c', with no intermediate rounding step after the
875 * multiplication. The operation is performed according to the
876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877 * The flags argument allows the caller to select negation of the
878 * addend, the intermediate product, or the final result. (The
879 * difference between this and having the caller do a separate
880 * negation is that negating externally will flip the sign bit on
884 static FloatParts
muladd_floats(FloatParts a
, FloatParts b
, FloatParts c
,
885 int flags
, float_status
*s
)
887 bool inf_zero
= ((1 << a
.cls
) | (1 << b
.cls
)) ==
888 ((1 << float_class_inf
) | (1 << float_class_zero
));
890 bool sign_flip
= flags
& float_muladd_negate_result
;
895 /* It is implementation-defined whether the cases of (0,inf,qnan)
896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897 * they return if they do), so we have to hand this information
898 * off to the target-specific pick-a-NaN routine.
900 if (is_nan(a
.cls
) || is_nan(b
.cls
) || is_nan(c
.cls
)) {
901 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
905 s
->float_exception_flags
|= float_flag_invalid
;
906 a
.cls
= float_class_dnan
;
910 if (flags
& float_muladd_negate_c
) {
914 p_sign
= a
.sign
^ b
.sign
;
916 if (flags
& float_muladd_negate_product
) {
920 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_inf
) {
921 p_class
= float_class_inf
;
922 } else if (a
.cls
== float_class_zero
|| b
.cls
== float_class_zero
) {
923 p_class
= float_class_zero
;
925 p_class
= float_class_normal
;
928 if (c
.cls
== float_class_inf
) {
929 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
930 s
->float_exception_flags
|= float_flag_invalid
;
931 a
.cls
= float_class_dnan
;
933 a
.cls
= float_class_inf
;
934 a
.sign
= c
.sign
^ sign_flip
;
939 if (p_class
== float_class_inf
) {
940 a
.cls
= float_class_inf
;
941 a
.sign
= p_sign
^ sign_flip
;
945 if (p_class
== float_class_zero
) {
946 if (c
.cls
== float_class_zero
) {
947 if (p_sign
!= c
.sign
) {
948 p_sign
= s
->float_rounding_mode
== float_round_down
;
951 } else if (flags
& float_muladd_halve_result
) {
958 /* a & b should be normals now... */
959 assert(a
.cls
== float_class_normal
&&
960 b
.cls
== float_class_normal
);
962 p_exp
= a
.exp
+ b
.exp
;
964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
967 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
968 /* binary point now at bit 124 */
970 /* check for overflow */
971 if (hi
& (1ULL << (DECOMPOSED_BINARY_POINT
* 2 + 1 - 64))) {
972 shift128RightJamming(hi
, lo
, 1, &hi
, &lo
);
977 if (c
.cls
== float_class_zero
) {
978 /* move binary point back to 62 */
979 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
981 int exp_diff
= p_exp
- c
.exp
;
982 if (p_sign
== c
.sign
) {
985 shift128RightJamming(hi
, lo
,
986 DECOMPOSED_BINARY_POINT
- exp_diff
,
992 /* shift c to the same binary point as the product (124) */
995 shift128RightJamming(c_hi
, c_lo
,
998 add128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
999 /* move binary point back to 62 */
1000 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1003 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1004 shift64RightJamming(lo
, 1, &lo
);
1010 uint64_t c_hi
, c_lo
;
1011 /* make C binary point match product at bit 124 */
1015 if (exp_diff
<= 0) {
1016 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1019 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1020 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1022 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1027 shift128RightJamming(c_hi
, c_lo
,
1030 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1033 if (hi
== 0 && lo
== 0) {
1034 a
.cls
= float_class_zero
;
1035 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1036 a
.sign
^= sign_flip
;
1043 shift
= clz64(lo
) + 64;
1045 /* Normalizing to a binary point of 124 is the
1046 correct adjust for the exponent. However since we're
1047 shifting, we might as well put the binary point back
1048 at 62 where we really want it. Therefore shift as
1049 if we're leaving 1 bit at the top of the word, but
1050 adjust the exponent as if we're leaving 3 bits. */
1053 lo
= lo
<< (shift
- 64);
1055 hi
= (hi
<< shift
) | (lo
>> (64 - shift
));
1056 lo
= hi
| ((lo
<< shift
) != 0);
1063 if (flags
& float_muladd_halve_result
) {
1067 /* finally prepare our result */
1068 a
.cls
= float_class_normal
;
1069 a
.sign
= p_sign
^ sign_flip
;
1076 float16
__attribute__((flatten
)) float16_muladd(float16 a
, float16 b
, float16 c
,
1077 int flags
, float_status
*status
)
1079 FloatParts pa
= float16_unpack_canonical(a
, status
);
1080 FloatParts pb
= float16_unpack_canonical(b
, status
);
1081 FloatParts pc
= float16_unpack_canonical(c
, status
);
1082 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1084 return float16_round_pack_canonical(pr
, status
);
1087 float32
__attribute__((flatten
)) float32_muladd(float32 a
, float32 b
, float32 c
,
1088 int flags
, float_status
*status
)
1090 FloatParts pa
= float32_unpack_canonical(a
, status
);
1091 FloatParts pb
= float32_unpack_canonical(b
, status
);
1092 FloatParts pc
= float32_unpack_canonical(c
, status
);
1093 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1095 return float32_round_pack_canonical(pr
, status
);
1098 float64
__attribute__((flatten
)) float64_muladd(float64 a
, float64 b
, float64 c
,
1099 int flags
, float_status
*status
)
1101 FloatParts pa
= float64_unpack_canonical(a
, status
);
1102 FloatParts pb
= float64_unpack_canonical(b
, status
);
1103 FloatParts pc
= float64_unpack_canonical(c
, status
);
1104 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1106 return float64_round_pack_canonical(pr
, status
);
1110 * Returns the result of dividing the floating-point value `a' by the
1111 * corresponding value `b'. The operation is performed according to
1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 static FloatParts
div_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1117 bool sign
= a
.sign
^ b
.sign
;
1119 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1120 uint64_t temp_lo
, temp_hi
;
1121 int exp
= a
.exp
- b
.exp
;
1122 if (a
.frac
< b
.frac
) {
1124 shortShift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1,
1125 &temp_hi
, &temp_lo
);
1127 shortShift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
,
1128 &temp_hi
, &temp_lo
);
1130 /* LSB of quot is set if inexact which roundandpack will use
1131 * to set flags. Yet again we re-use a for the result */
1132 a
.frac
= div128To64(temp_lo
, temp_hi
, b
.frac
);
1137 /* handle all the NaN cases */
1138 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1139 return pick_nan(a
, b
, s
);
1141 /* 0/0 or Inf/Inf */
1144 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1145 s
->float_exception_flags
|= float_flag_invalid
;
1146 a
.cls
= float_class_dnan
;
1149 /* Inf / x or 0 / x */
1150 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1155 if (b
.cls
== float_class_zero
) {
1156 s
->float_exception_flags
|= float_flag_divbyzero
;
1157 a
.cls
= float_class_inf
;
1162 if (b
.cls
== float_class_inf
) {
1163 a
.cls
= float_class_zero
;
1167 g_assert_not_reached();
1170 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1172 FloatParts pa
= float16_unpack_canonical(a
, status
);
1173 FloatParts pb
= float16_unpack_canonical(b
, status
);
1174 FloatParts pr
= div_floats(pa
, pb
, status
);
1176 return float16_round_pack_canonical(pr
, status
);
1179 float32
float32_div(float32 a
, float32 b
, float_status
*status
)
1181 FloatParts pa
= float32_unpack_canonical(a
, status
);
1182 FloatParts pb
= float32_unpack_canonical(b
, status
);
1183 FloatParts pr
= div_floats(pa
, pb
, status
);
1185 return float32_round_pack_canonical(pr
, status
);
1188 float64
float64_div(float64 a
, float64 b
, float_status
*status
)
1190 FloatParts pa
= float64_unpack_canonical(a
, status
);
1191 FloatParts pb
= float64_unpack_canonical(b
, status
);
1192 FloatParts pr
= div_floats(pa
, pb
, status
);
1194 return float64_round_pack_canonical(pr
, status
);
1198 * Rounds the floating-point value `a' to an integer, and returns the
1199 * result as a floating-point value. The operation is performed
1200 * according to the IEC/IEEE Standard for Binary Floating-Point
1204 static FloatParts
round_to_int(FloatParts a
, int rounding_mode
, float_status
*s
)
1206 if (is_nan(a
.cls
)) {
1207 return return_nan(a
, s
);
1211 case float_class_zero
:
1212 case float_class_inf
:
1213 case float_class_qnan
:
1214 /* already "integral" */
1216 case float_class_normal
:
1217 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
1218 /* already integral */
1223 /* all fractional */
1224 s
->float_exception_flags
|= float_flag_inexact
;
1225 switch (rounding_mode
) {
1226 case float_round_nearest_even
:
1227 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
1229 case float_round_ties_away
:
1230 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
1232 case float_round_to_zero
:
1235 case float_round_up
:
1238 case float_round_down
:
1242 g_assert_not_reached();
1246 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
1249 a
.cls
= float_class_zero
;
1252 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
1253 uint64_t frac_lsbm1
= frac_lsb
>> 1;
1254 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
1255 uint64_t rnd_mask
= rnd_even_mask
>> 1;
1258 switch (rounding_mode
) {
1259 case float_round_nearest_even
:
1260 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
1262 case float_round_ties_away
:
1265 case float_round_to_zero
:
1268 case float_round_up
:
1269 inc
= a
.sign
? 0 : rnd_mask
;
1271 case float_round_down
:
1272 inc
= a
.sign
? rnd_mask
: 0;
1275 g_assert_not_reached();
1278 if (a
.frac
& rnd_mask
) {
1279 s
->float_exception_flags
|= float_flag_inexact
;
1281 a
.frac
&= ~rnd_mask
;
1282 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
1290 g_assert_not_reached();
1295 float16
float16_round_to_int(float16 a
, float_status
*s
)
1297 FloatParts pa
= float16_unpack_canonical(a
, s
);
1298 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, s
);
1299 return float16_round_pack_canonical(pr
, s
);
1302 float32
float32_round_to_int(float32 a
, float_status
*s
)
1304 FloatParts pa
= float32_unpack_canonical(a
, s
);
1305 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, s
);
1306 return float32_round_pack_canonical(pr
, s
);
1309 float64
float64_round_to_int(float64 a
, float_status
*s
)
1311 FloatParts pa
= float64_unpack_canonical(a
, s
);
1312 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, s
);
1313 return float64_round_pack_canonical(pr
, s
);
1316 float64
float64_trunc_to_int(float64 a
, float_status
*s
)
1318 FloatParts pa
= float64_unpack_canonical(a
, s
);
1319 FloatParts pr
= round_to_int(pa
, float_round_to_zero
, s
);
1320 return float64_round_pack_canonical(pr
, s
);
1324 * Returns the result of converting the floating-point value `a' to
1325 * the two's complement integer format. The conversion is performed
1326 * according to the IEC/IEEE Standard for Binary Floating-Point
1327 * Arithmetic---which means in particular that the conversion is
1328 * rounded according to the current rounding mode. If `a' is a NaN,
1329 * the largest positive integer is returned. Otherwise, if the
1330 * conversion overflows, the largest integer with the same sign as `a'
1334 static int64_t round_to_int_and_pack(FloatParts in
, int rmode
,
1335 int64_t min
, int64_t max
,
1339 int orig_flags
= get_float_exception_flags(s
);
1340 FloatParts p
= round_to_int(in
, rmode
, s
);
1343 case float_class_snan
:
1344 case float_class_qnan
:
1345 case float_class_dnan
:
1346 case float_class_msnan
:
1347 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1349 case float_class_inf
:
1350 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1351 return p
.sign
? min
: max
;
1352 case float_class_zero
:
1354 case float_class_normal
:
1355 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
1356 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
1357 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
1358 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
1363 if (r
< -(uint64_t) min
) {
1366 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1373 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1378 g_assert_not_reached();
1382 #define FLOAT_TO_INT(fsz, isz) \
1383 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1386 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1387 return round_to_int_and_pack(p, s->float_rounding_mode, \
1388 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1392 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1393 (float ## fsz a, float_status *s) \
1395 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1396 return round_to_int_and_pack(p, float_round_to_zero, \
1397 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1401 FLOAT_TO_INT(16, 16)
1402 FLOAT_TO_INT(16, 32)
1403 FLOAT_TO_INT(16, 64)
1405 FLOAT_TO_INT(32, 16)
1406 FLOAT_TO_INT(32, 32)
1407 FLOAT_TO_INT(32, 64)
1409 FLOAT_TO_INT(64, 16)
1410 FLOAT_TO_INT(64, 32)
1411 FLOAT_TO_INT(64, 64)
1416 * Returns the result of converting the floating-point value `a' to
1417 * the unsigned integer format. The conversion is performed according
1418 * to the IEC/IEEE Standard for Binary Floating-Point
1419 * Arithmetic---which means in particular that the conversion is
1420 * rounded according to the current rounding mode. If `a' is a NaN,
1421 * the largest unsigned integer is returned. Otherwise, if the
1422 * conversion overflows, the largest unsigned integer is returned. If
1423 * the 'a' is negative, the result is rounded and zero is returned;
1424 * values that do not round to zero will raise the inexact exception
1428 static uint64_t round_to_uint_and_pack(FloatParts in
, int rmode
, uint64_t max
,
1431 int orig_flags
= get_float_exception_flags(s
);
1432 FloatParts p
= round_to_int(in
, rmode
, s
);
1435 case float_class_snan
:
1436 case float_class_qnan
:
1437 case float_class_dnan
:
1438 case float_class_msnan
:
1439 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1441 case float_class_inf
:
1442 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1443 return p
.sign
? 0 : max
;
1444 case float_class_zero
:
1446 case float_class_normal
:
1450 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1454 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
1455 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
1456 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
1457 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
1459 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1463 /* For uint64 this will never trip, but if p.exp is too large
1464 * to shift a decomposed fraction we shall have exited via the
1468 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1475 g_assert_not_reached();
1479 #define FLOAT_TO_UINT(fsz, isz) \
1480 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1483 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1484 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1485 UINT ## isz ## _MAX, s); \
1488 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1489 (float ## fsz a, float_status *s) \
1491 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1492 return round_to_uint_and_pack(p, float_round_to_zero, \
1493 UINT ## isz ## _MAX, s); \
1496 FLOAT_TO_UINT(16, 16)
1497 FLOAT_TO_UINT(16, 32)
1498 FLOAT_TO_UINT(16, 64)
1500 FLOAT_TO_UINT(32, 16)
1501 FLOAT_TO_UINT(32, 32)
1502 FLOAT_TO_UINT(32, 64)
1504 FLOAT_TO_UINT(64, 16)
1505 FLOAT_TO_UINT(64, 32)
1506 FLOAT_TO_UINT(64, 64)
1508 #undef FLOAT_TO_UINT
1511 * Integer to float conversions
1513 * Returns the result of converting the two's complement integer `a'
1514 * to the floating-point format. The conversion is performed according
1515 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1518 static FloatParts
int_to_float(int64_t a
, float_status
*status
)
1522 r
.cls
= float_class_zero
;
1524 } else if (a
== (1ULL << 63)) {
1525 r
.cls
= float_class_normal
;
1527 r
.frac
= DECOMPOSED_IMPLICIT_BIT
;
1538 int shift
= clz64(f
) - 1;
1539 r
.cls
= float_class_normal
;
1540 r
.exp
= (DECOMPOSED_BINARY_POINT
- shift
);
1541 r
.frac
= f
<< shift
;
1547 float16
int64_to_float16(int64_t a
, float_status
*status
)
1549 FloatParts pa
= int_to_float(a
, status
);
1550 return float16_round_pack_canonical(pa
, status
);
1553 float16
int32_to_float16(int32_t a
, float_status
*status
)
1555 return int64_to_float16(a
, status
);
1558 float16
int16_to_float16(int16_t a
, float_status
*status
)
1560 return int64_to_float16(a
, status
);
1563 float32
int64_to_float32(int64_t a
, float_status
*status
)
1565 FloatParts pa
= int_to_float(a
, status
);
1566 return float32_round_pack_canonical(pa
, status
);
1569 float32
int32_to_float32(int32_t a
, float_status
*status
)
1571 return int64_to_float32(a
, status
);
1574 float32
int16_to_float32(int16_t a
, float_status
*status
)
1576 return int64_to_float32(a
, status
);
1579 float64
int64_to_float64(int64_t a
, float_status
*status
)
1581 FloatParts pa
= int_to_float(a
, status
);
1582 return float64_round_pack_canonical(pa
, status
);
1585 float64
int32_to_float64(int32_t a
, float_status
*status
)
1587 return int64_to_float64(a
, status
);
1590 float64
int16_to_float64(int16_t a
, float_status
*status
)
1592 return int64_to_float64(a
, status
);
1597 * Unsigned Integer to float conversions
1599 * Returns the result of converting the unsigned integer `a' to the
1600 * floating-point format. The conversion is performed according to the
1601 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1604 static FloatParts
uint_to_float(uint64_t a
, float_status
*status
)
1606 FloatParts r
= { .sign
= false};
1609 r
.cls
= float_class_zero
;
1611 int spare_bits
= clz64(a
) - 1;
1612 r
.cls
= float_class_normal
;
1613 r
.exp
= DECOMPOSED_BINARY_POINT
- spare_bits
;
1614 if (spare_bits
< 0) {
1615 shift64RightJamming(a
, -spare_bits
, &a
);
1618 r
.frac
= a
<< spare_bits
;
1625 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
1627 FloatParts pa
= uint_to_float(a
, status
);
1628 return float16_round_pack_canonical(pa
, status
);
1631 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
1633 return uint64_to_float16(a
, status
);
1636 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
1638 return uint64_to_float16(a
, status
);
1641 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
1643 FloatParts pa
= uint_to_float(a
, status
);
1644 return float32_round_pack_canonical(pa
, status
);
1647 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
1649 return uint64_to_float32(a
, status
);
1652 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
1654 return uint64_to_float32(a
, status
);
1657 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
1659 FloatParts pa
= uint_to_float(a
, status
);
1660 return float64_round_pack_canonical(pa
, status
);
1663 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
1665 return uint64_to_float64(a
, status
);
1668 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
1670 return uint64_to_float64(a
, status
);
1674 /* min() and max() functions. These can't be implemented as
1675 * 'compare and pick one input' because that would mishandle
1676 * NaNs and +0 vs -0.
1678 * minnum() and maxnum() functions. These are similar to the min()
1679 * and max() functions but if one of the arguments is a QNaN and
1680 * the other is numerical then the numerical argument is returned.
1681 * SNaNs will get quietened before being returned.
1682 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1683 * and maxNum() operations. min() and max() are the typical min/max
1684 * semantics provided by many CPUs which predate that specification.
1686 * minnummag() and maxnummag() functions correspond to minNumMag()
1687 * and minNumMag() from the IEEE-754 2008.
1689 static FloatParts
minmax_floats(FloatParts a
, FloatParts b
, bool ismin
,
1690 bool ieee
, bool ismag
, float_status
*s
)
1692 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
1694 /* Takes two floating-point values `a' and `b', one of
1695 * which is a NaN, and returns the appropriate NaN
1696 * result. If either `a' or `b' is a signaling NaN,
1697 * the invalid exception is raised.
1699 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
1700 return pick_nan(a
, b
, s
);
1701 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
1703 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
1707 return pick_nan(a
, b
, s
);
1712 case float_class_normal
:
1715 case float_class_inf
:
1718 case float_class_zero
:
1722 g_assert_not_reached();
1726 case float_class_normal
:
1729 case float_class_inf
:
1732 case float_class_zero
:
1736 g_assert_not_reached();
1740 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
1741 bool a_less
= a_exp
< b_exp
;
1742 if (a_exp
== b_exp
) {
1743 a_less
= a
.frac
< b
.frac
;
1745 return a_less
^ ismin
? b
: a
;
1748 if (a
.sign
== b
.sign
) {
1749 bool a_less
= a_exp
< b_exp
;
1750 if (a_exp
== b_exp
) {
1751 a_less
= a
.frac
< b
.frac
;
1753 return a
.sign
^ a_less
^ ismin
? b
: a
;
1755 return a
.sign
^ ismin
? b
: a
;
1760 #define MINMAX(sz, name, ismin, isiee, ismag) \
1761 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1764 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1765 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1766 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1768 return float ## sz ## _round_pack_canonical(pr, s); \
1771 MINMAX(16, min
, true, false, false)
1772 MINMAX(16, minnum
, true, true, false)
1773 MINMAX(16, minnummag
, true, true, true)
1774 MINMAX(16, max
, false, false, false)
1775 MINMAX(16, maxnum
, false, true, false)
1776 MINMAX(16, maxnummag
, false, true, true)
1778 MINMAX(32, min
, true, false, false)
1779 MINMAX(32, minnum
, true, true, false)
1780 MINMAX(32, minnummag
, true, true, true)
1781 MINMAX(32, max
, false, false, false)
1782 MINMAX(32, maxnum
, false, true, false)
1783 MINMAX(32, maxnummag
, false, true, true)
1785 MINMAX(64, min
, true, false, false)
1786 MINMAX(64, minnum
, true, true, false)
1787 MINMAX(64, minnummag
, true, true, true)
1788 MINMAX(64, max
, false, false, false)
1789 MINMAX(64, maxnum
, false, true, false)
1790 MINMAX(64, maxnummag
, false, true, true)
1794 /* Floating point compare */
1795 static int compare_floats(FloatParts a
, FloatParts b
, bool is_quiet
,
1798 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1800 a
.cls
== float_class_snan
||
1801 b
.cls
== float_class_snan
) {
1802 s
->float_exception_flags
|= float_flag_invalid
;
1804 return float_relation_unordered
;
1807 if (a
.cls
== float_class_zero
) {
1808 if (b
.cls
== float_class_zero
) {
1809 return float_relation_equal
;
1811 return b
.sign
? float_relation_greater
: float_relation_less
;
1812 } else if (b
.cls
== float_class_zero
) {
1813 return a
.sign
? float_relation_less
: float_relation_greater
;
1816 /* The only really important thing about infinity is its sign. If
1817 * both are infinities the sign marks the smallest of the two.
1819 if (a
.cls
== float_class_inf
) {
1820 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
1821 return float_relation_equal
;
1823 return a
.sign
? float_relation_less
: float_relation_greater
;
1824 } else if (b
.cls
== float_class_inf
) {
1825 return b
.sign
? float_relation_greater
: float_relation_less
;
1828 if (a
.sign
!= b
.sign
) {
1829 return a
.sign
? float_relation_less
: float_relation_greater
;
1832 if (a
.exp
== b
.exp
) {
1833 if (a
.frac
== b
.frac
) {
1834 return float_relation_equal
;
1837 return a
.frac
> b
.frac
?
1838 float_relation_less
: float_relation_greater
;
1840 return a
.frac
> b
.frac
?
1841 float_relation_greater
: float_relation_less
;
1845 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
1847 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
1852 #define COMPARE(sz) \
1853 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1856 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1857 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1858 return compare_floats(pa, pb, false, s); \
1860 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1863 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1864 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1865 return compare_floats(pa, pb, true, s); \
1874 /* Multiply A by 2 raised to the power N. */
1875 static FloatParts
scalbn_decomposed(FloatParts a
, int n
, float_status
*s
)
1877 if (unlikely(is_nan(a
.cls
))) {
1878 return return_nan(a
, s
);
1880 if (a
.cls
== float_class_normal
) {
1881 /* The largest float type (even though not supported by FloatParts)
1882 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
1883 * still allows rounding to infinity, without allowing overflow
1884 * within the int32_t that backs FloatParts.exp.
1886 n
= MIN(MAX(n
, -0x10000), 0x10000);
1892 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
1894 FloatParts pa
= float16_unpack_canonical(a
, status
);
1895 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
1896 return float16_round_pack_canonical(pr
, status
);
1899 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
1901 FloatParts pa
= float32_unpack_canonical(a
, status
);
1902 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
1903 return float32_round_pack_canonical(pr
, status
);
1906 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
1908 FloatParts pa
= float64_unpack_canonical(a
, status
);
1909 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
1910 return float64_round_pack_canonical(pr
, status
);
1916 * The old softfloat code did an approximation step before zeroing in
1917 * on the final result. However for simpleness we just compute the
1918 * square root by iterating down from the implicit bit to enough extra
1919 * bits to ensure we get a correctly rounded result.
1921 * This does mean however the calculation is slower than before,
1922 * especially for 64 bit floats.
1925 static FloatParts
sqrt_float(FloatParts a
, float_status
*s
, const FloatFmt
*p
)
1927 uint64_t a_frac
, r_frac
, s_frac
;
1930 if (is_nan(a
.cls
)) {
1931 return return_nan(a
, s
);
1933 if (a
.cls
== float_class_zero
) {
1934 return a
; /* sqrt(+-0) = +-0 */
1937 s
->float_exception_flags
|= float_flag_invalid
;
1938 a
.cls
= float_class_dnan
;
1941 if (a
.cls
== float_class_inf
) {
1942 return a
; /* sqrt(+inf) = +inf */
1945 assert(a
.cls
== float_class_normal
);
1947 /* We need two overflow bits at the top. Adding room for that is a
1948 * right shift. If the exponent is odd, we can discard the low bit
1949 * by multiplying the fraction by 2; that's a left shift. Combine
1950 * those and we shift right if the exponent is even.
1958 /* Bit-by-bit computation of sqrt. */
1962 /* Iterate from implicit bit down to the 3 extra bits to compute a
1963 * properly rounded result. Remember we've inserted one more bit
1964 * at the top, so these positions are one less.
1966 bit
= DECOMPOSED_BINARY_POINT
- 1;
1967 last_bit
= MAX(p
->frac_shift
- 4, 0);
1969 uint64_t q
= 1ULL << bit
;
1970 uint64_t t_frac
= s_frac
+ q
;
1971 if (t_frac
<= a_frac
) {
1972 s_frac
= t_frac
+ q
;
1977 } while (--bit
>= last_bit
);
1979 /* Undo the right shift done above. If there is any remaining
1980 * fraction, the result is inexact. Set the sticky bit.
1982 a
.frac
= (r_frac
<< 1) + (a_frac
!= 0);
1987 float16
__attribute__((flatten
)) float16_sqrt(float16 a
, float_status
*status
)
1989 FloatParts pa
= float16_unpack_canonical(a
, status
);
1990 FloatParts pr
= sqrt_float(pa
, status
, &float16_params
);
1991 return float16_round_pack_canonical(pr
, status
);
1994 float32
__attribute__((flatten
)) float32_sqrt(float32 a
, float_status
*status
)
1996 FloatParts pa
= float32_unpack_canonical(a
, status
);
1997 FloatParts pr
= sqrt_float(pa
, status
, &float32_params
);
1998 return float32_round_pack_canonical(pr
, status
);
2001 float64
__attribute__((flatten
)) float64_sqrt(float64 a
, float_status
*status
)
2003 FloatParts pa
= float64_unpack_canonical(a
, status
);
2004 FloatParts pr
= sqrt_float(pa
, status
, &float64_params
);
2005 return float64_round_pack_canonical(pr
, status
);
2009 /*----------------------------------------------------------------------------
2010 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2011 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2012 | input. If `zSign' is 1, the input is negated before being converted to an
2013 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2014 | is simply rounded to an integer, with the inexact exception raised if the
2015 | input cannot be represented exactly as an integer. However, if the fixed-
2016 | point input is too large, the invalid exception is raised and the largest
2017 | positive or negative integer is returned.
2018 *----------------------------------------------------------------------------*/
2020 static int32_t roundAndPackInt32(flag zSign
, uint64_t absZ
, float_status
*status
)
2022 int8_t roundingMode
;
2023 flag roundNearestEven
;
2024 int8_t roundIncrement
, roundBits
;
2027 roundingMode
= status
->float_rounding_mode
;
2028 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2029 switch (roundingMode
) {
2030 case float_round_nearest_even
:
2031 case float_round_ties_away
:
2032 roundIncrement
= 0x40;
2034 case float_round_to_zero
:
2037 case float_round_up
:
2038 roundIncrement
= zSign
? 0 : 0x7f;
2040 case float_round_down
:
2041 roundIncrement
= zSign
? 0x7f : 0;
2046 roundBits
= absZ
& 0x7F;
2047 absZ
= ( absZ
+ roundIncrement
)>>7;
2048 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
2050 if ( zSign
) z
= - z
;
2051 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
2052 float_raise(float_flag_invalid
, status
);
2053 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
2056 status
->float_exception_flags
|= float_flag_inexact
;
2062 /*----------------------------------------------------------------------------
2063 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2064 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2065 | and returns the properly rounded 64-bit integer corresponding to the input.
2066 | If `zSign' is 1, the input is negated before being converted to an integer.
2067 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2068 | the inexact exception raised if the input cannot be represented exactly as
2069 | an integer. However, if the fixed-point input is too large, the invalid
2070 | exception is raised and the largest positive or negative integer is
2072 *----------------------------------------------------------------------------*/
2074 static int64_t roundAndPackInt64(flag zSign
, uint64_t absZ0
, uint64_t absZ1
,
2075 float_status
*status
)
2077 int8_t roundingMode
;
2078 flag roundNearestEven
, increment
;
2081 roundingMode
= status
->float_rounding_mode
;
2082 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2083 switch (roundingMode
) {
2084 case float_round_nearest_even
:
2085 case float_round_ties_away
:
2086 increment
= ((int64_t) absZ1
< 0);
2088 case float_round_to_zero
:
2091 case float_round_up
:
2092 increment
= !zSign
&& absZ1
;
2094 case float_round_down
:
2095 increment
= zSign
&& absZ1
;
2102 if ( absZ0
== 0 ) goto overflow
;
2103 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
2106 if ( zSign
) z
= - z
;
2107 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
2109 float_raise(float_flag_invalid
, status
);
2111 zSign
? (int64_t) LIT64( 0x8000000000000000 )
2112 : LIT64( 0x7FFFFFFFFFFFFFFF );
2115 status
->float_exception_flags
|= float_flag_inexact
;
2121 /*----------------------------------------------------------------------------
2122 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2123 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2124 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2125 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2126 | with the inexact exception raised if the input cannot be represented exactly
2127 | as an integer. However, if the fixed-point input is too large, the invalid
2128 | exception is raised and the largest unsigned integer is returned.
2129 *----------------------------------------------------------------------------*/
2131 static int64_t roundAndPackUint64(flag zSign
, uint64_t absZ0
,
2132 uint64_t absZ1
, float_status
*status
)
2134 int8_t roundingMode
;
2135 flag roundNearestEven
, increment
;
2137 roundingMode
= status
->float_rounding_mode
;
2138 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
2139 switch (roundingMode
) {
2140 case float_round_nearest_even
:
2141 case float_round_ties_away
:
2142 increment
= ((int64_t)absZ1
< 0);
2144 case float_round_to_zero
:
2147 case float_round_up
:
2148 increment
= !zSign
&& absZ1
;
2150 case float_round_down
:
2151 increment
= zSign
&& absZ1
;
2159 float_raise(float_flag_invalid
, status
);
2160 return LIT64(0xFFFFFFFFFFFFFFFF);
2162 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
2165 if (zSign
&& absZ0
) {
2166 float_raise(float_flag_invalid
, status
);
2171 status
->float_exception_flags
|= float_flag_inexact
;
2176 /*----------------------------------------------------------------------------
2177 | If `a' is denormal and we are in flush-to-zero mode then set the
2178 | input-denormal exception and return zero. Otherwise just return the value.
2179 *----------------------------------------------------------------------------*/
2180 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
2182 if (status
->flush_inputs_to_zero
) {
2183 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
2184 float_raise(float_flag_input_denormal
, status
);
2185 return make_float32(float32_val(a
) & 0x80000000);
2191 /*----------------------------------------------------------------------------
2192 | Normalizes the subnormal single-precision floating-point value represented
2193 | by the denormalized significand `aSig'. The normalized exponent and
2194 | significand are stored at the locations pointed to by `zExpPtr' and
2195 | `zSigPtr', respectively.
2196 *----------------------------------------------------------------------------*/
2199 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
2203 shiftCount
= countLeadingZeros32( aSig
) - 8;
2204 *zSigPtr
= aSig
<<shiftCount
;
2205 *zExpPtr
= 1 - shiftCount
;
2209 /*----------------------------------------------------------------------------
2210 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2211 | and significand `zSig', and returns the proper single-precision floating-
2212 | point value corresponding to the abstract input. Ordinarily, the abstract
2213 | value is simply rounded and packed into the single-precision format, with
2214 | the inexact exception raised if the abstract input cannot be represented
2215 | exactly. However, if the abstract value is too large, the overflow and
2216 | inexact exceptions are raised and an infinity or maximal finite value is
2217 | returned. If the abstract value is too small, the input value is rounded to
2218 | a subnormal number, and the underflow and inexact exceptions are raised if
2219 | the abstract input cannot be represented exactly as a subnormal single-
2220 | precision floating-point number.
2221 | The input significand `zSig' has its binary point between bits 30
2222 | and 29, which is 7 bits to the left of the usual location. This shifted
2223 | significand must be normalized or smaller. If `zSig' is not normalized,
2224 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2225 | and it must not require rounding. In the usual case that `zSig' is
2226 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2227 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2228 | Binary Floating-Point Arithmetic.
2229 *----------------------------------------------------------------------------*/
2231 static float32
roundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
2232 float_status
*status
)
2234 int8_t roundingMode
;
2235 flag roundNearestEven
;
2236 int8_t roundIncrement
, roundBits
;
2239 roundingMode
= status
->float_rounding_mode
;
2240 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2241 switch (roundingMode
) {
2242 case float_round_nearest_even
:
2243 case float_round_ties_away
:
2244 roundIncrement
= 0x40;
2246 case float_round_to_zero
:
2249 case float_round_up
:
2250 roundIncrement
= zSign
? 0 : 0x7f;
2252 case float_round_down
:
2253 roundIncrement
= zSign
? 0x7f : 0;
2259 roundBits
= zSig
& 0x7F;
2260 if ( 0xFD <= (uint16_t) zExp
) {
2261 if ( ( 0xFD < zExp
)
2262 || ( ( zExp
== 0xFD )
2263 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
2265 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2266 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
2269 if (status
->flush_to_zero
) {
2270 float_raise(float_flag_output_denormal
, status
);
2271 return packFloat32(zSign
, 0, 0);
2274 (status
->float_detect_tininess
2275 == float_tininess_before_rounding
)
2277 || ( zSig
+ roundIncrement
< 0x80000000 );
2278 shift32RightJamming( zSig
, - zExp
, &zSig
);
2280 roundBits
= zSig
& 0x7F;
2281 if (isTiny
&& roundBits
) {
2282 float_raise(float_flag_underflow
, status
);
2287 status
->float_exception_flags
|= float_flag_inexact
;
2289 zSig
= ( zSig
+ roundIncrement
)>>7;
2290 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
2291 if ( zSig
== 0 ) zExp
= 0;
2292 return packFloat32( zSign
, zExp
, zSig
);
2296 /*----------------------------------------------------------------------------
2297 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2298 | and significand `zSig', and returns the proper single-precision floating-
2299 | point value corresponding to the abstract input. This routine is just like
2300 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2301 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2302 | floating-point exponent.
2303 *----------------------------------------------------------------------------*/
2306 normalizeRoundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
2307 float_status
*status
)
2311 shiftCount
= countLeadingZeros32( zSig
) - 1;
2312 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
2317 /*----------------------------------------------------------------------------
2318 | If `a' is denormal and we are in flush-to-zero mode then set the
2319 | input-denormal exception and return zero. Otherwise just return the value.
2320 *----------------------------------------------------------------------------*/
2321 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
2323 if (status
->flush_inputs_to_zero
) {
2324 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
2325 float_raise(float_flag_input_denormal
, status
);
2326 return make_float64(float64_val(a
) & (1ULL << 63));
2332 /*----------------------------------------------------------------------------
2333 | Normalizes the subnormal double-precision floating-point value represented
2334 | by the denormalized significand `aSig'. The normalized exponent and
2335 | significand are stored at the locations pointed to by `zExpPtr' and
2336 | `zSigPtr', respectively.
2337 *----------------------------------------------------------------------------*/
2340 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
2344 shiftCount
= countLeadingZeros64( aSig
) - 11;
2345 *zSigPtr
= aSig
<<shiftCount
;
2346 *zExpPtr
= 1 - shiftCount
;
2350 /*----------------------------------------------------------------------------
2351 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2352 | double-precision floating-point value, returning the result. After being
2353 | shifted into the proper positions, the three fields are simply added
2354 | together to form the result. This means that any integer portion of `zSig'
2355 | will be added into the exponent. Since a properly normalized significand
2356 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2357 | than the desired result exponent whenever `zSig' is a complete, normalized
2359 *----------------------------------------------------------------------------*/
2361 static inline float64
packFloat64(flag zSign
, int zExp
, uint64_t zSig
)
2364 return make_float64(
2365 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
2369 /*----------------------------------------------------------------------------
2370 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2371 | and significand `zSig', and returns the proper double-precision floating-
2372 | point value corresponding to the abstract input. Ordinarily, the abstract
2373 | value is simply rounded and packed into the double-precision format, with
2374 | the inexact exception raised if the abstract input cannot be represented
2375 | exactly. However, if the abstract value is too large, the overflow and
2376 | inexact exceptions are raised and an infinity or maximal finite value is
2377 | returned. If the abstract value is too small, the input value is rounded to
2378 | a subnormal number, and the underflow and inexact exceptions are raised if
2379 | the abstract input cannot be represented exactly as a subnormal double-
2380 | precision floating-point number.
2381 | The input significand `zSig' has its binary point between bits 62
2382 | and 61, which is 10 bits to the left of the usual location. This shifted
2383 | significand must be normalized or smaller. If `zSig' is not normalized,
2384 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2385 | and it must not require rounding. In the usual case that `zSig' is
2386 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2387 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2388 | Binary Floating-Point Arithmetic.
2389 *----------------------------------------------------------------------------*/
2391 static float64
roundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
2392 float_status
*status
)
2394 int8_t roundingMode
;
2395 flag roundNearestEven
;
2396 int roundIncrement
, roundBits
;
2399 roundingMode
= status
->float_rounding_mode
;
2400 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2401 switch (roundingMode
) {
2402 case float_round_nearest_even
:
2403 case float_round_ties_away
:
2404 roundIncrement
= 0x200;
2406 case float_round_to_zero
:
2409 case float_round_up
:
2410 roundIncrement
= zSign
? 0 : 0x3ff;
2412 case float_round_down
:
2413 roundIncrement
= zSign
? 0x3ff : 0;
2415 case float_round_to_odd
:
2416 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
2421 roundBits
= zSig
& 0x3FF;
2422 if ( 0x7FD <= (uint16_t) zExp
) {
2423 if ( ( 0x7FD < zExp
)
2424 || ( ( zExp
== 0x7FD )
2425 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
2427 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
2428 roundIncrement
!= 0;
2429 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2430 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
2433 if (status
->flush_to_zero
) {
2434 float_raise(float_flag_output_denormal
, status
);
2435 return packFloat64(zSign
, 0, 0);
2438 (status
->float_detect_tininess
2439 == float_tininess_before_rounding
)
2441 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
2442 shift64RightJamming( zSig
, - zExp
, &zSig
);
2444 roundBits
= zSig
& 0x3FF;
2445 if (isTiny
&& roundBits
) {
2446 float_raise(float_flag_underflow
, status
);
2448 if (roundingMode
== float_round_to_odd
) {
2450 * For round-to-odd case, the roundIncrement depends on
2451 * zSig which just changed.
2453 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
2458 status
->float_exception_flags
|= float_flag_inexact
;
2460 zSig
= ( zSig
+ roundIncrement
)>>10;
2461 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
2462 if ( zSig
== 0 ) zExp
= 0;
2463 return packFloat64( zSign
, zExp
, zSig
);
2467 /*----------------------------------------------------------------------------
2468 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2469 | and significand `zSig', and returns the proper double-precision floating-
2470 | point value corresponding to the abstract input. This routine is just like
2471 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2472 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2473 | floating-point exponent.
2474 *----------------------------------------------------------------------------*/
2477 normalizeRoundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
2478 float_status
*status
)
2482 shiftCount
= countLeadingZeros64( zSig
) - 1;
2483 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
2488 /*----------------------------------------------------------------------------
2489 | Normalizes the subnormal extended double-precision floating-point value
2490 | represented by the denormalized significand `aSig'. The normalized exponent
2491 | and significand are stored at the locations pointed to by `zExpPtr' and
2492 | `zSigPtr', respectively.
2493 *----------------------------------------------------------------------------*/
2495 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
2500 shiftCount
= countLeadingZeros64( aSig
);
2501 *zSigPtr
= aSig
<<shiftCount
;
2502 *zExpPtr
= 1 - shiftCount
;
2505 /*----------------------------------------------------------------------------
2506 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2507 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2508 | and returns the proper extended double-precision floating-point value
2509 | corresponding to the abstract input. Ordinarily, the abstract value is
2510 | rounded and packed into the extended double-precision format, with the
2511 | inexact exception raised if the abstract input cannot be represented
2512 | exactly. However, if the abstract value is too large, the overflow and
2513 | inexact exceptions are raised and an infinity or maximal finite value is
2514 | returned. If the abstract value is too small, the input value is rounded to
2515 | a subnormal number, and the underflow and inexact exceptions are raised if
2516 | the abstract input cannot be represented exactly as a subnormal extended
2517 | double-precision floating-point number.
2518 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2519 | number of bits as single or double precision, respectively. Otherwise, the
2520 | result is rounded to the full precision of the extended double-precision
2522 | The input significand must be normalized or smaller. If the input
2523 | significand is not normalized, `zExp' must be 0; in that case, the result
2524 | returned is a subnormal number, and it must not require rounding. The
2525 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2526 | Floating-Point Arithmetic.
2527 *----------------------------------------------------------------------------*/
2529 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, flag zSign
,
2530 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
2531 float_status
*status
)
2533 int8_t roundingMode
;
2534 flag roundNearestEven
, increment
, isTiny
;
2535 int64_t roundIncrement
, roundMask
, roundBits
;
2537 roundingMode
= status
->float_rounding_mode
;
2538 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2539 if ( roundingPrecision
== 80 ) goto precision80
;
2540 if ( roundingPrecision
== 64 ) {
2541 roundIncrement
= LIT64( 0x0000000000000400 );
2542 roundMask
= LIT64( 0x00000000000007FF );
2544 else if ( roundingPrecision
== 32 ) {
2545 roundIncrement
= LIT64( 0x0000008000000000 );
2546 roundMask
= LIT64( 0x000000FFFFFFFFFF );
2551 zSig0
|= ( zSig1
!= 0 );
2552 switch (roundingMode
) {
2553 case float_round_nearest_even
:
2554 case float_round_ties_away
:
2556 case float_round_to_zero
:
2559 case float_round_up
:
2560 roundIncrement
= zSign
? 0 : roundMask
;
2562 case float_round_down
:
2563 roundIncrement
= zSign
? roundMask
: 0;
2568 roundBits
= zSig0
& roundMask
;
2569 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
2570 if ( ( 0x7FFE < zExp
)
2571 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
2576 if (status
->flush_to_zero
) {
2577 float_raise(float_flag_output_denormal
, status
);
2578 return packFloatx80(zSign
, 0, 0);
2581 (status
->float_detect_tininess
2582 == float_tininess_before_rounding
)
2584 || ( zSig0
<= zSig0
+ roundIncrement
);
2585 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
2587 roundBits
= zSig0
& roundMask
;
2588 if (isTiny
&& roundBits
) {
2589 float_raise(float_flag_underflow
, status
);
2592 status
->float_exception_flags
|= float_flag_inexact
;
2594 zSig0
+= roundIncrement
;
2595 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
2596 roundIncrement
= roundMask
+ 1;
2597 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
2598 roundMask
|= roundIncrement
;
2600 zSig0
&= ~ roundMask
;
2601 return packFloatx80( zSign
, zExp
, zSig0
);
2605 status
->float_exception_flags
|= float_flag_inexact
;
2607 zSig0
+= roundIncrement
;
2608 if ( zSig0
< roundIncrement
) {
2610 zSig0
= LIT64( 0x8000000000000000 );
2612 roundIncrement
= roundMask
+ 1;
2613 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
2614 roundMask
|= roundIncrement
;
2616 zSig0
&= ~ roundMask
;
2617 if ( zSig0
== 0 ) zExp
= 0;
2618 return packFloatx80( zSign
, zExp
, zSig0
);
2620 switch (roundingMode
) {
2621 case float_round_nearest_even
:
2622 case float_round_ties_away
:
2623 increment
= ((int64_t)zSig1
< 0);
2625 case float_round_to_zero
:
2628 case float_round_up
:
2629 increment
= !zSign
&& zSig1
;
2631 case float_round_down
:
2632 increment
= zSign
&& zSig1
;
2637 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
2638 if ( ( 0x7FFE < zExp
)
2639 || ( ( zExp
== 0x7FFE )
2640 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
2646 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2647 if ( ( roundingMode
== float_round_to_zero
)
2648 || ( zSign
&& ( roundingMode
== float_round_up
) )
2649 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
2651 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
2653 return packFloatx80(zSign
,
2654 floatx80_infinity_high
,
2655 floatx80_infinity_low
);
2659 (status
->float_detect_tininess
2660 == float_tininess_before_rounding
)
2663 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
2664 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
2666 if (isTiny
&& zSig1
) {
2667 float_raise(float_flag_underflow
, status
);
2670 status
->float_exception_flags
|= float_flag_inexact
;
2672 switch (roundingMode
) {
2673 case float_round_nearest_even
:
2674 case float_round_ties_away
:
2675 increment
= ((int64_t)zSig1
< 0);
2677 case float_round_to_zero
:
2680 case float_round_up
:
2681 increment
= !zSign
&& zSig1
;
2683 case float_round_down
:
2684 increment
= zSign
&& zSig1
;
2692 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
2693 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
2695 return packFloatx80( zSign
, zExp
, zSig0
);
2699 status
->float_exception_flags
|= float_flag_inexact
;
2705 zSig0
= LIT64( 0x8000000000000000 );
2708 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
2712 if ( zSig0
== 0 ) zExp
= 0;
2714 return packFloatx80( zSign
, zExp
, zSig0
);
2718 /*----------------------------------------------------------------------------
2719 | Takes an abstract floating-point value having sign `zSign', exponent
2720 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2721 | and returns the proper extended double-precision floating-point value
2722 | corresponding to the abstract input. This routine is just like
2723 | `roundAndPackFloatx80' except that the input significand does not have to be
2725 *----------------------------------------------------------------------------*/
2727 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
2728 flag zSign
, int32_t zExp
,
2729 uint64_t zSig0
, uint64_t zSig1
,
2730 float_status
*status
)
2739 shiftCount
= countLeadingZeros64( zSig0
);
2740 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
2742 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
2743 zSig0
, zSig1
, status
);
2747 /*----------------------------------------------------------------------------
2748 | Returns the least-significant 64 fraction bits of the quadruple-precision
2749 | floating-point value `a'.
2750 *----------------------------------------------------------------------------*/
2752 static inline uint64_t extractFloat128Frac1( float128 a
)
2759 /*----------------------------------------------------------------------------
2760 | Returns the most-significant 48 fraction bits of the quadruple-precision
2761 | floating-point value `a'.
2762 *----------------------------------------------------------------------------*/
2764 static inline uint64_t extractFloat128Frac0( float128 a
)
2767 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
2771 /*----------------------------------------------------------------------------
2772 | Returns the exponent bits of the quadruple-precision floating-point value
2774 *----------------------------------------------------------------------------*/
2776 static inline int32_t extractFloat128Exp( float128 a
)
2779 return ( a
.high
>>48 ) & 0x7FFF;
2783 /*----------------------------------------------------------------------------
2784 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2785 *----------------------------------------------------------------------------*/
2787 static inline flag
extractFloat128Sign( float128 a
)
2794 /*----------------------------------------------------------------------------
2795 | Normalizes the subnormal quadruple-precision floating-point value
2796 | represented by the denormalized significand formed by the concatenation of
2797 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2798 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2799 | significand are stored at the location pointed to by `zSig0Ptr', and the
2800 | least significant 64 bits of the normalized significand are stored at the
2801 | location pointed to by `zSig1Ptr'.
2802 *----------------------------------------------------------------------------*/
2805 normalizeFloat128Subnormal(
2816 shiftCount
= countLeadingZeros64( aSig1
) - 15;
2817 if ( shiftCount
< 0 ) {
2818 *zSig0Ptr
= aSig1
>>( - shiftCount
);
2819 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
2822 *zSig0Ptr
= aSig1
<<shiftCount
;
2825 *zExpPtr
= - shiftCount
- 63;
2828 shiftCount
= countLeadingZeros64( aSig0
) - 15;
2829 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
2830 *zExpPtr
= 1 - shiftCount
;
2835 /*----------------------------------------------------------------------------
2836 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2837 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2838 | floating-point value, returning the result. After being shifted into the
2839 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2840 | added together to form the most significant 32 bits of the result. This
2841 | means that any integer portion of `zSig0' will be added into the exponent.
2842 | Since a properly normalized significand will have an integer portion equal
2843 | to 1, the `zExp' input should be 1 less than the desired result exponent
2844 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2846 *----------------------------------------------------------------------------*/
2848 static inline float128
2849 packFloat128( flag zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
2854 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
2859 /*----------------------------------------------------------------------------
2860 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2861 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2862 | and `zSig2', and returns the proper quadruple-precision floating-point value
2863 | corresponding to the abstract input. Ordinarily, the abstract value is
2864 | simply rounded and packed into the quadruple-precision format, with the
2865 | inexact exception raised if the abstract input cannot be represented
2866 | exactly. However, if the abstract value is too large, the overflow and
2867 | inexact exceptions are raised and an infinity or maximal finite value is
2868 | returned. If the abstract value is too small, the input value is rounded to
2869 | a subnormal number, and the underflow and inexact exceptions are raised if
2870 | the abstract input cannot be represented exactly as a subnormal quadruple-
2871 | precision floating-point number.
2872 | The input significand must be normalized or smaller. If the input
2873 | significand is not normalized, `zExp' must be 0; in that case, the result
2874 | returned is a subnormal number, and it must not require rounding. In the
2875 | usual case that the input significand is normalized, `zExp' must be 1 less
2876 | than the ``true'' floating-point exponent. The handling of underflow and
2877 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2878 *----------------------------------------------------------------------------*/
2880 static float128
roundAndPackFloat128(flag zSign
, int32_t zExp
,
2881 uint64_t zSig0
, uint64_t zSig1
,
2882 uint64_t zSig2
, float_status
*status
)
2884 int8_t roundingMode
;
2885 flag roundNearestEven
, increment
, isTiny
;
2887 roundingMode
= status
->float_rounding_mode
;
2888 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2889 switch (roundingMode
) {
2890 case float_round_nearest_even
:
2891 case float_round_ties_away
:
2892 increment
= ((int64_t)zSig2
< 0);
2894 case float_round_to_zero
:
2897 case float_round_up
:
2898 increment
= !zSign
&& zSig2
;
2900 case float_round_down
:
2901 increment
= zSign
&& zSig2
;
2903 case float_round_to_odd
:
2904 increment
= !(zSig1
& 0x1) && zSig2
;
2909 if ( 0x7FFD <= (uint32_t) zExp
) {
2910 if ( ( 0x7FFD < zExp
)
2911 || ( ( zExp
== 0x7FFD )
2913 LIT64( 0x0001FFFFFFFFFFFF ),
2914 LIT64( 0xFFFFFFFFFFFFFFFF ),
2921 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2922 if ( ( roundingMode
== float_round_to_zero
)
2923 || ( zSign
&& ( roundingMode
== float_round_up
) )
2924 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
2925 || (roundingMode
== float_round_to_odd
)
2931 LIT64( 0x0000FFFFFFFFFFFF ),
2932 LIT64( 0xFFFFFFFFFFFFFFFF )
2935 return packFloat128( zSign
, 0x7FFF, 0, 0 );
2938 if (status
->flush_to_zero
) {
2939 float_raise(float_flag_output_denormal
, status
);
2940 return packFloat128(zSign
, 0, 0, 0);
2943 (status
->float_detect_tininess
2944 == float_tininess_before_rounding
)
2950 LIT64( 0x0001FFFFFFFFFFFF ),
2951 LIT64( 0xFFFFFFFFFFFFFFFF )
2953 shift128ExtraRightJamming(
2954 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
2956 if (isTiny
&& zSig2
) {
2957 float_raise(float_flag_underflow
, status
);
2959 switch (roundingMode
) {
2960 case float_round_nearest_even
:
2961 case float_round_ties_away
:
2962 increment
= ((int64_t)zSig2
< 0);
2964 case float_round_to_zero
:
2967 case float_round_up
:
2968 increment
= !zSign
&& zSig2
;
2970 case float_round_down
:
2971 increment
= zSign
&& zSig2
;
2973 case float_round_to_odd
:
2974 increment
= !(zSig1
& 0x1) && zSig2
;
2982 status
->float_exception_flags
|= float_flag_inexact
;
2985 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
2986 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
2989 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
2991 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
2995 /*----------------------------------------------------------------------------
2996 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2997 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2998 | returns the proper quadruple-precision floating-point value corresponding
2999 | to the abstract input. This routine is just like `roundAndPackFloat128'
3000 | except that the input significand has fewer bits and does not have to be
3001 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3003 *----------------------------------------------------------------------------*/
3005 static float128
normalizeRoundAndPackFloat128(flag zSign
, int32_t zExp
,
3006 uint64_t zSig0
, uint64_t zSig1
,
3007 float_status
*status
)
3017 shiftCount
= countLeadingZeros64( zSig0
) - 15;
3018 if ( 0 <= shiftCount
) {
3020 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3023 shift128ExtraRightJamming(
3024 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
3027 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
3032 /*----------------------------------------------------------------------------
3033 | Returns the result of converting the 32-bit two's complement integer `a'
3034 | to the extended double-precision floating-point format. The conversion
3035 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3037 *----------------------------------------------------------------------------*/
3039 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
3046 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
3048 absA
= zSign
? - a
: a
;
3049 shiftCount
= countLeadingZeros32( absA
) + 32;
3051 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
3055 /*----------------------------------------------------------------------------
3056 | Returns the result of converting the 32-bit two's complement integer `a' to
3057 | the quadruple-precision floating-point format. The conversion is performed
3058 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3059 *----------------------------------------------------------------------------*/
3061 float128
int32_to_float128(int32_t a
, float_status
*status
)
3068 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
3070 absA
= zSign
? - a
: a
;
3071 shiftCount
= countLeadingZeros32( absA
) + 17;
3073 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
3077 /*----------------------------------------------------------------------------
3078 | Returns the result of converting the 64-bit two's complement integer `a'
3079 | to the extended double-precision floating-point format. The conversion
3080 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3082 *----------------------------------------------------------------------------*/
3084 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
3090 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
3092 absA
= zSign
? - a
: a
;
3093 shiftCount
= countLeadingZeros64( absA
);
3094 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
3098 /*----------------------------------------------------------------------------
3099 | Returns the result of converting the 64-bit two's complement integer `a' to
3100 | the quadruple-precision floating-point format. The conversion is performed
3101 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3102 *----------------------------------------------------------------------------*/
3104 float128
int64_to_float128(int64_t a
, float_status
*status
)
3110 uint64_t zSig0
, zSig1
;
3112 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
3114 absA
= zSign
? - a
: a
;
3115 shiftCount
= countLeadingZeros64( absA
) + 49;
3116 zExp
= 0x406E - shiftCount
;
3117 if ( 64 <= shiftCount
) {
3126 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3127 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
3131 /*----------------------------------------------------------------------------
3132 | Returns the result of converting the 64-bit unsigned integer `a'
3133 | to the quadruple-precision floating-point format. The conversion is performed
3134 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3135 *----------------------------------------------------------------------------*/
3137 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
3140 return float128_zero
;
3142 return normalizeRoundAndPackFloat128(0, 0x406E, a
, 0, status
);
3148 /*----------------------------------------------------------------------------
3149 | Returns the result of converting the single-precision floating-point value
3150 | `a' to the double-precision floating-point format. The conversion is
3151 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3153 *----------------------------------------------------------------------------*/
3155 float64
float32_to_float64(float32 a
, float_status
*status
)
3160 a
= float32_squash_input_denormal(a
, status
);
3162 aSig
= extractFloat32Frac( a
);
3163 aExp
= extractFloat32Exp( a
);
3164 aSign
= extractFloat32Sign( a
);
3165 if ( aExp
== 0xFF ) {
3167 return commonNaNToFloat64(float32ToCommonNaN(a
, status
), status
);
3169 return packFloat64( aSign
, 0x7FF, 0 );
3172 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
3173 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3176 return packFloat64( aSign
, aExp
+ 0x380, ( (uint64_t) aSig
)<<29 );
3180 /*----------------------------------------------------------------------------
3181 | Returns the result of converting the single-precision floating-point value
3182 | `a' to the extended double-precision floating-point format. The conversion
3183 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3185 *----------------------------------------------------------------------------*/
3187 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
3193 a
= float32_squash_input_denormal(a
, status
);
3194 aSig
= extractFloat32Frac( a
);
3195 aExp
= extractFloat32Exp( a
);
3196 aSign
= extractFloat32Sign( a
);
3197 if ( aExp
== 0xFF ) {
3199 return commonNaNToFloatx80(float32ToCommonNaN(a
, status
), status
);
3201 return packFloatx80(aSign
,
3202 floatx80_infinity_high
,
3203 floatx80_infinity_low
);
3206 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
3207 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3210 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
3214 /*----------------------------------------------------------------------------
3215 | Returns the result of converting the single-precision floating-point value
3216 | `a' to the double-precision floating-point format. The conversion is
3217 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3219 *----------------------------------------------------------------------------*/
3221 float128
float32_to_float128(float32 a
, float_status
*status
)
3227 a
= float32_squash_input_denormal(a
, status
);
3228 aSig
= extractFloat32Frac( a
);
3229 aExp
= extractFloat32Exp( a
);
3230 aSign
= extractFloat32Sign( a
);
3231 if ( aExp
== 0xFF ) {
3233 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
3235 return packFloat128( aSign
, 0x7FFF, 0, 0 );
3238 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
3239 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3242 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
3246 /*----------------------------------------------------------------------------
3247 | Returns the remainder of the single-precision floating-point value `a'
3248 | with respect to the corresponding value `b'. The operation is performed
3249 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3250 *----------------------------------------------------------------------------*/
3252 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
3255 int aExp
, bExp
, expDiff
;
3256 uint32_t aSig
, bSig
;
3258 uint64_t aSig64
, bSig64
, q64
;
3259 uint32_t alternateASig
;
3261 a
= float32_squash_input_denormal(a
, status
);
3262 b
= float32_squash_input_denormal(b
, status
);
3264 aSig
= extractFloat32Frac( a
);
3265 aExp
= extractFloat32Exp( a
);
3266 aSign
= extractFloat32Sign( a
);
3267 bSig
= extractFloat32Frac( b
);
3268 bExp
= extractFloat32Exp( b
);
3269 if ( aExp
== 0xFF ) {
3270 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
3271 return propagateFloat32NaN(a
, b
, status
);
3273 float_raise(float_flag_invalid
, status
);
3274 return float32_default_nan(status
);
3276 if ( bExp
== 0xFF ) {
3278 return propagateFloat32NaN(a
, b
, status
);
3284 float_raise(float_flag_invalid
, status
);
3285 return float32_default_nan(status
);
3287 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
3290 if ( aSig
== 0 ) return a
;
3291 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3293 expDiff
= aExp
- bExp
;
3296 if ( expDiff
< 32 ) {
3299 if ( expDiff
< 0 ) {
3300 if ( expDiff
< -1 ) return a
;
3303 q
= ( bSig
<= aSig
);
3304 if ( q
) aSig
-= bSig
;
3305 if ( 0 < expDiff
) {
3306 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
3309 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3317 if ( bSig
<= aSig
) aSig
-= bSig
;
3318 aSig64
= ( (uint64_t) aSig
)<<40;
3319 bSig64
= ( (uint64_t) bSig
)<<40;
3321 while ( 0 < expDiff
) {
3322 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
3323 q64
= ( 2 < q64
) ? q64
- 2 : 0;
3324 aSig64
= - ( ( bSig
* q64
)<<38 );
3328 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
3329 q64
= ( 2 < q64
) ? q64
- 2 : 0;
3330 q
= q64
>>( 64 - expDiff
);
3332 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
3335 alternateASig
= aSig
;
3338 } while ( 0 <= (int32_t) aSig
);
3339 sigMean
= aSig
+ alternateASig
;
3340 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3341 aSig
= alternateASig
;
3343 zSign
= ( (int32_t) aSig
< 0 );
3344 if ( zSign
) aSig
= - aSig
;
3345 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
3350 /*----------------------------------------------------------------------------
3351 | Returns the binary exponential of the single-precision floating-point value
3352 | `a'. The operation is performed according to the IEC/IEEE Standard for
3353 | Binary Floating-Point Arithmetic.
3355 | Uses the following identities:
3357 | 1. -------------------------------------------------------------------------
3361 | 2. -------------------------------------------------------------------------
3364 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3366 *----------------------------------------------------------------------------*/
3368 static const float64 float32_exp2_coefficients
[15] =
3370 const_float64( 0x3ff0000000000000ll
), /* 1 */
3371 const_float64( 0x3fe0000000000000ll
), /* 2 */
3372 const_float64( 0x3fc5555555555555ll
), /* 3 */
3373 const_float64( 0x3fa5555555555555ll
), /* 4 */
3374 const_float64( 0x3f81111111111111ll
), /* 5 */
3375 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
3376 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
3377 const_float64( 0x3efa01a01a01a01all
), /* 8 */
3378 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
3379 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
3380 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
3381 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
3382 const_float64( 0x3de6124613a86d09ll
), /* 13 */
3383 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
3384 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
3387 float32
float32_exp2(float32 a
, float_status
*status
)
3394 a
= float32_squash_input_denormal(a
, status
);
3396 aSig
= extractFloat32Frac( a
);
3397 aExp
= extractFloat32Exp( a
);
3398 aSign
= extractFloat32Sign( a
);
3400 if ( aExp
== 0xFF) {
3402 return propagateFloat32NaN(a
, float32_zero
, status
);
3404 return (aSign
) ? float32_zero
: a
;
3407 if (aSig
== 0) return float32_one
;
3410 float_raise(float_flag_inexact
, status
);
3412 /* ******************************* */
3413 /* using float64 for approximation */
3414 /* ******************************* */
3415 x
= float32_to_float64(a
, status
);
3416 x
= float64_mul(x
, float64_ln2
, status
);
3420 for (i
= 0 ; i
< 15 ; i
++) {
3423 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
3424 r
= float64_add(r
, f
, status
);
3426 xn
= float64_mul(xn
, x
, status
);
3429 return float64_to_float32(r
, status
);
3432 /*----------------------------------------------------------------------------
3433 | Returns the binary log of the single-precision floating-point value `a'.
3434 | The operation is performed according to the IEC/IEEE Standard for Binary
3435 | Floating-Point Arithmetic.
3436 *----------------------------------------------------------------------------*/
3437 float32
float32_log2(float32 a
, float_status
*status
)
3441 uint32_t aSig
, zSig
, i
;
3443 a
= float32_squash_input_denormal(a
, status
);
3444 aSig
= extractFloat32Frac( a
);
3445 aExp
= extractFloat32Exp( a
);
3446 aSign
= extractFloat32Sign( a
);
3449 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
3450 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3453 float_raise(float_flag_invalid
, status
);
3454 return float32_default_nan(status
);
3456 if ( aExp
== 0xFF ) {
3458 return propagateFloat32NaN(a
, float32_zero
, status
);
3468 for (i
= 1 << 22; i
> 0; i
>>= 1) {
3469 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
3470 if ( aSig
& 0x01000000 ) {
3479 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
3482 /*----------------------------------------------------------------------------
3483 | Returns 1 if the single-precision floating-point value `a' is equal to
3484 | the corresponding value `b', and 0 otherwise. The invalid exception is
3485 | raised if either operand is a NaN. Otherwise, the comparison is performed
3486 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3487 *----------------------------------------------------------------------------*/
3489 int float32_eq(float32 a
, float32 b
, float_status
*status
)
3492 a
= float32_squash_input_denormal(a
, status
);
3493 b
= float32_squash_input_denormal(b
, status
);
3495 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3496 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3498 float_raise(float_flag_invalid
, status
);
3501 av
= float32_val(a
);
3502 bv
= float32_val(b
);
3503 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3506 /*----------------------------------------------------------------------------
3507 | Returns 1 if the single-precision floating-point value `a' is less than
3508 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3509 | exception is raised if either operand is a NaN. The comparison is performed
3510 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3511 *----------------------------------------------------------------------------*/
3513 int float32_le(float32 a
, float32 b
, float_status
*status
)
3517 a
= float32_squash_input_denormal(a
, status
);
3518 b
= float32_squash_input_denormal(b
, status
);
3520 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3521 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3523 float_raise(float_flag_invalid
, status
);
3526 aSign
= extractFloat32Sign( a
);
3527 bSign
= extractFloat32Sign( b
);
3528 av
= float32_val(a
);
3529 bv
= float32_val(b
);
3530 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3531 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3535 /*----------------------------------------------------------------------------
3536 | Returns 1 if the single-precision floating-point value `a' is less than
3537 | the corresponding value `b', and 0 otherwise. The invalid exception is
3538 | raised if either operand is a NaN. The comparison is performed according
3539 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3540 *----------------------------------------------------------------------------*/
3542 int float32_lt(float32 a
, float32 b
, float_status
*status
)
3546 a
= float32_squash_input_denormal(a
, status
);
3547 b
= float32_squash_input_denormal(b
, status
);
3549 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3550 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3552 float_raise(float_flag_invalid
, status
);
3555 aSign
= extractFloat32Sign( a
);
3556 bSign
= extractFloat32Sign( b
);
3557 av
= float32_val(a
);
3558 bv
= float32_val(b
);
3559 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
3560 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3564 /*----------------------------------------------------------------------------
3565 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3566 | be compared, and 0 otherwise. The invalid exception is raised if either
3567 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3568 | Standard for Binary Floating-Point Arithmetic.
3569 *----------------------------------------------------------------------------*/
3571 int float32_unordered(float32 a
, float32 b
, float_status
*status
)
3573 a
= float32_squash_input_denormal(a
, status
);
3574 b
= float32_squash_input_denormal(b
, status
);
3576 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3577 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3579 float_raise(float_flag_invalid
, status
);
3585 /*----------------------------------------------------------------------------
3586 | Returns 1 if the single-precision floating-point value `a' is equal to
3587 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3588 | exception. The comparison is performed according to the IEC/IEEE Standard
3589 | for Binary Floating-Point Arithmetic.
3590 *----------------------------------------------------------------------------*/
3592 int float32_eq_quiet(float32 a
, float32 b
, float_status
*status
)
3594 a
= float32_squash_input_denormal(a
, status
);
3595 b
= float32_squash_input_denormal(b
, status
);
3597 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3598 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3600 if (float32_is_signaling_nan(a
, status
)
3601 || float32_is_signaling_nan(b
, status
)) {
3602 float_raise(float_flag_invalid
, status
);
3606 return ( float32_val(a
) == float32_val(b
) ) ||
3607 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
3610 /*----------------------------------------------------------------------------
3611 | Returns 1 if the single-precision floating-point value `a' is less than or
3612 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3613 | cause an exception. Otherwise, the comparison is performed according to the
3614 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3615 *----------------------------------------------------------------------------*/
3617 int float32_le_quiet(float32 a
, float32 b
, float_status
*status
)
3621 a
= float32_squash_input_denormal(a
, status
);
3622 b
= float32_squash_input_denormal(b
, status
);
3624 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3625 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3627 if (float32_is_signaling_nan(a
, status
)
3628 || float32_is_signaling_nan(b
, status
)) {
3629 float_raise(float_flag_invalid
, status
);
3633 aSign
= extractFloat32Sign( a
);
3634 bSign
= extractFloat32Sign( b
);
3635 av
= float32_val(a
);
3636 bv
= float32_val(b
);
3637 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3638 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3642 /*----------------------------------------------------------------------------
3643 | Returns 1 if the single-precision floating-point value `a' is less than
3644 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3645 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3646 | Standard for Binary Floating-Point Arithmetic.
3647 *----------------------------------------------------------------------------*/
3649 int float32_lt_quiet(float32 a
, float32 b
, float_status
*status
)
3653 a
= float32_squash_input_denormal(a
, status
);
3654 b
= float32_squash_input_denormal(b
, status
);
3656 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3657 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3659 if (float32_is_signaling_nan(a
, status
)
3660 || float32_is_signaling_nan(b
, status
)) {
3661 float_raise(float_flag_invalid
, status
);
3665 aSign
= extractFloat32Sign( a
);
3666 bSign
= extractFloat32Sign( b
);
3667 av
= float32_val(a
);
3668 bv
= float32_val(b
);
3669 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
3670 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3674 /*----------------------------------------------------------------------------
3675 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3676 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3677 | comparison is performed according to the IEC/IEEE Standard for Binary
3678 | Floating-Point Arithmetic.
3679 *----------------------------------------------------------------------------*/
3681 int float32_unordered_quiet(float32 a
, float32 b
, float_status
*status
)
3683 a
= float32_squash_input_denormal(a
, status
);
3684 b
= float32_squash_input_denormal(b
, status
);
3686 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3687 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3689 if (float32_is_signaling_nan(a
, status
)
3690 || float32_is_signaling_nan(b
, status
)) {
3691 float_raise(float_flag_invalid
, status
);
3699 /*----------------------------------------------------------------------------
3700 | Returns the result of converting the double-precision floating-point value
3701 | `a' to the single-precision floating-point format. The conversion is
3702 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3704 *----------------------------------------------------------------------------*/
3706 float32
float64_to_float32(float64 a
, float_status
*status
)
3712 a
= float64_squash_input_denormal(a
, status
);
3714 aSig
= extractFloat64Frac( a
);
3715 aExp
= extractFloat64Exp( a
);
3716 aSign
= extractFloat64Sign( a
);
3717 if ( aExp
== 0x7FF ) {
3719 return commonNaNToFloat32(float64ToCommonNaN(a
, status
), status
);
3721 return packFloat32( aSign
, 0xFF, 0 );
3723 shift64RightJamming( aSig
, 22, &aSig
);
3725 if ( aExp
|| zSig
) {
3729 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
3734 /*----------------------------------------------------------------------------
3735 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3736 | half-precision floating-point value, returning the result. After being
3737 | shifted into the proper positions, the three fields are simply added
3738 | together to form the result. This means that any integer portion of `zSig'
3739 | will be added into the exponent. Since a properly normalized significand
3740 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3741 | than the desired result exponent whenever `zSig' is a complete, normalized
3743 *----------------------------------------------------------------------------*/
3744 static float16
packFloat16(flag zSign
, int zExp
, uint16_t zSig
)
3746 return make_float16(
3747 (((uint32_t)zSign
) << 15) + (((uint32_t)zExp
) << 10) + zSig
);
3750 /*----------------------------------------------------------------------------
3751 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3752 | and significand `zSig', and returns the proper half-precision floating-
3753 | point value corresponding to the abstract input. Ordinarily, the abstract
3754 | value is simply rounded and packed into the half-precision format, with
3755 | the inexact exception raised if the abstract input cannot be represented
3756 | exactly. However, if the abstract value is too large, the overflow and
3757 | inexact exceptions are raised and an infinity or maximal finite value is
3758 | returned. If the abstract value is too small, the input value is rounded to
3759 | a subnormal number, and the underflow and inexact exceptions are raised if
3760 | the abstract input cannot be represented exactly as a subnormal half-
3761 | precision floating-point number.
3762 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3763 | ARM-style "alternative representation", which omits the NaN and Inf
3764 | encodings in order to raise the maximum representable exponent by one.
3765 | The input significand `zSig' has its binary point between bits 22
3766 | and 23, which is 13 bits to the left of the usual location. This shifted
3767 | significand must be normalized or smaller. If `zSig' is not normalized,
3768 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3769 | and it must not require rounding. In the usual case that `zSig' is
3770 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3771 | Note the slightly odd position of the binary point in zSig compared with the
3772 | other roundAndPackFloat functions. This should probably be fixed if we
3773 | need to implement more float16 routines than just conversion.
3774 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3775 | Binary Floating-Point Arithmetic.
3776 *----------------------------------------------------------------------------*/
3778 static float16
roundAndPackFloat16(flag zSign
, int zExp
,
3779 uint32_t zSig
, flag ieee
,
3780 float_status
*status
)
3782 int maxexp
= ieee
? 29 : 30;
3785 bool rounding_bumps_exp
;
3786 bool is_tiny
= false;
3788 /* Calculate the mask of bits of the mantissa which are not
3789 * representable in half-precision and will be lost.
3792 /* Will be denormal in halfprec */
3798 /* Normal number in halfprec */
3802 switch (status
->float_rounding_mode
) {
3803 case float_round_nearest_even
:
3804 increment
= (mask
+ 1) >> 1;
3805 if ((zSig
& mask
) == increment
) {
3806 increment
= zSig
& (increment
<< 1);
3809 case float_round_ties_away
:
3810 increment
= (mask
+ 1) >> 1;
3812 case float_round_up
:
3813 increment
= zSign
? 0 : mask
;
3815 case float_round_down
:
3816 increment
= zSign
? mask
: 0;
3818 default: /* round_to_zero */
3823 rounding_bumps_exp
= (zSig
+ increment
>= 0x01000000);
3825 if (zExp
> maxexp
|| (zExp
== maxexp
&& rounding_bumps_exp
)) {
3827 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3828 return packFloat16(zSign
, 0x1f, 0);
3830 float_raise(float_flag_invalid
, status
);
3831 return packFloat16(zSign
, 0x1f, 0x3ff);
3836 /* Note that flush-to-zero does not affect half-precision results */
3838 (status
->float_detect_tininess
== float_tininess_before_rounding
)
3840 || (!rounding_bumps_exp
);
3843 float_raise(float_flag_inexact
, status
);
3845 float_raise(float_flag_underflow
, status
);
3850 if (rounding_bumps_exp
) {
3856 return packFloat16(zSign
, 0, 0);
3862 return packFloat16(zSign
, zExp
, zSig
>> 13);
3865 /*----------------------------------------------------------------------------
3866 | If `a' is denormal and we are in flush-to-zero mode then set the
3867 | input-denormal exception and return zero. Otherwise just return the value.
3868 *----------------------------------------------------------------------------*/
3869 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3871 if (status
->flush_inputs_to_zero
) {
3872 if (extractFloat16Exp(a
) == 0 && extractFloat16Frac(a
) != 0) {
3873 float_raise(float_flag_input_denormal
, status
);
3874 return make_float16(float16_val(a
) & 0x8000);
3880 static void normalizeFloat16Subnormal(uint32_t aSig
, int *zExpPtr
,
3883 int8_t shiftCount
= countLeadingZeros32(aSig
) - 21;
3884 *zSigPtr
= aSig
<< shiftCount
;
3885 *zExpPtr
= 1 - shiftCount
;
3888 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3889 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3891 float32
float16_to_float32(float16 a
, flag ieee
, float_status
*status
)
3897 aSign
= extractFloat16Sign(a
);
3898 aExp
= extractFloat16Exp(a
);
3899 aSig
= extractFloat16Frac(a
);
3901 if (aExp
== 0x1f && ieee
) {
3903 return commonNaNToFloat32(float16ToCommonNaN(a
, status
), status
);
3905 return packFloat32(aSign
, 0xff, 0);
3909 return packFloat32(aSign
, 0, 0);
3912 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3915 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
3918 float16
float32_to_float16(float32 a
, flag ieee
, float_status
*status
)
3924 a
= float32_squash_input_denormal(a
, status
);
3926 aSig
= extractFloat32Frac( a
);
3927 aExp
= extractFloat32Exp( a
);
3928 aSign
= extractFloat32Sign( a
);
3929 if ( aExp
== 0xFF ) {
3931 /* Input is a NaN */
3933 float_raise(float_flag_invalid
, status
);
3934 return packFloat16(aSign
, 0, 0);
3936 return commonNaNToFloat16(
3937 float32ToCommonNaN(a
, status
), status
);
3941 float_raise(float_flag_invalid
, status
);
3942 return packFloat16(aSign
, 0x1f, 0x3ff);
3944 return packFloat16(aSign
, 0x1f, 0);
3946 if (aExp
== 0 && aSig
== 0) {
3947 return packFloat16(aSign
, 0, 0);
3949 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3950 * even if the input is denormal; however this is harmless because
3951 * the largest possible single-precision denormal is still smaller
3952 * than the smallest representable half-precision denormal, and so we
3953 * will end up ignoring aSig and returning via the "always return zero"
3959 return roundAndPackFloat16(aSign
, aExp
, aSig
, ieee
, status
);
3962 float64
float16_to_float64(float16 a
, flag ieee
, float_status
*status
)
3968 aSign
= extractFloat16Sign(a
);
3969 aExp
= extractFloat16Exp(a
);
3970 aSig
= extractFloat16Frac(a
);
3972 if (aExp
== 0x1f && ieee
) {
3974 return commonNaNToFloat64(
3975 float16ToCommonNaN(a
, status
), status
);
3977 return packFloat64(aSign
, 0x7ff, 0);
3981 return packFloat64(aSign
, 0, 0);
3984 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3987 return packFloat64(aSign
, aExp
+ 0x3f0, ((uint64_t)aSig
) << 42);
3990 float16
float64_to_float16(float64 a
, flag ieee
, float_status
*status
)
3997 a
= float64_squash_input_denormal(a
, status
);
3999 aSig
= extractFloat64Frac(a
);
4000 aExp
= extractFloat64Exp(a
);
4001 aSign
= extractFloat64Sign(a
);
4002 if (aExp
== 0x7FF) {
4004 /* Input is a NaN */
4006 float_raise(float_flag_invalid
, status
);
4007 return packFloat16(aSign
, 0, 0);
4009 return commonNaNToFloat16(
4010 float64ToCommonNaN(a
, status
), status
);
4014 float_raise(float_flag_invalid
, status
);
4015 return packFloat16(aSign
, 0x1f, 0x3ff);
4017 return packFloat16(aSign
, 0x1f, 0);
4019 shift64RightJamming(aSig
, 29, &aSig
);
4021 if (aExp
== 0 && zSig
== 0) {
4022 return packFloat16(aSign
, 0, 0);
4024 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4025 * even if the input is denormal; however this is harmless because
4026 * the largest possible single-precision denormal is still smaller
4027 * than the smallest representable half-precision denormal, and so we
4028 * will end up ignoring aSig and returning via the "always return zero"
4034 return roundAndPackFloat16(aSign
, aExp
, zSig
, ieee
, status
);
4037 /*----------------------------------------------------------------------------
4038 | Returns the result of converting the double-precision floating-point value
4039 | `a' to the extended double-precision floating-point format. The conversion
4040 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4042 *----------------------------------------------------------------------------*/
4044 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
4050 a
= float64_squash_input_denormal(a
, status
);
4051 aSig
= extractFloat64Frac( a
);
4052 aExp
= extractFloat64Exp( a
);
4053 aSign
= extractFloat64Sign( a
);
4054 if ( aExp
== 0x7FF ) {
4056 return commonNaNToFloatx80(float64ToCommonNaN(a
, status
), status
);
4058 return packFloatx80(aSign
,
4059 floatx80_infinity_high
,
4060 floatx80_infinity_low
);
4063 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4064 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4068 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
4072 /*----------------------------------------------------------------------------
4073 | Returns the result of converting the double-precision floating-point value
4074 | `a' to the quadruple-precision floating-point format. The conversion is
4075 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4077 *----------------------------------------------------------------------------*/
4079 float128
float64_to_float128(float64 a
, float_status
*status
)
4083 uint64_t aSig
, zSig0
, zSig1
;
4085 a
= float64_squash_input_denormal(a
, status
);
4086 aSig
= extractFloat64Frac( a
);
4087 aExp
= extractFloat64Exp( a
);
4088 aSign
= extractFloat64Sign( a
);
4089 if ( aExp
== 0x7FF ) {
4091 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
4093 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4096 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4097 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4100 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
4101 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
4106 /*----------------------------------------------------------------------------
4107 | Returns the remainder of the double-precision floating-point value `a'
4108 | with respect to the corresponding value `b'. The operation is performed
4109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4110 *----------------------------------------------------------------------------*/
4112 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
4115 int aExp
, bExp
, expDiff
;
4116 uint64_t aSig
, bSig
;
4117 uint64_t q
, alternateASig
;
4120 a
= float64_squash_input_denormal(a
, status
);
4121 b
= float64_squash_input_denormal(b
, status
);
4122 aSig
= extractFloat64Frac( a
);
4123 aExp
= extractFloat64Exp( a
);
4124 aSign
= extractFloat64Sign( a
);
4125 bSig
= extractFloat64Frac( b
);
4126 bExp
= extractFloat64Exp( b
);
4127 if ( aExp
== 0x7FF ) {
4128 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4129 return propagateFloat64NaN(a
, b
, status
);
4131 float_raise(float_flag_invalid
, status
);
4132 return float64_default_nan(status
);
4134 if ( bExp
== 0x7FF ) {
4136 return propagateFloat64NaN(a
, b
, status
);
4142 float_raise(float_flag_invalid
, status
);
4143 return float64_default_nan(status
);
4145 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4148 if ( aSig
== 0 ) return a
;
4149 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4151 expDiff
= aExp
- bExp
;
4152 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
4153 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4154 if ( expDiff
< 0 ) {
4155 if ( expDiff
< -1 ) return a
;
4158 q
= ( bSig
<= aSig
);
4159 if ( q
) aSig
-= bSig
;
4161 while ( 0 < expDiff
) {
4162 q
= estimateDiv128To64( aSig
, 0, bSig
);
4163 q
= ( 2 < q
) ? q
- 2 : 0;
4164 aSig
= - ( ( bSig
>>2 ) * q
);
4168 if ( 0 < expDiff
) {
4169 q
= estimateDiv128To64( aSig
, 0, bSig
);
4170 q
= ( 2 < q
) ? q
- 2 : 0;
4173 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4180 alternateASig
= aSig
;
4183 } while ( 0 <= (int64_t) aSig
);
4184 sigMean
= aSig
+ alternateASig
;
4185 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4186 aSig
= alternateASig
;
4188 zSign
= ( (int64_t) aSig
< 0 );
4189 if ( zSign
) aSig
= - aSig
;
4190 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
4194 /*----------------------------------------------------------------------------
4195 | Returns the binary log of the double-precision floating-point value `a'.
4196 | The operation is performed according to the IEC/IEEE Standard for Binary
4197 | Floating-Point Arithmetic.
4198 *----------------------------------------------------------------------------*/
4199 float64
float64_log2(float64 a
, float_status
*status
)
4203 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
4204 a
= float64_squash_input_denormal(a
, status
);
4206 aSig
= extractFloat64Frac( a
);
4207 aExp
= extractFloat64Exp( a
);
4208 aSign
= extractFloat64Sign( a
);
4211 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
4212 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4215 float_raise(float_flag_invalid
, status
);
4216 return float64_default_nan(status
);
4218 if ( aExp
== 0x7FF ) {
4220 return propagateFloat64NaN(a
, float64_zero
, status
);
4226 aSig
|= LIT64( 0x0010000000000000 );
4228 zSig
= (uint64_t)aExp
<< 52;
4229 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
4230 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
4231 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
4232 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
4240 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
4243 /*----------------------------------------------------------------------------
4244 | Returns 1 if the double-precision floating-point value `a' is equal to the
4245 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4246 | if either operand is a NaN. Otherwise, the comparison is performed
4247 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4248 *----------------------------------------------------------------------------*/
4250 int float64_eq(float64 a
, float64 b
, float_status
*status
)
4253 a
= float64_squash_input_denormal(a
, status
);
4254 b
= float64_squash_input_denormal(b
, status
);
4256 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4257 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4259 float_raise(float_flag_invalid
, status
);
4262 av
= float64_val(a
);
4263 bv
= float64_val(b
);
4264 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4268 /*----------------------------------------------------------------------------
4269 | Returns 1 if the double-precision floating-point value `a' is less than or
4270 | equal to the corresponding value `b', and 0 otherwise. The invalid
4271 | exception is raised if either operand is a NaN. The comparison is performed
4272 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4273 *----------------------------------------------------------------------------*/
4275 int float64_le(float64 a
, float64 b
, float_status
*status
)
4279 a
= float64_squash_input_denormal(a
, status
);
4280 b
= float64_squash_input_denormal(b
, status
);
4282 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4283 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4285 float_raise(float_flag_invalid
, status
);
4288 aSign
= extractFloat64Sign( a
);
4289 bSign
= extractFloat64Sign( b
);
4290 av
= float64_val(a
);
4291 bv
= float64_val(b
);
4292 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4293 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4297 /*----------------------------------------------------------------------------
4298 | Returns 1 if the double-precision floating-point value `a' is less than
4299 | the corresponding value `b', and 0 otherwise. The invalid exception is
4300 | raised if either operand is a NaN. The comparison is performed according
4301 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4302 *----------------------------------------------------------------------------*/
4304 int float64_lt(float64 a
, float64 b
, float_status
*status
)
4309 a
= float64_squash_input_denormal(a
, status
);
4310 b
= float64_squash_input_denormal(b
, status
);
4311 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4312 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4314 float_raise(float_flag_invalid
, status
);
4317 aSign
= extractFloat64Sign( a
);
4318 bSign
= extractFloat64Sign( b
);
4319 av
= float64_val(a
);
4320 bv
= float64_val(b
);
4321 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4322 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4326 /*----------------------------------------------------------------------------
4327 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4328 | be compared, and 0 otherwise. The invalid exception is raised if either
4329 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4330 | Standard for Binary Floating-Point Arithmetic.
4331 *----------------------------------------------------------------------------*/
4333 int float64_unordered(float64 a
, float64 b
, float_status
*status
)
4335 a
= float64_squash_input_denormal(a
, status
);
4336 b
= float64_squash_input_denormal(b
, status
);
4338 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4339 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4341 float_raise(float_flag_invalid
, status
);
4347 /*----------------------------------------------------------------------------
4348 | Returns 1 if the double-precision floating-point value `a' is equal to the
4349 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4350 | exception.The comparison is performed according to the IEC/IEEE Standard
4351 | for Binary Floating-Point Arithmetic.
4352 *----------------------------------------------------------------------------*/
4354 int float64_eq_quiet(float64 a
, float64 b
, float_status
*status
)
4357 a
= float64_squash_input_denormal(a
, status
);
4358 b
= float64_squash_input_denormal(b
, status
);
4360 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4361 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4363 if (float64_is_signaling_nan(a
, status
)
4364 || float64_is_signaling_nan(b
, status
)) {
4365 float_raise(float_flag_invalid
, status
);
4369 av
= float64_val(a
);
4370 bv
= float64_val(b
);
4371 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4375 /*----------------------------------------------------------------------------
4376 | Returns 1 if the double-precision floating-point value `a' is less than or
4377 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4378 | cause an exception. Otherwise, the comparison is performed according to the
4379 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4380 *----------------------------------------------------------------------------*/
4382 int float64_le_quiet(float64 a
, float64 b
, float_status
*status
)
4386 a
= float64_squash_input_denormal(a
, status
);
4387 b
= float64_squash_input_denormal(b
, status
);
4389 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4390 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4392 if (float64_is_signaling_nan(a
, status
)
4393 || float64_is_signaling_nan(b
, status
)) {
4394 float_raise(float_flag_invalid
, status
);
4398 aSign
= extractFloat64Sign( a
);
4399 bSign
= extractFloat64Sign( b
);
4400 av
= float64_val(a
);
4401 bv
= float64_val(b
);
4402 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4403 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4407 /*----------------------------------------------------------------------------
4408 | Returns 1 if the double-precision floating-point value `a' is less than
4409 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4410 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4411 | Standard for Binary Floating-Point Arithmetic.
4412 *----------------------------------------------------------------------------*/
4414 int float64_lt_quiet(float64 a
, float64 b
, float_status
*status
)
4418 a
= float64_squash_input_denormal(a
, status
);
4419 b
= float64_squash_input_denormal(b
, status
);
4421 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4422 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4424 if (float64_is_signaling_nan(a
, status
)
4425 || float64_is_signaling_nan(b
, status
)) {
4426 float_raise(float_flag_invalid
, status
);
4430 aSign
= extractFloat64Sign( a
);
4431 bSign
= extractFloat64Sign( b
);
4432 av
= float64_val(a
);
4433 bv
= float64_val(b
);
4434 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4435 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4439 /*----------------------------------------------------------------------------
4440 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4441 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4442 | comparison is performed according to the IEC/IEEE Standard for Binary
4443 | Floating-Point Arithmetic.
4444 *----------------------------------------------------------------------------*/
4446 int float64_unordered_quiet(float64 a
, float64 b
, float_status
*status
)
4448 a
= float64_squash_input_denormal(a
, status
);
4449 b
= float64_squash_input_denormal(b
, status
);
4451 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4452 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4454 if (float64_is_signaling_nan(a
, status
)
4455 || float64_is_signaling_nan(b
, status
)) {
4456 float_raise(float_flag_invalid
, status
);
4463 /*----------------------------------------------------------------------------
4464 | Returns the result of converting the extended double-precision floating-
4465 | point value `a' to the 32-bit two's complement integer format. The
4466 | conversion is performed according to the IEC/IEEE Standard for Binary
4467 | Floating-Point Arithmetic---which means in particular that the conversion
4468 | is rounded according to the current rounding mode. If `a' is a NaN, the
4469 | largest positive integer is returned. Otherwise, if the conversion
4470 | overflows, the largest integer with the same sign as `a' is returned.
4471 *----------------------------------------------------------------------------*/
4473 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
4476 int32_t aExp
, shiftCount
;
4479 if (floatx80_invalid_encoding(a
)) {
4480 float_raise(float_flag_invalid
, status
);
4483 aSig
= extractFloatx80Frac( a
);
4484 aExp
= extractFloatx80Exp( a
);
4485 aSign
= extractFloatx80Sign( a
);
4486 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4487 shiftCount
= 0x4037 - aExp
;
4488 if ( shiftCount
<= 0 ) shiftCount
= 1;
4489 shift64RightJamming( aSig
, shiftCount
, &aSig
);
4490 return roundAndPackInt32(aSign
, aSig
, status
);
4494 /*----------------------------------------------------------------------------
4495 | Returns the result of converting the extended double-precision floating-
4496 | point value `a' to the 32-bit two's complement integer format. The
4497 | conversion is performed according to the IEC/IEEE Standard for Binary
4498 | Floating-Point Arithmetic, except that the conversion is always rounded
4499 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4500 | Otherwise, if the conversion overflows, the largest integer with the same
4501 | sign as `a' is returned.
4502 *----------------------------------------------------------------------------*/
4504 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
4507 int32_t aExp
, shiftCount
;
4508 uint64_t aSig
, savedASig
;
4511 if (floatx80_invalid_encoding(a
)) {
4512 float_raise(float_flag_invalid
, status
);
4515 aSig
= extractFloatx80Frac( a
);
4516 aExp
= extractFloatx80Exp( a
);
4517 aSign
= extractFloatx80Sign( a
);
4518 if ( 0x401E < aExp
) {
4519 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4522 else if ( aExp
< 0x3FFF ) {
4524 status
->float_exception_flags
|= float_flag_inexact
;
4528 shiftCount
= 0x403E - aExp
;
4530 aSig
>>= shiftCount
;
4532 if ( aSign
) z
= - z
;
4533 if ( ( z
< 0 ) ^ aSign
) {
4535 float_raise(float_flag_invalid
, status
);
4536 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4538 if ( ( aSig
<<shiftCount
) != savedASig
) {
4539 status
->float_exception_flags
|= float_flag_inexact
;
4545 /*----------------------------------------------------------------------------
4546 | Returns the result of converting the extended double-precision floating-
4547 | point value `a' to the 64-bit two's complement integer format. The
4548 | conversion is performed according to the IEC/IEEE Standard for Binary
4549 | Floating-Point Arithmetic---which means in particular that the conversion
4550 | is rounded according to the current rounding mode. If `a' is a NaN,
4551 | the largest positive integer is returned. Otherwise, if the conversion
4552 | overflows, the largest integer with the same sign as `a' is returned.
4553 *----------------------------------------------------------------------------*/
4555 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
4558 int32_t aExp
, shiftCount
;
4559 uint64_t aSig
, aSigExtra
;
4561 if (floatx80_invalid_encoding(a
)) {
4562 float_raise(float_flag_invalid
, status
);
4565 aSig
= extractFloatx80Frac( a
);
4566 aExp
= extractFloatx80Exp( a
);
4567 aSign
= extractFloatx80Sign( a
);
4568 shiftCount
= 0x403E - aExp
;
4569 if ( shiftCount
<= 0 ) {
4571 float_raise(float_flag_invalid
, status
);
4572 if (!aSign
|| floatx80_is_any_nan(a
)) {
4573 return LIT64( 0x7FFFFFFFFFFFFFFF );
4575 return (int64_t) LIT64( 0x8000000000000000 );
4580 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
4582 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
4586 /*----------------------------------------------------------------------------
4587 | Returns the result of converting the extended double-precision floating-
4588 | point value `a' to the 64-bit two's complement integer format. The
4589 | conversion is performed according to the IEC/IEEE Standard for Binary
4590 | Floating-Point Arithmetic, except that the conversion is always rounded
4591 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4592 | Otherwise, if the conversion overflows, the largest integer with the same
4593 | sign as `a' is returned.
4594 *----------------------------------------------------------------------------*/
4596 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
4599 int32_t aExp
, shiftCount
;
4603 if (floatx80_invalid_encoding(a
)) {
4604 float_raise(float_flag_invalid
, status
);
4607 aSig
= extractFloatx80Frac( a
);
4608 aExp
= extractFloatx80Exp( a
);
4609 aSign
= extractFloatx80Sign( a
);
4610 shiftCount
= aExp
- 0x403E;
4611 if ( 0 <= shiftCount
) {
4612 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
4613 if ( ( a
.high
!= 0xC03E ) || aSig
) {
4614 float_raise(float_flag_invalid
, status
);
4615 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
4616 return LIT64( 0x7FFFFFFFFFFFFFFF );
4619 return (int64_t) LIT64( 0x8000000000000000 );
4621 else if ( aExp
< 0x3FFF ) {
4623 status
->float_exception_flags
|= float_flag_inexact
;
4627 z
= aSig
>>( - shiftCount
);
4628 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
4629 status
->float_exception_flags
|= float_flag_inexact
;
4631 if ( aSign
) z
= - z
;
4636 /*----------------------------------------------------------------------------
4637 | Returns the result of converting the extended double-precision floating-
4638 | point value `a' to the single-precision floating-point format. The
4639 | conversion is performed according to the IEC/IEEE Standard for Binary
4640 | Floating-Point Arithmetic.
4641 *----------------------------------------------------------------------------*/
4643 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
4649 if (floatx80_invalid_encoding(a
)) {
4650 float_raise(float_flag_invalid
, status
);
4651 return float32_default_nan(status
);
4653 aSig
= extractFloatx80Frac( a
);
4654 aExp
= extractFloatx80Exp( a
);
4655 aSign
= extractFloatx80Sign( a
);
4656 if ( aExp
== 0x7FFF ) {
4657 if ( (uint64_t) ( aSig
<<1 ) ) {
4658 return commonNaNToFloat32(floatx80ToCommonNaN(a
, status
), status
);
4660 return packFloat32( aSign
, 0xFF, 0 );
4662 shift64RightJamming( aSig
, 33, &aSig
);
4663 if ( aExp
|| aSig
) aExp
-= 0x3F81;
4664 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
4668 /*----------------------------------------------------------------------------
4669 | Returns the result of converting the extended double-precision floating-
4670 | point value `a' to the double-precision floating-point format. The
4671 | conversion is performed according to the IEC/IEEE Standard for Binary
4672 | Floating-Point Arithmetic.
4673 *----------------------------------------------------------------------------*/
4675 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
4679 uint64_t aSig
, zSig
;
4681 if (floatx80_invalid_encoding(a
)) {
4682 float_raise(float_flag_invalid
, status
);
4683 return float64_default_nan(status
);
4685 aSig
= extractFloatx80Frac( a
);
4686 aExp
= extractFloatx80Exp( a
);
4687 aSign
= extractFloatx80Sign( a
);
4688 if ( aExp
== 0x7FFF ) {
4689 if ( (uint64_t) ( aSig
<<1 ) ) {
4690 return commonNaNToFloat64(floatx80ToCommonNaN(a
, status
), status
);
4692 return packFloat64( aSign
, 0x7FF, 0 );
4694 shift64RightJamming( aSig
, 1, &zSig
);
4695 if ( aExp
|| aSig
) aExp
-= 0x3C01;
4696 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
4700 /*----------------------------------------------------------------------------
4701 | Returns the result of converting the extended double-precision floating-
4702 | point value `a' to the quadruple-precision floating-point format. The
4703 | conversion is performed according to the IEC/IEEE Standard for Binary
4704 | Floating-Point Arithmetic.
4705 *----------------------------------------------------------------------------*/
4707 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
4711 uint64_t aSig
, zSig0
, zSig1
;
4713 if (floatx80_invalid_encoding(a
)) {
4714 float_raise(float_flag_invalid
, status
);
4715 return float128_default_nan(status
);
4717 aSig
= extractFloatx80Frac( a
);
4718 aExp
= extractFloatx80Exp( a
);
4719 aSign
= extractFloatx80Sign( a
);
4720 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
4721 return commonNaNToFloat128(floatx80ToCommonNaN(a
, status
), status
);
4723 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
4724 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
4728 /*----------------------------------------------------------------------------
4729 | Rounds the extended double-precision floating-point value `a'
4730 | to the precision provided by floatx80_rounding_precision and returns the
4731 | result as an extended double-precision floating-point value.
4732 | The operation is performed according to the IEC/IEEE Standard for Binary
4733 | Floating-Point Arithmetic.
4734 *----------------------------------------------------------------------------*/
4736 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
4738 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
4739 extractFloatx80Sign(a
),
4740 extractFloatx80Exp(a
),
4741 extractFloatx80Frac(a
), 0, status
);
4744 /*----------------------------------------------------------------------------
4745 | Rounds the extended double-precision floating-point value `a' to an integer,
4746 | and returns the result as an extended quadruple-precision floating-point
4747 | value. The operation is performed according to the IEC/IEEE Standard for
4748 | Binary Floating-Point Arithmetic.
4749 *----------------------------------------------------------------------------*/
4751 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
4755 uint64_t lastBitMask
, roundBitsMask
;
4758 if (floatx80_invalid_encoding(a
)) {
4759 float_raise(float_flag_invalid
, status
);
4760 return floatx80_default_nan(status
);
4762 aExp
= extractFloatx80Exp( a
);
4763 if ( 0x403E <= aExp
) {
4764 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
4765 return propagateFloatx80NaN(a
, a
, status
);
4769 if ( aExp
< 0x3FFF ) {
4771 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
4774 status
->float_exception_flags
|= float_flag_inexact
;
4775 aSign
= extractFloatx80Sign( a
);
4776 switch (status
->float_rounding_mode
) {
4777 case float_round_nearest_even
:
4778 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
4781 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
4784 case float_round_ties_away
:
4785 if (aExp
== 0x3FFE) {
4786 return packFloatx80(aSign
, 0x3FFF, LIT64(0x8000000000000000));
4789 case float_round_down
:
4792 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4793 : packFloatx80( 0, 0, 0 );
4794 case float_round_up
:
4796 aSign
? packFloatx80( 1, 0, 0 )
4797 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4799 return packFloatx80( aSign
, 0, 0 );
4802 lastBitMask
<<= 0x403E - aExp
;
4803 roundBitsMask
= lastBitMask
- 1;
4805 switch (status
->float_rounding_mode
) {
4806 case float_round_nearest_even
:
4807 z
.low
+= lastBitMask
>>1;
4808 if ((z
.low
& roundBitsMask
) == 0) {
4809 z
.low
&= ~lastBitMask
;
4812 case float_round_ties_away
:
4813 z
.low
+= lastBitMask
>> 1;
4815 case float_round_to_zero
:
4817 case float_round_up
:
4818 if (!extractFloatx80Sign(z
)) {
4819 z
.low
+= roundBitsMask
;
4822 case float_round_down
:
4823 if (extractFloatx80Sign(z
)) {
4824 z
.low
+= roundBitsMask
;
4830 z
.low
&= ~ roundBitsMask
;
4833 z
.low
= LIT64( 0x8000000000000000 );
4835 if (z
.low
!= a
.low
) {
4836 status
->float_exception_flags
|= float_flag_inexact
;
4842 /*----------------------------------------------------------------------------
4843 | Returns the result of adding the absolute values of the extended double-
4844 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4845 | negated before being returned. `zSign' is ignored if the result is a NaN.
4846 | The addition is performed according to the IEC/IEEE Standard for Binary
4847 | Floating-Point Arithmetic.
4848 *----------------------------------------------------------------------------*/
4850 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
4851 float_status
*status
)
4853 int32_t aExp
, bExp
, zExp
;
4854 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4857 aSig
= extractFloatx80Frac( a
);
4858 aExp
= extractFloatx80Exp( a
);
4859 bSig
= extractFloatx80Frac( b
);
4860 bExp
= extractFloatx80Exp( b
);
4861 expDiff
= aExp
- bExp
;
4862 if ( 0 < expDiff
) {
4863 if ( aExp
== 0x7FFF ) {
4864 if ((uint64_t)(aSig
<< 1)) {
4865 return propagateFloatx80NaN(a
, b
, status
);
4869 if ( bExp
== 0 ) --expDiff
;
4870 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4873 else if ( expDiff
< 0 ) {
4874 if ( bExp
== 0x7FFF ) {
4875 if ((uint64_t)(bSig
<< 1)) {
4876 return propagateFloatx80NaN(a
, b
, status
);
4878 return packFloatx80(zSign
,
4879 floatx80_infinity_high
,
4880 floatx80_infinity_low
);
4882 if ( aExp
== 0 ) ++expDiff
;
4883 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4887 if ( aExp
== 0x7FFF ) {
4888 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4889 return propagateFloatx80NaN(a
, b
, status
);
4894 zSig0
= aSig
+ bSig
;
4896 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
4902 zSig0
= aSig
+ bSig
;
4903 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
4905 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4906 zSig0
|= LIT64( 0x8000000000000000 );
4909 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
4910 zSign
, zExp
, zSig0
, zSig1
, status
);
4913 /*----------------------------------------------------------------------------
4914 | Returns the result of subtracting the absolute values of the extended
4915 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4916 | difference is negated before being returned. `zSign' is ignored if the
4917 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4918 | Standard for Binary Floating-Point Arithmetic.
4919 *----------------------------------------------------------------------------*/
4921 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
4922 float_status
*status
)
4924 int32_t aExp
, bExp
, zExp
;
4925 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4928 aSig
= extractFloatx80Frac( a
);
4929 aExp
= extractFloatx80Exp( a
);
4930 bSig
= extractFloatx80Frac( b
);
4931 bExp
= extractFloatx80Exp( b
);
4932 expDiff
= aExp
- bExp
;
4933 if ( 0 < expDiff
) goto aExpBigger
;
4934 if ( expDiff
< 0 ) goto bExpBigger
;
4935 if ( aExp
== 0x7FFF ) {
4936 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4937 return propagateFloatx80NaN(a
, b
, status
);
4939 float_raise(float_flag_invalid
, status
);
4940 return floatx80_default_nan(status
);
4947 if ( bSig
< aSig
) goto aBigger
;
4948 if ( aSig
< bSig
) goto bBigger
;
4949 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
4951 if ( bExp
== 0x7FFF ) {
4952 if ((uint64_t)(bSig
<< 1)) {
4953 return propagateFloatx80NaN(a
, b
, status
);
4955 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
4956 floatx80_infinity_low
);
4958 if ( aExp
== 0 ) ++expDiff
;
4959 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4961 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
4964 goto normalizeRoundAndPack
;
4966 if ( aExp
== 0x7FFF ) {
4967 if ((uint64_t)(aSig
<< 1)) {
4968 return propagateFloatx80NaN(a
, b
, status
);
4972 if ( bExp
== 0 ) --expDiff
;
4973 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4975 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
4977 normalizeRoundAndPack
:
4978 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
4979 zSign
, zExp
, zSig0
, zSig1
, status
);
4982 /*----------------------------------------------------------------------------
4983 | Returns the result of adding the extended double-precision floating-point
4984 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4985 | Standard for Binary Floating-Point Arithmetic.
4986 *----------------------------------------------------------------------------*/
4988 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
4992 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
4993 float_raise(float_flag_invalid
, status
);
4994 return floatx80_default_nan(status
);
4996 aSign
= extractFloatx80Sign( a
);
4997 bSign
= extractFloatx80Sign( b
);
4998 if ( aSign
== bSign
) {
4999 return addFloatx80Sigs(a
, b
, aSign
, status
);
5002 return subFloatx80Sigs(a
, b
, aSign
, status
);
5007 /*----------------------------------------------------------------------------
5008 | Returns the result of subtracting the extended double-precision floating-
5009 | point values `a' and `b'. The operation is performed according to the
5010 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5011 *----------------------------------------------------------------------------*/
5013 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5017 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5018 float_raise(float_flag_invalid
, status
);
5019 return floatx80_default_nan(status
);
5021 aSign
= extractFloatx80Sign( a
);
5022 bSign
= extractFloatx80Sign( b
);
5023 if ( aSign
== bSign
) {
5024 return subFloatx80Sigs(a
, b
, aSign
, status
);
5027 return addFloatx80Sigs(a
, b
, aSign
, status
);
5032 /*----------------------------------------------------------------------------
5033 | Returns the result of multiplying the extended double-precision floating-
5034 | point values `a' and `b'. The operation is performed according to the
5035 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5036 *----------------------------------------------------------------------------*/
5038 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5040 flag aSign
, bSign
, zSign
;
5041 int32_t aExp
, bExp
, zExp
;
5042 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5044 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5045 float_raise(float_flag_invalid
, status
);
5046 return floatx80_default_nan(status
);
5048 aSig
= extractFloatx80Frac( a
);
5049 aExp
= extractFloatx80Exp( a
);
5050 aSign
= extractFloatx80Sign( a
);
5051 bSig
= extractFloatx80Frac( b
);
5052 bExp
= extractFloatx80Exp( b
);
5053 bSign
= extractFloatx80Sign( b
);
5054 zSign
= aSign
^ bSign
;
5055 if ( aExp
== 0x7FFF ) {
5056 if ( (uint64_t) ( aSig
<<1 )
5057 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5058 return propagateFloatx80NaN(a
, b
, status
);
5060 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5061 return packFloatx80(zSign
, floatx80_infinity_high
,
5062 floatx80_infinity_low
);
5064 if ( bExp
== 0x7FFF ) {
5065 if ((uint64_t)(bSig
<< 1)) {
5066 return propagateFloatx80NaN(a
, b
, status
);
5068 if ( ( aExp
| aSig
) == 0 ) {
5070 float_raise(float_flag_invalid
, status
);
5071 return floatx80_default_nan(status
);
5073 return packFloatx80(zSign
, floatx80_infinity_high
,
5074 floatx80_infinity_low
);
5077 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5078 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5081 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5082 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5084 zExp
= aExp
+ bExp
- 0x3FFE;
5085 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5086 if ( 0 < (int64_t) zSig0
) {
5087 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5090 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5091 zSign
, zExp
, zSig0
, zSig1
, status
);
5094 /*----------------------------------------------------------------------------
5095 | Returns the result of dividing the extended double-precision floating-point
5096 | value `a' by the corresponding value `b'. The operation is performed
5097 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5098 *----------------------------------------------------------------------------*/
5100 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
5102 flag aSign
, bSign
, zSign
;
5103 int32_t aExp
, bExp
, zExp
;
5104 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5105 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
5107 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5108 float_raise(float_flag_invalid
, status
);
5109 return floatx80_default_nan(status
);
5111 aSig
= extractFloatx80Frac( a
);
5112 aExp
= extractFloatx80Exp( a
);
5113 aSign
= extractFloatx80Sign( a
);
5114 bSig
= extractFloatx80Frac( b
);
5115 bExp
= extractFloatx80Exp( b
);
5116 bSign
= extractFloatx80Sign( b
);
5117 zSign
= aSign
^ bSign
;
5118 if ( aExp
== 0x7FFF ) {
5119 if ((uint64_t)(aSig
<< 1)) {
5120 return propagateFloatx80NaN(a
, b
, status
);
5122 if ( bExp
== 0x7FFF ) {
5123 if ((uint64_t)(bSig
<< 1)) {
5124 return propagateFloatx80NaN(a
, b
, status
);
5128 return packFloatx80(zSign
, floatx80_infinity_high
,
5129 floatx80_infinity_low
);
5131 if ( bExp
== 0x7FFF ) {
5132 if ((uint64_t)(bSig
<< 1)) {
5133 return propagateFloatx80NaN(a
, b
, status
);
5135 return packFloatx80( zSign
, 0, 0 );
5139 if ( ( aExp
| aSig
) == 0 ) {
5141 float_raise(float_flag_invalid
, status
);
5142 return floatx80_default_nan(status
);
5144 float_raise(float_flag_divbyzero
, status
);
5145 return packFloatx80(zSign
, floatx80_infinity_high
,
5146 floatx80_infinity_low
);
5148 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5151 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5152 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5154 zExp
= aExp
- bExp
+ 0x3FFE;
5156 if ( bSig
<= aSig
) {
5157 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
5160 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
5161 mul64To128( bSig
, zSig0
, &term0
, &term1
);
5162 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
5163 while ( (int64_t) rem0
< 0 ) {
5165 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
5167 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
5168 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
5169 mul64To128( bSig
, zSig1
, &term1
, &term2
);
5170 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5171 while ( (int64_t) rem1
< 0 ) {
5173 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
5175 zSig1
|= ( ( rem1
| rem2
) != 0 );
5177 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5178 zSign
, zExp
, zSig0
, zSig1
, status
);
5181 /*----------------------------------------------------------------------------
5182 | Returns the remainder of the extended double-precision floating-point value
5183 | `a' with respect to the corresponding value `b'. The operation is performed
5184 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5185 *----------------------------------------------------------------------------*/
5187 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
5190 int32_t aExp
, bExp
, expDiff
;
5191 uint64_t aSig0
, aSig1
, bSig
;
5192 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
5194 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5195 float_raise(float_flag_invalid
, status
);
5196 return floatx80_default_nan(status
);
5198 aSig0
= extractFloatx80Frac( a
);
5199 aExp
= extractFloatx80Exp( a
);
5200 aSign
= extractFloatx80Sign( a
);
5201 bSig
= extractFloatx80Frac( b
);
5202 bExp
= extractFloatx80Exp( b
);
5203 if ( aExp
== 0x7FFF ) {
5204 if ( (uint64_t) ( aSig0
<<1 )
5205 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5206 return propagateFloatx80NaN(a
, b
, status
);
5210 if ( bExp
== 0x7FFF ) {
5211 if ((uint64_t)(bSig
<< 1)) {
5212 return propagateFloatx80NaN(a
, b
, status
);
5219 float_raise(float_flag_invalid
, status
);
5220 return floatx80_default_nan(status
);
5222 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5225 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
5226 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5228 bSig
|= LIT64( 0x8000000000000000 );
5230 expDiff
= aExp
- bExp
;
5232 if ( expDiff
< 0 ) {
5233 if ( expDiff
< -1 ) return a
;
5234 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
5237 q
= ( bSig
<= aSig0
);
5238 if ( q
) aSig0
-= bSig
;
5240 while ( 0 < expDiff
) {
5241 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5242 q
= ( 2 < q
) ? q
- 2 : 0;
5243 mul64To128( bSig
, q
, &term0
, &term1
);
5244 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5245 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
5249 if ( 0 < expDiff
) {
5250 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5251 q
= ( 2 < q
) ? q
- 2 : 0;
5253 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
5254 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5255 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
5256 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
5258 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5265 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
5266 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5267 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5270 aSig0
= alternateASig0
;
5271 aSig1
= alternateASig1
;
5275 normalizeRoundAndPackFloatx80(
5276 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
5280 /*----------------------------------------------------------------------------
5281 | Returns the square root of the extended double-precision floating-point
5282 | value `a'. The operation is performed according to the IEC/IEEE Standard
5283 | for Binary Floating-Point Arithmetic.
5284 *----------------------------------------------------------------------------*/
5286 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
5290 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
5291 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5293 if (floatx80_invalid_encoding(a
)) {
5294 float_raise(float_flag_invalid
, status
);
5295 return floatx80_default_nan(status
);
5297 aSig0
= extractFloatx80Frac( a
);
5298 aExp
= extractFloatx80Exp( a
);
5299 aSign
= extractFloatx80Sign( a
);
5300 if ( aExp
== 0x7FFF ) {
5301 if ((uint64_t)(aSig0
<< 1)) {
5302 return propagateFloatx80NaN(a
, a
, status
);
5304 if ( ! aSign
) return a
;
5308 if ( ( aExp
| aSig0
) == 0 ) return a
;
5310 float_raise(float_flag_invalid
, status
);
5311 return floatx80_default_nan(status
);
5314 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
5315 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5317 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
5318 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
5319 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
5320 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5321 doubleZSig0
= zSig0
<<1;
5322 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5323 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5324 while ( (int64_t) rem0
< 0 ) {
5327 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5329 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5330 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5331 if ( zSig1
== 0 ) zSig1
= 1;
5332 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5333 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5334 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5335 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5336 while ( (int64_t) rem1
< 0 ) {
5338 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5340 term2
|= doubleZSig0
;
5341 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5343 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5345 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
5346 zSig0
|= doubleZSig0
;
5347 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5348 0, zExp
, zSig0
, zSig1
, status
);
5351 /*----------------------------------------------------------------------------
5352 | Returns 1 if the extended double-precision floating-point value `a' is equal
5353 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5354 | raised if either operand is a NaN. Otherwise, the comparison is performed
5355 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5356 *----------------------------------------------------------------------------*/
5358 int floatx80_eq(floatx80 a
, floatx80 b
, float_status
*status
)
5361 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5362 || (extractFloatx80Exp(a
) == 0x7FFF
5363 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5364 || (extractFloatx80Exp(b
) == 0x7FFF
5365 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5367 float_raise(float_flag_invalid
, status
);
5372 && ( ( a
.high
== b
.high
)
5374 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5379 /*----------------------------------------------------------------------------
5380 | Returns 1 if the extended double-precision floating-point value `a' is
5381 | less than or equal to the corresponding value `b', and 0 otherwise. The
5382 | invalid exception is raised if either operand is a NaN. The comparison is
5383 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5385 *----------------------------------------------------------------------------*/
5387 int floatx80_le(floatx80 a
, floatx80 b
, float_status
*status
)
5391 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5392 || (extractFloatx80Exp(a
) == 0x7FFF
5393 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5394 || (extractFloatx80Exp(b
) == 0x7FFF
5395 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5397 float_raise(float_flag_invalid
, status
);
5400 aSign
= extractFloatx80Sign( a
);
5401 bSign
= extractFloatx80Sign( b
);
5402 if ( aSign
!= bSign
) {
5405 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5409 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5410 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5414 /*----------------------------------------------------------------------------
5415 | Returns 1 if the extended double-precision floating-point value `a' is
5416 | less than the corresponding value `b', and 0 otherwise. The invalid
5417 | exception is raised if either operand is a NaN. The comparison is performed
5418 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5419 *----------------------------------------------------------------------------*/
5421 int floatx80_lt(floatx80 a
, floatx80 b
, float_status
*status
)
5425 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5426 || (extractFloatx80Exp(a
) == 0x7FFF
5427 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5428 || (extractFloatx80Exp(b
) == 0x7FFF
5429 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5431 float_raise(float_flag_invalid
, status
);
5434 aSign
= extractFloatx80Sign( a
);
5435 bSign
= extractFloatx80Sign( b
);
5436 if ( aSign
!= bSign
) {
5439 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5443 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5444 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5448 /*----------------------------------------------------------------------------
5449 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5450 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5451 | either operand is a NaN. The comparison is performed according to the
5452 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5453 *----------------------------------------------------------------------------*/
5454 int floatx80_unordered(floatx80 a
, floatx80 b
, float_status
*status
)
5456 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5457 || (extractFloatx80Exp(a
) == 0x7FFF
5458 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5459 || (extractFloatx80Exp(b
) == 0x7FFF
5460 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5462 float_raise(float_flag_invalid
, status
);
5468 /*----------------------------------------------------------------------------
5469 | Returns 1 if the extended double-precision floating-point value `a' is
5470 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5471 | cause an exception. The comparison is performed according to the IEC/IEEE
5472 | Standard for Binary Floating-Point Arithmetic.
5473 *----------------------------------------------------------------------------*/
5475 int floatx80_eq_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5478 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5479 float_raise(float_flag_invalid
, status
);
5482 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5483 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5484 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5485 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5487 if (floatx80_is_signaling_nan(a
, status
)
5488 || floatx80_is_signaling_nan(b
, status
)) {
5489 float_raise(float_flag_invalid
, status
);
5495 && ( ( a
.high
== b
.high
)
5497 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5502 /*----------------------------------------------------------------------------
5503 | Returns 1 if the extended double-precision floating-point value `a' is less
5504 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5505 | do not cause an exception. Otherwise, the comparison is performed according
5506 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5507 *----------------------------------------------------------------------------*/
5509 int floatx80_le_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5513 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5514 float_raise(float_flag_invalid
, status
);
5517 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5518 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5519 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5520 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5522 if (floatx80_is_signaling_nan(a
, status
)
5523 || floatx80_is_signaling_nan(b
, status
)) {
5524 float_raise(float_flag_invalid
, status
);
5528 aSign
= extractFloatx80Sign( a
);
5529 bSign
= extractFloatx80Sign( b
);
5530 if ( aSign
!= bSign
) {
5533 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5537 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5538 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5542 /*----------------------------------------------------------------------------
5543 | Returns 1 if the extended double-precision floating-point value `a' is less
5544 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5545 | an exception. Otherwise, the comparison is performed according to the
5546 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5547 *----------------------------------------------------------------------------*/
5549 int floatx80_lt_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5553 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5554 float_raise(float_flag_invalid
, status
);
5557 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5558 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5559 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5560 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5562 if (floatx80_is_signaling_nan(a
, status
)
5563 || floatx80_is_signaling_nan(b
, status
)) {
5564 float_raise(float_flag_invalid
, status
);
5568 aSign
= extractFloatx80Sign( a
);
5569 bSign
= extractFloatx80Sign( b
);
5570 if ( aSign
!= bSign
) {
5573 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5577 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5578 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5582 /*----------------------------------------------------------------------------
5583 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5584 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5585 | The comparison is performed according to the IEC/IEEE Standard for Binary
5586 | Floating-Point Arithmetic.
5587 *----------------------------------------------------------------------------*/
5588 int floatx80_unordered_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5590 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5591 float_raise(float_flag_invalid
, status
);
5594 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5595 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5596 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5597 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5599 if (floatx80_is_signaling_nan(a
, status
)
5600 || floatx80_is_signaling_nan(b
, status
)) {
5601 float_raise(float_flag_invalid
, status
);
5608 /*----------------------------------------------------------------------------
5609 | Returns the result of converting the quadruple-precision floating-point
5610 | value `a' to the 32-bit two's complement integer format. The conversion
5611 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5612 | Arithmetic---which means in particular that the conversion is rounded
5613 | according to the current rounding mode. If `a' is a NaN, the largest
5614 | positive integer is returned. Otherwise, if the conversion overflows, the
5615 | largest integer with the same sign as `a' is returned.
5616 *----------------------------------------------------------------------------*/
5618 int32_t float128_to_int32(float128 a
, float_status
*status
)
5621 int32_t aExp
, shiftCount
;
5622 uint64_t aSig0
, aSig1
;
5624 aSig1
= extractFloat128Frac1( a
);
5625 aSig0
= extractFloat128Frac0( a
);
5626 aExp
= extractFloat128Exp( a
);
5627 aSign
= extractFloat128Sign( a
);
5628 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
5629 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5630 aSig0
|= ( aSig1
!= 0 );
5631 shiftCount
= 0x4028 - aExp
;
5632 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
5633 return roundAndPackInt32(aSign
, aSig0
, status
);
5637 /*----------------------------------------------------------------------------
5638 | Returns the result of converting the quadruple-precision floating-point
5639 | value `a' to the 32-bit two's complement integer format. The conversion
5640 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5641 | Arithmetic, except that the conversion is always rounded toward zero. If
5642 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5643 | conversion overflows, the largest integer with the same sign as `a' is
5645 *----------------------------------------------------------------------------*/
5647 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
5650 int32_t aExp
, shiftCount
;
5651 uint64_t aSig0
, aSig1
, savedASig
;
5654 aSig1
= extractFloat128Frac1( a
);
5655 aSig0
= extractFloat128Frac0( a
);
5656 aExp
= extractFloat128Exp( a
);
5657 aSign
= extractFloat128Sign( a
);
5658 aSig0
|= ( aSig1
!= 0 );
5659 if ( 0x401E < aExp
) {
5660 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
5663 else if ( aExp
< 0x3FFF ) {
5664 if (aExp
|| aSig0
) {
5665 status
->float_exception_flags
|= float_flag_inexact
;
5669 aSig0
|= LIT64( 0x0001000000000000 );
5670 shiftCount
= 0x402F - aExp
;
5672 aSig0
>>= shiftCount
;
5674 if ( aSign
) z
= - z
;
5675 if ( ( z
< 0 ) ^ aSign
) {
5677 float_raise(float_flag_invalid
, status
);
5678 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5680 if ( ( aSig0
<<shiftCount
) != savedASig
) {
5681 status
->float_exception_flags
|= float_flag_inexact
;
5687 /*----------------------------------------------------------------------------
5688 | Returns the result of converting the quadruple-precision floating-point
5689 | value `a' to the 64-bit two's complement integer format. The conversion
5690 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5691 | Arithmetic---which means in particular that the conversion is rounded
5692 | according to the current rounding mode. If `a' is a NaN, the largest
5693 | positive integer is returned. Otherwise, if the conversion overflows, the
5694 | largest integer with the same sign as `a' is returned.
5695 *----------------------------------------------------------------------------*/
5697 int64_t float128_to_int64(float128 a
, float_status
*status
)
5700 int32_t aExp
, shiftCount
;
5701 uint64_t aSig0
, aSig1
;
5703 aSig1
= extractFloat128Frac1( a
);
5704 aSig0
= extractFloat128Frac0( a
);
5705 aExp
= extractFloat128Exp( a
);
5706 aSign
= extractFloat128Sign( a
);
5707 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5708 shiftCount
= 0x402F - aExp
;
5709 if ( shiftCount
<= 0 ) {
5710 if ( 0x403E < aExp
) {
5711 float_raise(float_flag_invalid
, status
);
5713 || ( ( aExp
== 0x7FFF )
5714 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
5717 return LIT64( 0x7FFFFFFFFFFFFFFF );
5719 return (int64_t) LIT64( 0x8000000000000000 );
5721 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
5724 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5726 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
5730 /*----------------------------------------------------------------------------
5731 | Returns the result of converting the quadruple-precision floating-point
5732 | value `a' to the 64-bit two's complement integer format. The conversion
5733 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5734 | Arithmetic, except that the conversion is always rounded toward zero.
5735 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5736 | the conversion overflows, the largest integer with the same sign as `a' is
5738 *----------------------------------------------------------------------------*/
5740 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
5743 int32_t aExp
, shiftCount
;
5744 uint64_t aSig0
, aSig1
;
5747 aSig1
= extractFloat128Frac1( a
);
5748 aSig0
= extractFloat128Frac0( a
);
5749 aExp
= extractFloat128Exp( a
);
5750 aSign
= extractFloat128Sign( a
);
5751 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5752 shiftCount
= aExp
- 0x402F;
5753 if ( 0 < shiftCount
) {
5754 if ( 0x403E <= aExp
) {
5755 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
5756 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
5757 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
5759 status
->float_exception_flags
|= float_flag_inexact
;
5763 float_raise(float_flag_invalid
, status
);
5764 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
5765 return LIT64( 0x7FFFFFFFFFFFFFFF );
5768 return (int64_t) LIT64( 0x8000000000000000 );
5770 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
5771 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
5772 status
->float_exception_flags
|= float_flag_inexact
;
5776 if ( aExp
< 0x3FFF ) {
5777 if ( aExp
| aSig0
| aSig1
) {
5778 status
->float_exception_flags
|= float_flag_inexact
;
5782 z
= aSig0
>>( - shiftCount
);
5784 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
5785 status
->float_exception_flags
|= float_flag_inexact
;
5788 if ( aSign
) z
= - z
;
5793 /*----------------------------------------------------------------------------
5794 | Returns the result of converting the quadruple-precision floating-point value
5795 | `a' to the 64-bit unsigned integer format. The conversion is
5796 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5797 | Arithmetic---which means in particular that the conversion is rounded
5798 | according to the current rounding mode. If `a' is a NaN, the largest
5799 | positive integer is returned. If the conversion overflows, the
5800 | largest unsigned integer is returned. If 'a' is negative, the value is
5801 | rounded and zero is returned; negative values that do not round to zero
5802 | will raise the inexact exception.
5803 *----------------------------------------------------------------------------*/
5805 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
5810 uint64_t aSig0
, aSig1
;
5812 aSig0
= extractFloat128Frac0(a
);
5813 aSig1
= extractFloat128Frac1(a
);
5814 aExp
= extractFloat128Exp(a
);
5815 aSign
= extractFloat128Sign(a
);
5816 if (aSign
&& (aExp
> 0x3FFE)) {
5817 float_raise(float_flag_invalid
, status
);
5818 if (float128_is_any_nan(a
)) {
5819 return LIT64(0xFFFFFFFFFFFFFFFF);
5825 aSig0
|= LIT64(0x0001000000000000);
5827 shiftCount
= 0x402F - aExp
;
5828 if (shiftCount
<= 0) {
5829 if (0x403E < aExp
) {
5830 float_raise(float_flag_invalid
, status
);
5831 return LIT64(0xFFFFFFFFFFFFFFFF);
5833 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
5835 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5837 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
5840 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
5843 signed char current_rounding_mode
= status
->float_rounding_mode
;
5845 set_float_rounding_mode(float_round_to_zero
, status
);
5846 v
= float128_to_uint64(a
, status
);
5847 set_float_rounding_mode(current_rounding_mode
, status
);
5852 /*----------------------------------------------------------------------------
5853 | Returns the result of converting the quadruple-precision floating-point
5854 | value `a' to the 32-bit unsigned integer format. The conversion
5855 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5856 | Arithmetic except that the conversion is always rounded toward zero.
5857 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5858 | if the conversion overflows, the largest unsigned integer is returned.
5859 | If 'a' is negative, the value is rounded and zero is returned; negative
5860 | values that do not round to zero will raise the inexact exception.
5861 *----------------------------------------------------------------------------*/
5863 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
5867 int old_exc_flags
= get_float_exception_flags(status
);
5869 v
= float128_to_uint64_round_to_zero(a
, status
);
5870 if (v
> 0xffffffff) {
5875 set_float_exception_flags(old_exc_flags
, status
);
5876 float_raise(float_flag_invalid
, status
);
5880 /*----------------------------------------------------------------------------
5881 | Returns the result of converting the quadruple-precision floating-point
5882 | value `a' to the single-precision floating-point format. The conversion
5883 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5885 *----------------------------------------------------------------------------*/
5887 float32
float128_to_float32(float128 a
, float_status
*status
)
5891 uint64_t aSig0
, aSig1
;
5894 aSig1
= extractFloat128Frac1( a
);
5895 aSig0
= extractFloat128Frac0( a
);
5896 aExp
= extractFloat128Exp( a
);
5897 aSign
= extractFloat128Sign( a
);
5898 if ( aExp
== 0x7FFF ) {
5899 if ( aSig0
| aSig1
) {
5900 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
5902 return packFloat32( aSign
, 0xFF, 0 );
5904 aSig0
|= ( aSig1
!= 0 );
5905 shift64RightJamming( aSig0
, 18, &aSig0
);
5907 if ( aExp
|| zSig
) {
5911 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
5915 /*----------------------------------------------------------------------------
5916 | Returns the result of converting the quadruple-precision floating-point
5917 | value `a' to the double-precision floating-point format. The conversion
5918 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5920 *----------------------------------------------------------------------------*/
5922 float64
float128_to_float64(float128 a
, float_status
*status
)
5926 uint64_t aSig0
, aSig1
;
5928 aSig1
= extractFloat128Frac1( a
);
5929 aSig0
= extractFloat128Frac0( a
);
5930 aExp
= extractFloat128Exp( a
);
5931 aSign
= extractFloat128Sign( a
);
5932 if ( aExp
== 0x7FFF ) {
5933 if ( aSig0
| aSig1
) {
5934 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
5936 return packFloat64( aSign
, 0x7FF, 0 );
5938 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5939 aSig0
|= ( aSig1
!= 0 );
5940 if ( aExp
|| aSig0
) {
5941 aSig0
|= LIT64( 0x4000000000000000 );
5944 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
5948 /*----------------------------------------------------------------------------
5949 | Returns the result of converting the quadruple-precision floating-point
5950 | value `a' to the extended double-precision floating-point format. The
5951 | conversion is performed according to the IEC/IEEE Standard for Binary
5952 | Floating-Point Arithmetic.
5953 *----------------------------------------------------------------------------*/
5955 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
5959 uint64_t aSig0
, aSig1
;
5961 aSig1
= extractFloat128Frac1( a
);
5962 aSig0
= extractFloat128Frac0( a
);
5963 aExp
= extractFloat128Exp( a
);
5964 aSign
= extractFloat128Sign( a
);
5965 if ( aExp
== 0x7FFF ) {
5966 if ( aSig0
| aSig1
) {
5967 return commonNaNToFloatx80(float128ToCommonNaN(a
, status
), status
);
5969 return packFloatx80(aSign
, floatx80_infinity_high
,
5970 floatx80_infinity_low
);
5973 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
5974 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5977 aSig0
|= LIT64( 0x0001000000000000 );
5979 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
5980 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
5984 /*----------------------------------------------------------------------------
5985 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5986 | returns the result as a quadruple-precision floating-point value. The
5987 | operation is performed according to the IEC/IEEE Standard for Binary
5988 | Floating-Point Arithmetic.
5989 *----------------------------------------------------------------------------*/
5991 float128
float128_round_to_int(float128 a
, float_status
*status
)
5995 uint64_t lastBitMask
, roundBitsMask
;
5998 aExp
= extractFloat128Exp( a
);
5999 if ( 0x402F <= aExp
) {
6000 if ( 0x406F <= aExp
) {
6001 if ( ( aExp
== 0x7FFF )
6002 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6004 return propagateFloat128NaN(a
, a
, status
);
6009 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6010 roundBitsMask
= lastBitMask
- 1;
6012 switch (status
->float_rounding_mode
) {
6013 case float_round_nearest_even
:
6014 if ( lastBitMask
) {
6015 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6016 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6019 if ( (int64_t) z
.low
< 0 ) {
6021 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6025 case float_round_ties_away
:
6027 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6029 if ((int64_t) z
.low
< 0) {
6034 case float_round_to_zero
:
6036 case float_round_up
:
6037 if (!extractFloat128Sign(z
)) {
6038 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6041 case float_round_down
:
6042 if (extractFloat128Sign(z
)) {
6043 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6049 z
.low
&= ~ roundBitsMask
;
6052 if ( aExp
< 0x3FFF ) {
6053 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6054 status
->float_exception_flags
|= float_flag_inexact
;
6055 aSign
= extractFloat128Sign( a
);
6056 switch (status
->float_rounding_mode
) {
6057 case float_round_nearest_even
:
6058 if ( ( aExp
== 0x3FFE )
6059 && ( extractFloat128Frac0( a
)
6060 | extractFloat128Frac1( a
) )
6062 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6065 case float_round_ties_away
:
6066 if (aExp
== 0x3FFE) {
6067 return packFloat128(aSign
, 0x3FFF, 0, 0);
6070 case float_round_down
:
6072 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6073 : packFloat128( 0, 0, 0, 0 );
6074 case float_round_up
:
6076 aSign
? packFloat128( 1, 0, 0, 0 )
6077 : packFloat128( 0, 0x3FFF, 0, 0 );
6079 return packFloat128( aSign
, 0, 0, 0 );
6082 lastBitMask
<<= 0x402F - aExp
;
6083 roundBitsMask
= lastBitMask
- 1;
6086 switch (status
->float_rounding_mode
) {
6087 case float_round_nearest_even
:
6088 z
.high
+= lastBitMask
>>1;
6089 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6090 z
.high
&= ~ lastBitMask
;
6093 case float_round_ties_away
:
6094 z
.high
+= lastBitMask
>>1;
6096 case float_round_to_zero
:
6098 case float_round_up
:
6099 if (!extractFloat128Sign(z
)) {
6100 z
.high
|= ( a
.low
!= 0 );
6101 z
.high
+= roundBitsMask
;
6104 case float_round_down
:
6105 if (extractFloat128Sign(z
)) {
6106 z
.high
|= (a
.low
!= 0);
6107 z
.high
+= roundBitsMask
;
6113 z
.high
&= ~ roundBitsMask
;
6115 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6116 status
->float_exception_flags
|= float_flag_inexact
;
6122 /*----------------------------------------------------------------------------
6123 | Returns the result of adding the absolute values of the quadruple-precision
6124 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6125 | before being returned. `zSign' is ignored if the result is a NaN.
6126 | The addition is performed according to the IEC/IEEE Standard for Binary
6127 | Floating-Point Arithmetic.
6128 *----------------------------------------------------------------------------*/
6130 static float128
addFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6131 float_status
*status
)
6133 int32_t aExp
, bExp
, zExp
;
6134 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6137 aSig1
= extractFloat128Frac1( a
);
6138 aSig0
= extractFloat128Frac0( a
);
6139 aExp
= extractFloat128Exp( a
);
6140 bSig1
= extractFloat128Frac1( b
);
6141 bSig0
= extractFloat128Frac0( b
);
6142 bExp
= extractFloat128Exp( b
);
6143 expDiff
= aExp
- bExp
;
6144 if ( 0 < expDiff
) {
6145 if ( aExp
== 0x7FFF ) {
6146 if (aSig0
| aSig1
) {
6147 return propagateFloat128NaN(a
, b
, status
);
6155 bSig0
|= LIT64( 0x0001000000000000 );
6157 shift128ExtraRightJamming(
6158 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6161 else if ( expDiff
< 0 ) {
6162 if ( bExp
== 0x7FFF ) {
6163 if (bSig0
| bSig1
) {
6164 return propagateFloat128NaN(a
, b
, status
);
6166 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6172 aSig0
|= LIT64( 0x0001000000000000 );
6174 shift128ExtraRightJamming(
6175 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6179 if ( aExp
== 0x7FFF ) {
6180 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6181 return propagateFloat128NaN(a
, b
, status
);
6185 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6187 if (status
->flush_to_zero
) {
6188 if (zSig0
| zSig1
) {
6189 float_raise(float_flag_output_denormal
, status
);
6191 return packFloat128(zSign
, 0, 0, 0);
6193 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6196 zSig0
|= LIT64( 0x0002000000000000 );
6200 aSig0
|= LIT64( 0x0001000000000000 );
6201 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6203 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
6206 shift128ExtraRightJamming(
6207 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6209 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6213 /*----------------------------------------------------------------------------
6214 | Returns the result of subtracting the absolute values of the quadruple-
6215 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6216 | difference is negated before being returned. `zSign' is ignored if the
6217 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6218 | Standard for Binary Floating-Point Arithmetic.
6219 *----------------------------------------------------------------------------*/
6221 static float128
subFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6222 float_status
*status
)
6224 int32_t aExp
, bExp
, zExp
;
6225 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6228 aSig1
= extractFloat128Frac1( a
);
6229 aSig0
= extractFloat128Frac0( a
);
6230 aExp
= extractFloat128Exp( a
);
6231 bSig1
= extractFloat128Frac1( b
);
6232 bSig0
= extractFloat128Frac0( b
);
6233 bExp
= extractFloat128Exp( b
);
6234 expDiff
= aExp
- bExp
;
6235 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6236 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6237 if ( 0 < expDiff
) goto aExpBigger
;
6238 if ( expDiff
< 0 ) goto bExpBigger
;
6239 if ( aExp
== 0x7FFF ) {
6240 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6241 return propagateFloat128NaN(a
, b
, status
);
6243 float_raise(float_flag_invalid
, status
);
6244 return float128_default_nan(status
);
6250 if ( bSig0
< aSig0
) goto aBigger
;
6251 if ( aSig0
< bSig0
) goto bBigger
;
6252 if ( bSig1
< aSig1
) goto aBigger
;
6253 if ( aSig1
< bSig1
) goto bBigger
;
6254 return packFloat128(status
->float_rounding_mode
== float_round_down
,
6257 if ( bExp
== 0x7FFF ) {
6258 if (bSig0
| bSig1
) {
6259 return propagateFloat128NaN(a
, b
, status
);
6261 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
6267 aSig0
|= LIT64( 0x4000000000000000 );
6269 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6270 bSig0
|= LIT64( 0x4000000000000000 );
6272 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6275 goto normalizeRoundAndPack
;
6277 if ( aExp
== 0x7FFF ) {
6278 if (aSig0
| aSig1
) {
6279 return propagateFloat128NaN(a
, b
, status
);
6287 bSig0
|= LIT64( 0x4000000000000000 );
6289 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
6290 aSig0
|= LIT64( 0x4000000000000000 );
6292 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6294 normalizeRoundAndPack
:
6296 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
6301 /*----------------------------------------------------------------------------
6302 | Returns the result of adding the quadruple-precision floating-point values
6303 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6304 | for Binary Floating-Point Arithmetic.
6305 *----------------------------------------------------------------------------*/
6307 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
6311 aSign
= extractFloat128Sign( a
);
6312 bSign
= extractFloat128Sign( b
);
6313 if ( aSign
== bSign
) {
6314 return addFloat128Sigs(a
, b
, aSign
, status
);
6317 return subFloat128Sigs(a
, b
, aSign
, status
);
6322 /*----------------------------------------------------------------------------
6323 | Returns the result of subtracting the quadruple-precision floating-point
6324 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6325 | Standard for Binary Floating-Point Arithmetic.
6326 *----------------------------------------------------------------------------*/
6328 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
6332 aSign
= extractFloat128Sign( a
);
6333 bSign
= extractFloat128Sign( b
);
6334 if ( aSign
== bSign
) {
6335 return subFloat128Sigs(a
, b
, aSign
, status
);
6338 return addFloat128Sigs(a
, b
, aSign
, status
);
6343 /*----------------------------------------------------------------------------
6344 | Returns the result of multiplying the quadruple-precision floating-point
6345 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6346 | Standard for Binary Floating-Point Arithmetic.
6347 *----------------------------------------------------------------------------*/
6349 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
6351 flag aSign
, bSign
, zSign
;
6352 int32_t aExp
, bExp
, zExp
;
6353 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
6355 aSig1
= extractFloat128Frac1( a
);
6356 aSig0
= extractFloat128Frac0( a
);
6357 aExp
= extractFloat128Exp( a
);
6358 aSign
= extractFloat128Sign( a
);
6359 bSig1
= extractFloat128Frac1( b
);
6360 bSig0
= extractFloat128Frac0( b
);
6361 bExp
= extractFloat128Exp( b
);
6362 bSign
= extractFloat128Sign( b
);
6363 zSign
= aSign
^ bSign
;
6364 if ( aExp
== 0x7FFF ) {
6365 if ( ( aSig0
| aSig1
)
6366 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6367 return propagateFloat128NaN(a
, b
, status
);
6369 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
6370 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6372 if ( bExp
== 0x7FFF ) {
6373 if (bSig0
| bSig1
) {
6374 return propagateFloat128NaN(a
, b
, status
);
6376 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6378 float_raise(float_flag_invalid
, status
);
6379 return float128_default_nan(status
);
6381 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6384 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6385 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6388 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6389 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6391 zExp
= aExp
+ bExp
- 0x4000;
6392 aSig0
|= LIT64( 0x0001000000000000 );
6393 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
6394 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
6395 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6396 zSig2
|= ( zSig3
!= 0 );
6397 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
6398 shift128ExtraRightJamming(
6399 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6402 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6406 /*----------------------------------------------------------------------------
6407 | Returns the result of dividing the quadruple-precision floating-point value
6408 | `a' by the corresponding value `b'. The operation is performed according to
6409 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6410 *----------------------------------------------------------------------------*/
6412 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
6414 flag aSign
, bSign
, zSign
;
6415 int32_t aExp
, bExp
, zExp
;
6416 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6417 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6419 aSig1
= extractFloat128Frac1( a
);
6420 aSig0
= extractFloat128Frac0( a
);
6421 aExp
= extractFloat128Exp( a
);
6422 aSign
= extractFloat128Sign( a
);
6423 bSig1
= extractFloat128Frac1( b
);
6424 bSig0
= extractFloat128Frac0( b
);
6425 bExp
= extractFloat128Exp( b
);
6426 bSign
= extractFloat128Sign( b
);
6427 zSign
= aSign
^ bSign
;
6428 if ( aExp
== 0x7FFF ) {
6429 if (aSig0
| aSig1
) {
6430 return propagateFloat128NaN(a
, b
, status
);
6432 if ( bExp
== 0x7FFF ) {
6433 if (bSig0
| bSig1
) {
6434 return propagateFloat128NaN(a
, b
, status
);
6438 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6440 if ( bExp
== 0x7FFF ) {
6441 if (bSig0
| bSig1
) {
6442 return propagateFloat128NaN(a
, b
, status
);
6444 return packFloat128( zSign
, 0, 0, 0 );
6447 if ( ( bSig0
| bSig1
) == 0 ) {
6448 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6450 float_raise(float_flag_invalid
, status
);
6451 return float128_default_nan(status
);
6453 float_raise(float_flag_divbyzero
, status
);
6454 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6456 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6459 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6460 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6462 zExp
= aExp
- bExp
+ 0x3FFD;
6464 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
6466 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6467 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
6468 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
6471 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6472 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
6473 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
6474 while ( (int64_t) rem0
< 0 ) {
6476 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
6478 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
6479 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
6480 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
6481 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
6482 while ( (int64_t) rem1
< 0 ) {
6484 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
6486 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6488 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
6489 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6493 /*----------------------------------------------------------------------------
6494 | Returns the remainder of the quadruple-precision floating-point value `a'
6495 | with respect to the corresponding value `b'. The operation is performed
6496 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6497 *----------------------------------------------------------------------------*/
6499 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
6502 int32_t aExp
, bExp
, expDiff
;
6503 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6504 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6507 aSig1
= extractFloat128Frac1( a
);
6508 aSig0
= extractFloat128Frac0( a
);
6509 aExp
= extractFloat128Exp( a
);
6510 aSign
= extractFloat128Sign( a
);
6511 bSig1
= extractFloat128Frac1( b
);
6512 bSig0
= extractFloat128Frac0( b
);
6513 bExp
= extractFloat128Exp( b
);
6514 if ( aExp
== 0x7FFF ) {
6515 if ( ( aSig0
| aSig1
)
6516 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6517 return propagateFloat128NaN(a
, b
, status
);
6521 if ( bExp
== 0x7FFF ) {
6522 if (bSig0
| bSig1
) {
6523 return propagateFloat128NaN(a
, b
, status
);
6528 if ( ( bSig0
| bSig1
) == 0 ) {
6530 float_raise(float_flag_invalid
, status
);
6531 return float128_default_nan(status
);
6533 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6536 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6537 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6539 expDiff
= aExp
- bExp
;
6540 if ( expDiff
< -1 ) return a
;
6542 aSig0
| LIT64( 0x0001000000000000 ),
6544 15 - ( expDiff
< 0 ),
6549 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6550 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6551 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6553 while ( 0 < expDiff
) {
6554 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6555 q
= ( 4 < q
) ? q
- 4 : 0;
6556 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6557 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6558 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6559 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6562 if ( -64 < expDiff
) {
6563 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6564 q
= ( 4 < q
) ? q
- 4 : 0;
6566 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6568 if ( expDiff
< 0 ) {
6569 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6572 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6574 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6575 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6578 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6579 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6582 alternateASig0
= aSig0
;
6583 alternateASig1
= aSig1
;
6585 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6586 } while ( 0 <= (int64_t) aSig0
);
6588 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6589 if ( ( sigMean0
< 0 )
6590 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6591 aSig0
= alternateASig0
;
6592 aSig1
= alternateASig1
;
6594 zSign
= ( (int64_t) aSig0
< 0 );
6595 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6596 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
6600 /*----------------------------------------------------------------------------
6601 | Returns the square root of the quadruple-precision floating-point value `a'.
6602 | The operation is performed according to the IEC/IEEE Standard for Binary
6603 | Floating-Point Arithmetic.
6604 *----------------------------------------------------------------------------*/
6606 float128
float128_sqrt(float128 a
, float_status
*status
)
6610 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6611 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6613 aSig1
= extractFloat128Frac1( a
);
6614 aSig0
= extractFloat128Frac0( a
);
6615 aExp
= extractFloat128Exp( a
);
6616 aSign
= extractFloat128Sign( a
);
6617 if ( aExp
== 0x7FFF ) {
6618 if (aSig0
| aSig1
) {
6619 return propagateFloat128NaN(a
, a
, status
);
6621 if ( ! aSign
) return a
;
6625 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6627 float_raise(float_flag_invalid
, status
);
6628 return float128_default_nan(status
);
6631 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6632 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6634 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6635 aSig0
|= LIT64( 0x0001000000000000 );
6636 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6637 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6638 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6639 doubleZSig0
= zSig0
<<1;
6640 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6641 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6642 while ( (int64_t) rem0
< 0 ) {
6645 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6647 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6648 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
6649 if ( zSig1
== 0 ) zSig1
= 1;
6650 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6651 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6652 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6653 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6654 while ( (int64_t) rem1
< 0 ) {
6656 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6658 term2
|= doubleZSig0
;
6659 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6661 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6663 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
6664 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
6668 /*----------------------------------------------------------------------------
6669 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6670 | the corresponding value `b', and 0 otherwise. The invalid exception is
6671 | raised if either operand is a NaN. Otherwise, the comparison is performed
6672 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6673 *----------------------------------------------------------------------------*/
6675 int float128_eq(float128 a
, float128 b
, float_status
*status
)
6678 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6679 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6680 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6681 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6683 float_raise(float_flag_invalid
, status
);
6688 && ( ( a
.high
== b
.high
)
6690 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6695 /*----------------------------------------------------------------------------
6696 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6697 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6698 | exception is raised if either operand is a NaN. The comparison is performed
6699 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6700 *----------------------------------------------------------------------------*/
6702 int float128_le(float128 a
, float128 b
, float_status
*status
)
6706 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6707 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6708 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6709 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6711 float_raise(float_flag_invalid
, status
);
6714 aSign
= extractFloat128Sign( a
);
6715 bSign
= extractFloat128Sign( b
);
6716 if ( aSign
!= bSign
) {
6719 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6723 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6724 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6728 /*----------------------------------------------------------------------------
6729 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6730 | the corresponding value `b', and 0 otherwise. The invalid exception is
6731 | raised if either operand is a NaN. The comparison is performed according
6732 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6733 *----------------------------------------------------------------------------*/
6735 int float128_lt(float128 a
, float128 b
, float_status
*status
)
6739 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6740 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6741 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6742 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6744 float_raise(float_flag_invalid
, status
);
6747 aSign
= extractFloat128Sign( a
);
6748 bSign
= extractFloat128Sign( b
);
6749 if ( aSign
!= bSign
) {
6752 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6756 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6757 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6761 /*----------------------------------------------------------------------------
6762 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6763 | be compared, and 0 otherwise. The invalid exception is raised if either
6764 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6765 | Standard for Binary Floating-Point Arithmetic.
6766 *----------------------------------------------------------------------------*/
6768 int float128_unordered(float128 a
, float128 b
, float_status
*status
)
6770 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6771 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6772 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6773 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6775 float_raise(float_flag_invalid
, status
);
6781 /*----------------------------------------------------------------------------
6782 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6783 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6784 | exception. The comparison is performed according to the IEC/IEEE Standard
6785 | for Binary Floating-Point Arithmetic.
6786 *----------------------------------------------------------------------------*/
6788 int float128_eq_quiet(float128 a
, float128 b
, float_status
*status
)
6791 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6792 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6793 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6794 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6796 if (float128_is_signaling_nan(a
, status
)
6797 || float128_is_signaling_nan(b
, status
)) {
6798 float_raise(float_flag_invalid
, status
);
6804 && ( ( a
.high
== b
.high
)
6806 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6811 /*----------------------------------------------------------------------------
6812 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6813 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6814 | cause an exception. Otherwise, the comparison is performed according to the
6815 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6816 *----------------------------------------------------------------------------*/
6818 int float128_le_quiet(float128 a
, float128 b
, float_status
*status
)
6822 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6823 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6824 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6825 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6827 if (float128_is_signaling_nan(a
, status
)
6828 || float128_is_signaling_nan(b
, status
)) {
6829 float_raise(float_flag_invalid
, status
);
6833 aSign
= extractFloat128Sign( a
);
6834 bSign
= extractFloat128Sign( b
);
6835 if ( aSign
!= bSign
) {
6838 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6842 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6843 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6847 /*----------------------------------------------------------------------------
6848 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6849 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6850 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6851 | Standard for Binary Floating-Point Arithmetic.
6852 *----------------------------------------------------------------------------*/
6854 int float128_lt_quiet(float128 a
, float128 b
, float_status
*status
)
6858 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6859 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6860 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6861 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6863 if (float128_is_signaling_nan(a
, status
)
6864 || float128_is_signaling_nan(b
, status
)) {
6865 float_raise(float_flag_invalid
, status
);
6869 aSign
= extractFloat128Sign( a
);
6870 bSign
= extractFloat128Sign( b
);
6871 if ( aSign
!= bSign
) {
6874 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6878 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6879 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6883 /*----------------------------------------------------------------------------
6884 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6885 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6886 | comparison is performed according to the IEC/IEEE Standard for Binary
6887 | Floating-Point Arithmetic.
6888 *----------------------------------------------------------------------------*/
6890 int float128_unordered_quiet(float128 a
, float128 b
, float_status
*status
)
6892 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6893 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6894 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6895 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6897 if (float128_is_signaling_nan(a
, status
)
6898 || float128_is_signaling_nan(b
, status
)) {
6899 float_raise(float_flag_invalid
, status
);
6906 static inline int floatx80_compare_internal(floatx80 a
, floatx80 b
,
6907 int is_quiet
, float_status
*status
)
6911 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6912 float_raise(float_flag_invalid
, status
);
6913 return float_relation_unordered
;
6915 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
6916 ( extractFloatx80Frac( a
)<<1 ) ) ||
6917 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
6918 ( extractFloatx80Frac( b
)<<1 ) )) {
6920 floatx80_is_signaling_nan(a
, status
) ||
6921 floatx80_is_signaling_nan(b
, status
)) {
6922 float_raise(float_flag_invalid
, status
);
6924 return float_relation_unordered
;
6926 aSign
= extractFloatx80Sign( a
);
6927 bSign
= extractFloatx80Sign( b
);
6928 if ( aSign
!= bSign
) {
6930 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
6931 ( ( a
.low
| b
.low
) == 0 ) ) {
6933 return float_relation_equal
;
6935 return 1 - (2 * aSign
);
6938 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6939 return float_relation_equal
;
6941 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6946 int floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
6948 return floatx80_compare_internal(a
, b
, 0, status
);
6951 int floatx80_compare_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6953 return floatx80_compare_internal(a
, b
, 1, status
);
6956 static inline int float128_compare_internal(float128 a
, float128 b
,
6957 int is_quiet
, float_status
*status
)
6961 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
6962 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
6963 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
6964 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
6966 float128_is_signaling_nan(a
, status
) ||
6967 float128_is_signaling_nan(b
, status
)) {
6968 float_raise(float_flag_invalid
, status
);
6970 return float_relation_unordered
;
6972 aSign
= extractFloat128Sign( a
);
6973 bSign
= extractFloat128Sign( b
);
6974 if ( aSign
!= bSign
) {
6975 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
6977 return float_relation_equal
;
6979 return 1 - (2 * aSign
);
6982 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6983 return float_relation_equal
;
6985 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6990 int float128_compare(float128 a
, float128 b
, float_status
*status
)
6992 return float128_compare_internal(a
, b
, 0, status
);
6995 int float128_compare_quiet(float128 a
, float128 b
, float_status
*status
)
6997 return float128_compare_internal(a
, b
, 1, status
);
7000 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7006 if (floatx80_invalid_encoding(a
)) {
7007 float_raise(float_flag_invalid
, status
);
7008 return floatx80_default_nan(status
);
7010 aSig
= extractFloatx80Frac( a
);
7011 aExp
= extractFloatx80Exp( a
);
7012 aSign
= extractFloatx80Sign( a
);
7014 if ( aExp
== 0x7FFF ) {
7016 return propagateFloatx80NaN(a
, a
, status
);
7030 } else if (n
< -0x10000) {
7035 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7036 aSign
, aExp
, aSig
, 0, status
);
7039 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7043 uint64_t aSig0
, aSig1
;
7045 aSig1
= extractFloat128Frac1( a
);
7046 aSig0
= extractFloat128Frac0( a
);
7047 aExp
= extractFloat128Exp( a
);
7048 aSign
= extractFloat128Sign( a
);
7049 if ( aExp
== 0x7FFF ) {
7050 if ( aSig0
| aSig1
) {
7051 return propagateFloat128NaN(a
, a
, status
);
7056 aSig0
|= LIT64( 0x0001000000000000 );
7057 } else if (aSig0
== 0 && aSig1
== 0) {
7065 } else if (n
< -0x10000) {
7070 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1