4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a
)
114 return float16_val(a
) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int extractFloat16Exp(float16 a
)
123 return (float16_val(a
) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag
extractFloat16Sign(float16 a
)
132 return float16_val(a
)>>15;
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
139 static inline uint32_t extractFloat32Frac(float32 a
)
141 return float32_val(a
) & 0x007FFFFF;
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
148 static inline int extractFloat32Exp(float32 a
)
150 return (float32_val(a
) >> 23) & 0xFF;
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
157 static inline flag
extractFloat32Sign(float32 a
)
159 return float32_val(a
) >> 31;
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
166 static inline uint64_t extractFloat64Frac(float64 a
)
168 return float64_val(a
) & LIT64(0x000FFFFFFFFFFFFF);
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 static inline int extractFloat64Exp(float64 a
)
177 return (float64_val(a
) >> 52) & 0x7FF;
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
184 static inline flag
extractFloat64Sign(float64 a
)
186 return float64_val(a
) >> 63;
190 * Classify a floating point number. Everything above float_class_qnan
191 * is a NaN so cls >= float_class_qnan is any NaN.
194 typedef enum __attribute__ ((__packed__
)) {
195 float_class_unclassified
,
199 float_class_qnan
, /* all NaNs from here */
202 float_class_msnan
, /* maybe silenced */
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
223 #define DECOMPOSED_BINARY_POINT (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
227 /* Structure holding all of the relevant parameters for a format.
228 * exp_size: the size of the exponent field
229 * exp_bias: the offset applied to the exponent field
230 * exp_max: the maximum normalised exponent
231 * frac_size: the size of the fraction field
232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233 * The following are computed based the size of fraction
234 * frac_lsb: least significant bit of fraction
235 * fram_lsbm1: the bit bellow the least significant bit (for rounding)
236 * round_mask/roundeven_mask: masks used for rounding
247 uint64_t roundeven_mask
;
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F) \
253 .exp_bias = ((1 << E) - 1) >> 1, \
254 .exp_max = (1 << E) - 1, \
256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
262 static const FloatFmt float16_params
= {
266 static const FloatFmt float32_params
= {
270 static const FloatFmt float64_params
= {
274 /* Unpack a float to parts, but do not canonicalize. */
275 static inline FloatParts
unpack_raw(FloatFmt fmt
, uint64_t raw
)
277 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
279 return (FloatParts
) {
280 .cls
= float_class_unclassified
,
281 .sign
= extract64(raw
, sign_pos
, 1),
282 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
283 .frac
= extract64(raw
, 0, fmt
.frac_size
),
287 static inline FloatParts
float16_unpack_raw(float16 f
)
289 return unpack_raw(float16_params
, f
);
292 static inline FloatParts
float32_unpack_raw(float32 f
)
294 return unpack_raw(float32_params
, f
);
297 static inline FloatParts
float64_unpack_raw(float64 f
)
299 return unpack_raw(float64_params
, f
);
302 /* Pack a float from parts, but do not canonicalize. */
303 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts p
)
305 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
306 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
307 return deposit64(ret
, sign_pos
, 1, p
.sign
);
310 static inline float16
float16_pack_raw(FloatParts p
)
312 return make_float16(pack_raw(float16_params
, p
));
315 static inline float32
float32_pack_raw(FloatParts p
)
317 return make_float32(pack_raw(float32_params
, p
));
320 static inline float64
float64_pack_raw(FloatParts p
)
322 return make_float64(pack_raw(float64_params
, p
));
325 /* Canonicalize EXP and FRAC, setting CLS. */
326 static FloatParts
canonicalize(FloatParts part
, const FloatFmt
*parm
,
327 float_status
*status
)
329 if (part
.exp
== parm
->exp_max
) {
330 if (part
.frac
== 0) {
331 part
.cls
= float_class_inf
;
333 #ifdef NO_SIGNALING_NANS
334 part
.cls
= float_class_qnan
;
336 int64_t msb
= part
.frac
<< (parm
->frac_shift
+ 2);
337 if ((msb
< 0) == status
->snan_bit_is_one
) {
338 part
.cls
= float_class_snan
;
340 part
.cls
= float_class_qnan
;
344 } else if (part
.exp
== 0) {
345 if (likely(part
.frac
== 0)) {
346 part
.cls
= float_class_zero
;
347 } else if (status
->flush_inputs_to_zero
) {
348 float_raise(float_flag_input_denormal
, status
);
349 part
.cls
= float_class_zero
;
352 int shift
= clz64(part
.frac
) - 1;
353 part
.cls
= float_class_normal
;
354 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
358 part
.cls
= float_class_normal
;
359 part
.exp
-= parm
->exp_bias
;
360 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
365 /* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
371 static FloatParts
round_canonical(FloatParts p
, float_status
*s
,
372 const FloatFmt
*parm
)
374 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
375 const uint64_t round_mask
= parm
->round_mask
;
376 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
377 const int exp_max
= parm
->exp_max
;
378 const int frac_shift
= parm
->frac_shift
;
387 case float_class_normal
:
388 switch (s
->float_rounding_mode
) {
389 case float_round_nearest_even
:
390 overflow_norm
= false;
391 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
393 case float_round_ties_away
:
394 overflow_norm
= false;
397 case float_round_to_zero
:
398 overflow_norm
= true;
402 inc
= p
.sign
? 0 : round_mask
;
403 overflow_norm
= p
.sign
;
405 case float_round_down
:
406 inc
= p
.sign
? round_mask
: 0;
407 overflow_norm
= !p
.sign
;
410 g_assert_not_reached();
413 exp
+= parm
->exp_bias
;
414 if (likely(exp
> 0)) {
415 if (frac
& round_mask
) {
416 flags
|= float_flag_inexact
;
418 if (frac
& DECOMPOSED_OVERFLOW_BIT
) {
425 if (unlikely(exp
>= exp_max
)) {
426 flags
|= float_flag_overflow
| float_flag_inexact
;
431 p
.cls
= float_class_inf
;
435 } else if (s
->flush_to_zero
) {
436 flags
|= float_flag_output_denormal
;
437 p
.cls
= float_class_zero
;
440 bool is_tiny
= (s
->float_detect_tininess
441 == float_tininess_before_rounding
)
443 || !((frac
+ inc
) & DECOMPOSED_OVERFLOW_BIT
);
445 shift64RightJamming(frac
, 1 - exp
, &frac
);
446 if (frac
& round_mask
) {
447 /* Need to recompute round-to-even. */
448 if (s
->float_rounding_mode
== float_round_nearest_even
) {
449 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
452 flags
|= float_flag_inexact
;
456 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
459 if (is_tiny
&& (flags
& float_flag_inexact
)) {
460 flags
|= float_flag_underflow
;
462 if (exp
== 0 && frac
== 0) {
463 p
.cls
= float_class_zero
;
468 case float_class_zero
:
474 case float_class_inf
:
480 case float_class_qnan
:
481 case float_class_snan
:
486 g_assert_not_reached();
489 float_raise(flags
, s
);
495 static FloatParts
float16_unpack_canonical(float16 f
, float_status
*s
)
497 return canonicalize(float16_unpack_raw(f
), &float16_params
, s
);
500 static float16
float16_round_pack_canonical(FloatParts p
, float_status
*s
)
503 case float_class_dnan
:
504 return float16_default_nan(s
);
505 case float_class_msnan
:
506 return float16_maybe_silence_nan(float16_pack_raw(p
), s
);
508 p
= round_canonical(p
, s
, &float16_params
);
509 return float16_pack_raw(p
);
513 static FloatParts
float32_unpack_canonical(float32 f
, float_status
*s
)
515 return canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
518 static float32
float32_round_pack_canonical(FloatParts p
, float_status
*s
)
521 case float_class_dnan
:
522 return float32_default_nan(s
);
523 case float_class_msnan
:
524 return float32_maybe_silence_nan(float32_pack_raw(p
), s
);
526 p
= round_canonical(p
, s
, &float32_params
);
527 return float32_pack_raw(p
);
531 static FloatParts
float64_unpack_canonical(float64 f
, float_status
*s
)
533 return canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
536 static float64
float64_round_pack_canonical(FloatParts p
, float_status
*s
)
539 case float_class_dnan
:
540 return float64_default_nan(s
);
541 case float_class_msnan
:
542 return float64_maybe_silence_nan(float64_pack_raw(p
), s
);
544 p
= round_canonical(p
, s
, &float64_params
);
545 return float64_pack_raw(p
);
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c
)
552 return unlikely(c
>= float_class_qnan
);
554 static bool is_snan(FloatClass c
)
556 return c
== float_class_snan
;
558 static bool is_qnan(FloatClass c
)
560 return c
== float_class_qnan
;
563 static FloatParts
return_nan(FloatParts a
, float_status
*s
)
566 case float_class_snan
:
567 s
->float_exception_flags
|= float_flag_invalid
;
568 a
.cls
= float_class_msnan
;
570 case float_class_qnan
:
571 if (s
->default_nan_mode
) {
572 a
.cls
= float_class_dnan
;
577 g_assert_not_reached();
582 static FloatParts
pick_nan(FloatParts a
, FloatParts b
, float_status
*s
)
584 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
585 s
->float_exception_flags
|= float_flag_invalid
;
588 if (s
->default_nan_mode
) {
589 a
.cls
= float_class_dnan
;
591 if (pickNaN(is_qnan(a
.cls
), is_snan(a
.cls
),
592 is_qnan(b
.cls
), is_snan(b
.cls
),
594 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
))) {
597 a
.cls
= float_class_msnan
;
602 static FloatParts
pick_nan_muladd(FloatParts a
, FloatParts b
, FloatParts c
,
603 bool inf_zero
, float_status
*s
)
605 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
606 s
->float_exception_flags
|= float_flag_invalid
;
609 if (s
->default_nan_mode
) {
610 a
.cls
= float_class_dnan
;
612 switch (pickNaNMulAdd(is_qnan(a
.cls
), is_snan(a
.cls
),
613 is_qnan(b
.cls
), is_snan(b
.cls
),
614 is_qnan(c
.cls
), is_snan(c
.cls
),
625 a
.cls
= float_class_dnan
;
628 g_assert_not_reached();
631 a
.cls
= float_class_msnan
;
637 * Returns the result of adding or subtracting the values of the
638 * floating-point values `a' and `b'. The operation is performed
639 * according to the IEC/IEEE Standard for Binary Floating-Point
643 static FloatParts
addsub_floats(FloatParts a
, FloatParts b
, bool subtract
,
646 bool a_sign
= a
.sign
;
647 bool b_sign
= b
.sign
^ subtract
;
649 if (a_sign
!= b_sign
) {
652 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
653 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
654 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
655 a
.frac
= a
.frac
- b
.frac
;
657 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
658 a
.frac
= b
.frac
- a
.frac
;
664 a
.cls
= float_class_zero
;
665 a
.sign
= s
->float_rounding_mode
== float_round_down
;
667 int shift
= clz64(a
.frac
) - 1;
668 a
.frac
= a
.frac
<< shift
;
669 a
.exp
= a
.exp
- shift
;
674 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
675 return pick_nan(a
, b
, s
);
677 if (a
.cls
== float_class_inf
) {
678 if (b
.cls
== float_class_inf
) {
679 float_raise(float_flag_invalid
, s
);
680 a
.cls
= float_class_dnan
;
684 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
685 a
.sign
= s
->float_rounding_mode
== float_round_down
;
688 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
692 if (b
.cls
== float_class_zero
) {
697 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
699 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
700 } else if (a
.exp
< b
.exp
) {
701 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
705 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
711 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
712 return pick_nan(a
, b
, s
);
714 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
717 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
722 g_assert_not_reached();
726 * Returns the result of adding or subtracting the floating-point
727 * values `a' and `b'. The operation is performed according to the
728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
731 float16
__attribute__((flatten
)) float16_add(float16 a
, float16 b
,
732 float_status
*status
)
734 FloatParts pa
= float16_unpack_canonical(a
, status
);
735 FloatParts pb
= float16_unpack_canonical(b
, status
);
736 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
738 return float16_round_pack_canonical(pr
, status
);
741 float32
__attribute__((flatten
)) float32_add(float32 a
, float32 b
,
742 float_status
*status
)
744 FloatParts pa
= float32_unpack_canonical(a
, status
);
745 FloatParts pb
= float32_unpack_canonical(b
, status
);
746 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
748 return float32_round_pack_canonical(pr
, status
);
751 float64
__attribute__((flatten
)) float64_add(float64 a
, float64 b
,
752 float_status
*status
)
754 FloatParts pa
= float64_unpack_canonical(a
, status
);
755 FloatParts pb
= float64_unpack_canonical(b
, status
);
756 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
758 return float64_round_pack_canonical(pr
, status
);
761 float16
__attribute__((flatten
)) float16_sub(float16 a
, float16 b
,
762 float_status
*status
)
764 FloatParts pa
= float16_unpack_canonical(a
, status
);
765 FloatParts pb
= float16_unpack_canonical(b
, status
);
766 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
768 return float16_round_pack_canonical(pr
, status
);
771 float32
__attribute__((flatten
)) float32_sub(float32 a
, float32 b
,
772 float_status
*status
)
774 FloatParts pa
= float32_unpack_canonical(a
, status
);
775 FloatParts pb
= float32_unpack_canonical(b
, status
);
776 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
778 return float32_round_pack_canonical(pr
, status
);
781 float64
__attribute__((flatten
)) float64_sub(float64 a
, float64 b
,
782 float_status
*status
)
784 FloatParts pa
= float64_unpack_canonical(a
, status
);
785 FloatParts pb
= float64_unpack_canonical(b
, status
);
786 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
788 return float64_round_pack_canonical(pr
, status
);
792 * Returns the result of multiplying the floating-point values `a' and
793 * `b'. The operation is performed according to the IEC/IEEE Standard
794 * for Binary Floating-Point Arithmetic.
797 static FloatParts
mul_floats(FloatParts a
, FloatParts b
, float_status
*s
)
799 bool sign
= a
.sign
^ b
.sign
;
801 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
803 int exp
= a
.exp
+ b
.exp
;
805 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
806 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
807 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
808 shift64RightJamming(lo
, 1, &lo
);
818 /* handle all the NaN cases */
819 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
820 return pick_nan(a
, b
, s
);
822 /* Inf * Zero == NaN */
823 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
824 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
825 s
->float_exception_flags
|= float_flag_invalid
;
826 a
.cls
= float_class_dnan
;
830 /* Multiply by 0 or Inf */
831 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
835 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
839 g_assert_not_reached();
842 float16
__attribute__((flatten
)) float16_mul(float16 a
, float16 b
,
843 float_status
*status
)
845 FloatParts pa
= float16_unpack_canonical(a
, status
);
846 FloatParts pb
= float16_unpack_canonical(b
, status
);
847 FloatParts pr
= mul_floats(pa
, pb
, status
);
849 return float16_round_pack_canonical(pr
, status
);
852 float32
__attribute__((flatten
)) float32_mul(float32 a
, float32 b
,
853 float_status
*status
)
855 FloatParts pa
= float32_unpack_canonical(a
, status
);
856 FloatParts pb
= float32_unpack_canonical(b
, status
);
857 FloatParts pr
= mul_floats(pa
, pb
, status
);
859 return float32_round_pack_canonical(pr
, status
);
862 float64
__attribute__((flatten
)) float64_mul(float64 a
, float64 b
,
863 float_status
*status
)
865 FloatParts pa
= float64_unpack_canonical(a
, status
);
866 FloatParts pb
= float64_unpack_canonical(b
, status
);
867 FloatParts pr
= mul_floats(pa
, pb
, status
);
869 return float64_round_pack_canonical(pr
, status
);
873 * Returns the result of multiplying the floating-point values `a' and
874 * `b' then adding 'c', with no intermediate rounding step after the
875 * multiplication. The operation is performed according to the
876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877 * The flags argument allows the caller to select negation of the
878 * addend, the intermediate product, or the final result. (The
879 * difference between this and having the caller do a separate
880 * negation is that negating externally will flip the sign bit on
884 static FloatParts
muladd_floats(FloatParts a
, FloatParts b
, FloatParts c
,
885 int flags
, float_status
*s
)
887 bool inf_zero
= ((1 << a
.cls
) | (1 << b
.cls
)) ==
888 ((1 << float_class_inf
) | (1 << float_class_zero
));
890 bool sign_flip
= flags
& float_muladd_negate_result
;
895 /* It is implementation-defined whether the cases of (0,inf,qnan)
896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897 * they return if they do), so we have to hand this information
898 * off to the target-specific pick-a-NaN routine.
900 if (is_nan(a
.cls
) || is_nan(b
.cls
) || is_nan(c
.cls
)) {
901 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
905 s
->float_exception_flags
|= float_flag_invalid
;
906 a
.cls
= float_class_dnan
;
910 if (flags
& float_muladd_negate_c
) {
914 p_sign
= a
.sign
^ b
.sign
;
916 if (flags
& float_muladd_negate_product
) {
920 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_inf
) {
921 p_class
= float_class_inf
;
922 } else if (a
.cls
== float_class_zero
|| b
.cls
== float_class_zero
) {
923 p_class
= float_class_zero
;
925 p_class
= float_class_normal
;
928 if (c
.cls
== float_class_inf
) {
929 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
930 s
->float_exception_flags
|= float_flag_invalid
;
931 a
.cls
= float_class_dnan
;
933 a
.cls
= float_class_inf
;
934 a
.sign
= c
.sign
^ sign_flip
;
939 if (p_class
== float_class_inf
) {
940 a
.cls
= float_class_inf
;
941 a
.sign
= p_sign
^ sign_flip
;
945 if (p_class
== float_class_zero
) {
946 if (c
.cls
== float_class_zero
) {
947 if (p_sign
!= c
.sign
) {
948 p_sign
= s
->float_rounding_mode
== float_round_down
;
951 } else if (flags
& float_muladd_halve_result
) {
958 /* a & b should be normals now... */
959 assert(a
.cls
== float_class_normal
&&
960 b
.cls
== float_class_normal
);
962 p_exp
= a
.exp
+ b
.exp
;
964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
967 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
968 /* binary point now at bit 124 */
970 /* check for overflow */
971 if (hi
& (1ULL << (DECOMPOSED_BINARY_POINT
* 2 + 1 - 64))) {
972 shift128RightJamming(hi
, lo
, 1, &hi
, &lo
);
977 if (c
.cls
== float_class_zero
) {
978 /* move binary point back to 62 */
979 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
981 int exp_diff
= p_exp
- c
.exp
;
982 if (p_sign
== c
.sign
) {
985 shift128RightJamming(hi
, lo
,
986 DECOMPOSED_BINARY_POINT
- exp_diff
,
992 /* shift c to the same binary point as the product (124) */
995 shift128RightJamming(c_hi
, c_lo
,
998 add128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
999 /* move binary point back to 62 */
1000 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1003 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1004 shift64RightJamming(lo
, 1, &lo
);
1010 uint64_t c_hi
, c_lo
;
1011 /* make C binary point match product at bit 124 */
1015 if (exp_diff
<= 0) {
1016 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1019 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1020 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1022 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1027 shift128RightJamming(c_hi
, c_lo
,
1030 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1033 if (hi
== 0 && lo
== 0) {
1034 a
.cls
= float_class_zero
;
1035 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1036 a
.sign
^= sign_flip
;
1043 shift
= clz64(lo
) + 64;
1045 /* Normalizing to a binary point of 124 is the
1046 correct adjust for the exponent. However since we're
1047 shifting, we might as well put the binary point back
1048 at 62 where we really want it. Therefore shift as
1049 if we're leaving 1 bit at the top of the word, but
1050 adjust the exponent as if we're leaving 3 bits. */
1053 lo
= lo
<< (shift
- 64);
1055 hi
= (hi
<< shift
) | (lo
>> (64 - shift
));
1056 lo
= hi
| ((lo
<< shift
) != 0);
1063 if (flags
& float_muladd_halve_result
) {
1067 /* finally prepare our result */
1068 a
.cls
= float_class_normal
;
1069 a
.sign
= p_sign
^ sign_flip
;
1076 float16
__attribute__((flatten
)) float16_muladd(float16 a
, float16 b
, float16 c
,
1077 int flags
, float_status
*status
)
1079 FloatParts pa
= float16_unpack_canonical(a
, status
);
1080 FloatParts pb
= float16_unpack_canonical(b
, status
);
1081 FloatParts pc
= float16_unpack_canonical(c
, status
);
1082 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1084 return float16_round_pack_canonical(pr
, status
);
1087 float32
__attribute__((flatten
)) float32_muladd(float32 a
, float32 b
, float32 c
,
1088 int flags
, float_status
*status
)
1090 FloatParts pa
= float32_unpack_canonical(a
, status
);
1091 FloatParts pb
= float32_unpack_canonical(b
, status
);
1092 FloatParts pc
= float32_unpack_canonical(c
, status
);
1093 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1095 return float32_round_pack_canonical(pr
, status
);
1098 float64
__attribute__((flatten
)) float64_muladd(float64 a
, float64 b
, float64 c
,
1099 int flags
, float_status
*status
)
1101 FloatParts pa
= float64_unpack_canonical(a
, status
);
1102 FloatParts pb
= float64_unpack_canonical(b
, status
);
1103 FloatParts pc
= float64_unpack_canonical(c
, status
);
1104 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1106 return float64_round_pack_canonical(pr
, status
);
1110 * Returns the result of dividing the floating-point value `a' by the
1111 * corresponding value `b'. The operation is performed according to
1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 static FloatParts
div_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1117 bool sign
= a
.sign
^ b
.sign
;
1119 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1120 uint64_t temp_lo
, temp_hi
;
1121 int exp
= a
.exp
- b
.exp
;
1122 if (a
.frac
< b
.frac
) {
1124 shortShift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1,
1125 &temp_hi
, &temp_lo
);
1127 shortShift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
,
1128 &temp_hi
, &temp_lo
);
1130 /* LSB of quot is set if inexact which roundandpack will use
1131 * to set flags. Yet again we re-use a for the result */
1132 a
.frac
= div128To64(temp_lo
, temp_hi
, b
.frac
);
1137 /* handle all the NaN cases */
1138 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1139 return pick_nan(a
, b
, s
);
1141 /* 0/0 or Inf/Inf */
1144 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1145 s
->float_exception_flags
|= float_flag_invalid
;
1146 a
.cls
= float_class_dnan
;
1150 if (b
.cls
== float_class_zero
) {
1151 s
->float_exception_flags
|= float_flag_divbyzero
;
1152 a
.cls
= float_class_inf
;
1156 /* Inf / x or 0 / x */
1157 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1162 if (b
.cls
== float_class_inf
) {
1163 a
.cls
= float_class_zero
;
1167 g_assert_not_reached();
1170 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1172 FloatParts pa
= float16_unpack_canonical(a
, status
);
1173 FloatParts pb
= float16_unpack_canonical(b
, status
);
1174 FloatParts pr
= div_floats(pa
, pb
, status
);
1176 return float16_round_pack_canonical(pr
, status
);
1179 float32
float32_div(float32 a
, float32 b
, float_status
*status
)
1181 FloatParts pa
= float32_unpack_canonical(a
, status
);
1182 FloatParts pb
= float32_unpack_canonical(b
, status
);
1183 FloatParts pr
= div_floats(pa
, pb
, status
);
1185 return float32_round_pack_canonical(pr
, status
);
1188 float64
float64_div(float64 a
, float64 b
, float_status
*status
)
1190 FloatParts pa
= float64_unpack_canonical(a
, status
);
1191 FloatParts pb
= float64_unpack_canonical(b
, status
);
1192 FloatParts pr
= div_floats(pa
, pb
, status
);
1194 return float64_round_pack_canonical(pr
, status
);
1198 * Rounds the floating-point value `a' to an integer, and returns the
1199 * result as a floating-point value. The operation is performed
1200 * according to the IEC/IEEE Standard for Binary Floating-Point
1204 static FloatParts
round_to_int(FloatParts a
, int rounding_mode
, float_status
*s
)
1206 if (is_nan(a
.cls
)) {
1207 return return_nan(a
, s
);
1211 case float_class_zero
:
1212 case float_class_inf
:
1213 case float_class_qnan
:
1214 /* already "integral" */
1216 case float_class_normal
:
1217 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
1218 /* already integral */
1223 /* all fractional */
1224 s
->float_exception_flags
|= float_flag_inexact
;
1225 switch (rounding_mode
) {
1226 case float_round_nearest_even
:
1227 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
1229 case float_round_ties_away
:
1230 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
1232 case float_round_to_zero
:
1235 case float_round_up
:
1238 case float_round_down
:
1242 g_assert_not_reached();
1246 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
1249 a
.cls
= float_class_zero
;
1252 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
1253 uint64_t frac_lsbm1
= frac_lsb
>> 1;
1254 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
1255 uint64_t rnd_mask
= rnd_even_mask
>> 1;
1258 switch (rounding_mode
) {
1259 case float_round_nearest_even
:
1260 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
1262 case float_round_ties_away
:
1265 case float_round_to_zero
:
1268 case float_round_up
:
1269 inc
= a
.sign
? 0 : rnd_mask
;
1271 case float_round_down
:
1272 inc
= a
.sign
? rnd_mask
: 0;
1275 g_assert_not_reached();
1278 if (a
.frac
& rnd_mask
) {
1279 s
->float_exception_flags
|= float_flag_inexact
;
1281 a
.frac
&= ~rnd_mask
;
1282 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
1290 g_assert_not_reached();
1295 float16
float16_round_to_int(float16 a
, float_status
*s
)
1297 FloatParts pa
= float16_unpack_canonical(a
, s
);
1298 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, s
);
1299 return float16_round_pack_canonical(pr
, s
);
1302 float32
float32_round_to_int(float32 a
, float_status
*s
)
1304 FloatParts pa
= float32_unpack_canonical(a
, s
);
1305 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, s
);
1306 return float32_round_pack_canonical(pr
, s
);
1309 float64
float64_round_to_int(float64 a
, float_status
*s
)
1311 FloatParts pa
= float64_unpack_canonical(a
, s
);
1312 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, s
);
1313 return float64_round_pack_canonical(pr
, s
);
1316 float64
float64_trunc_to_int(float64 a
, float_status
*s
)
1318 FloatParts pa
= float64_unpack_canonical(a
, s
);
1319 FloatParts pr
= round_to_int(pa
, float_round_to_zero
, s
);
1320 return float64_round_pack_canonical(pr
, s
);
1324 * Returns the result of converting the floating-point value `a' to
1325 * the two's complement integer format. The conversion is performed
1326 * according to the IEC/IEEE Standard for Binary Floating-Point
1327 * Arithmetic---which means in particular that the conversion is
1328 * rounded according to the current rounding mode. If `a' is a NaN,
1329 * the largest positive integer is returned. Otherwise, if the
1330 * conversion overflows, the largest integer with the same sign as `a'
1334 static int64_t round_to_int_and_pack(FloatParts in
, int rmode
,
1335 int64_t min
, int64_t max
,
1339 int orig_flags
= get_float_exception_flags(s
);
1340 FloatParts p
= round_to_int(in
, rmode
, s
);
1343 case float_class_snan
:
1344 case float_class_qnan
:
1345 case float_class_dnan
:
1346 case float_class_msnan
:
1347 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1349 case float_class_inf
:
1350 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1351 return p
.sign
? min
: max
;
1352 case float_class_zero
:
1354 case float_class_normal
:
1355 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
1356 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
1357 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
1358 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
1363 if (r
< -(uint64_t) min
) {
1366 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1373 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1378 g_assert_not_reached();
1382 #define FLOAT_TO_INT(fsz, isz) \
1383 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1386 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1387 return round_to_int_and_pack(p, s->float_rounding_mode, \
1388 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1392 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1393 (float ## fsz a, float_status *s) \
1395 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1396 return round_to_int_and_pack(p, float_round_to_zero, \
1397 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1401 FLOAT_TO_INT(16, 16)
1402 FLOAT_TO_INT(16, 32)
1403 FLOAT_TO_INT(16, 64)
1405 FLOAT_TO_INT(32, 16)
1406 FLOAT_TO_INT(32, 32)
1407 FLOAT_TO_INT(32, 64)
1409 FLOAT_TO_INT(64, 16)
1410 FLOAT_TO_INT(64, 32)
1411 FLOAT_TO_INT(64, 64)
1416 * Returns the result of converting the floating-point value `a' to
1417 * the unsigned integer format. The conversion is performed according
1418 * to the IEC/IEEE Standard for Binary Floating-Point
1419 * Arithmetic---which means in particular that the conversion is
1420 * rounded according to the current rounding mode. If `a' is a NaN,
1421 * the largest unsigned integer is returned. Otherwise, if the
1422 * conversion overflows, the largest unsigned integer is returned. If
1423 * the 'a' is negative, the result is rounded and zero is returned;
1424 * values that do not round to zero will raise the inexact exception
1428 static uint64_t round_to_uint_and_pack(FloatParts in
, int rmode
, uint64_t max
,
1431 int orig_flags
= get_float_exception_flags(s
);
1432 FloatParts p
= round_to_int(in
, rmode
, s
);
1435 case float_class_snan
:
1436 case float_class_qnan
:
1437 case float_class_dnan
:
1438 case float_class_msnan
:
1439 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1441 case float_class_inf
:
1442 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1443 return p
.sign
? 0 : max
;
1444 case float_class_zero
:
1446 case float_class_normal
:
1450 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1454 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
1455 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
1456 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
1457 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
1459 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1463 /* For uint64 this will never trip, but if p.exp is too large
1464 * to shift a decomposed fraction we shall have exited via the
1468 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1475 g_assert_not_reached();
1479 #define FLOAT_TO_UINT(fsz, isz) \
1480 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1483 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1484 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1485 UINT ## isz ## _MAX, s); \
1488 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1489 (float ## fsz a, float_status *s) \
1491 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1492 return round_to_uint_and_pack(p, float_round_to_zero, \
1493 UINT ## isz ## _MAX, s); \
1496 FLOAT_TO_UINT(16, 16)
1497 FLOAT_TO_UINT(16, 32)
1498 FLOAT_TO_UINT(16, 64)
1500 FLOAT_TO_UINT(32, 16)
1501 FLOAT_TO_UINT(32, 32)
1502 FLOAT_TO_UINT(32, 64)
1504 FLOAT_TO_UINT(64, 16)
1505 FLOAT_TO_UINT(64, 32)
1506 FLOAT_TO_UINT(64, 64)
1508 #undef FLOAT_TO_UINT
1511 * Integer to float conversions
1513 * Returns the result of converting the two's complement integer `a'
1514 * to the floating-point format. The conversion is performed according
1515 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1518 static FloatParts
int_to_float(int64_t a
, float_status
*status
)
1522 r
.cls
= float_class_zero
;
1524 } else if (a
== (1ULL << 63)) {
1525 r
.cls
= float_class_normal
;
1527 r
.frac
= DECOMPOSED_IMPLICIT_BIT
;
1538 int shift
= clz64(f
) - 1;
1539 r
.cls
= float_class_normal
;
1540 r
.exp
= (DECOMPOSED_BINARY_POINT
- shift
);
1541 r
.frac
= f
<< shift
;
1547 float16
int64_to_float16(int64_t a
, float_status
*status
)
1549 FloatParts pa
= int_to_float(a
, status
);
1550 return float16_round_pack_canonical(pa
, status
);
1553 float16
int32_to_float16(int32_t a
, float_status
*status
)
1555 return int64_to_float16(a
, status
);
1558 float16
int16_to_float16(int16_t a
, float_status
*status
)
1560 return int64_to_float16(a
, status
);
1563 float32
int64_to_float32(int64_t a
, float_status
*status
)
1565 FloatParts pa
= int_to_float(a
, status
);
1566 return float32_round_pack_canonical(pa
, status
);
1569 float32
int32_to_float32(int32_t a
, float_status
*status
)
1571 return int64_to_float32(a
, status
);
1574 float32
int16_to_float32(int16_t a
, float_status
*status
)
1576 return int64_to_float32(a
, status
);
1579 float64
int64_to_float64(int64_t a
, float_status
*status
)
1581 FloatParts pa
= int_to_float(a
, status
);
1582 return float64_round_pack_canonical(pa
, status
);
1585 float64
int32_to_float64(int32_t a
, float_status
*status
)
1587 return int64_to_float64(a
, status
);
1590 float64
int16_to_float64(int16_t a
, float_status
*status
)
1592 return int64_to_float64(a
, status
);
1597 * Unsigned Integer to float conversions
1599 * Returns the result of converting the unsigned integer `a' to the
1600 * floating-point format. The conversion is performed according to the
1601 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1604 static FloatParts
uint_to_float(uint64_t a
, float_status
*status
)
1606 FloatParts r
= { .sign
= false};
1609 r
.cls
= float_class_zero
;
1611 int spare_bits
= clz64(a
) - 1;
1612 r
.cls
= float_class_normal
;
1613 r
.exp
= DECOMPOSED_BINARY_POINT
- spare_bits
;
1614 if (spare_bits
< 0) {
1615 shift64RightJamming(a
, -spare_bits
, &a
);
1618 r
.frac
= a
<< spare_bits
;
1625 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
1627 FloatParts pa
= uint_to_float(a
, status
);
1628 return float16_round_pack_canonical(pa
, status
);
1631 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
1633 return uint64_to_float16(a
, status
);
1636 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
1638 return uint64_to_float16(a
, status
);
1641 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
1643 FloatParts pa
= uint_to_float(a
, status
);
1644 return float32_round_pack_canonical(pa
, status
);
1647 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
1649 return uint64_to_float32(a
, status
);
1652 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
1654 return uint64_to_float32(a
, status
);
1657 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
1659 FloatParts pa
= uint_to_float(a
, status
);
1660 return float64_round_pack_canonical(pa
, status
);
1663 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
1665 return uint64_to_float64(a
, status
);
1668 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
1670 return uint64_to_float64(a
, status
);
1674 /* min() and max() functions. These can't be implemented as
1675 * 'compare and pick one input' because that would mishandle
1676 * NaNs and +0 vs -0.
1678 * minnum() and maxnum() functions. These are similar to the min()
1679 * and max() functions but if one of the arguments is a QNaN and
1680 * the other is numerical then the numerical argument is returned.
1681 * SNaNs will get quietened before being returned.
1682 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1683 * and maxNum() operations. min() and max() are the typical min/max
1684 * semantics provided by many CPUs which predate that specification.
1686 * minnummag() and maxnummag() functions correspond to minNumMag()
1687 * and minNumMag() from the IEEE-754 2008.
1689 static FloatParts
minmax_floats(FloatParts a
, FloatParts b
, bool ismin
,
1690 bool ieee
, bool ismag
, float_status
*s
)
1692 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
1694 /* Takes two floating-point values `a' and `b', one of
1695 * which is a NaN, and returns the appropriate NaN
1696 * result. If either `a' or `b' is a signaling NaN,
1697 * the invalid exception is raised.
1699 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
1700 return pick_nan(a
, b
, s
);
1701 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
1703 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
1707 return pick_nan(a
, b
, s
);
1712 case float_class_normal
:
1715 case float_class_inf
:
1718 case float_class_zero
:
1722 g_assert_not_reached();
1726 case float_class_normal
:
1729 case float_class_inf
:
1732 case float_class_zero
:
1736 g_assert_not_reached();
1740 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
1741 bool a_less
= a_exp
< b_exp
;
1742 if (a_exp
== b_exp
) {
1743 a_less
= a
.frac
< b
.frac
;
1745 return a_less
^ ismin
? b
: a
;
1748 if (a
.sign
== b
.sign
) {
1749 bool a_less
= a_exp
< b_exp
;
1750 if (a_exp
== b_exp
) {
1751 a_less
= a
.frac
< b
.frac
;
1753 return a
.sign
^ a_less
^ ismin
? b
: a
;
1755 return a
.sign
^ ismin
? b
: a
;
1760 #define MINMAX(sz, name, ismin, isiee, ismag) \
1761 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1764 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1765 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1766 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1768 return float ## sz ## _round_pack_canonical(pr, s); \
1771 MINMAX(16, min
, true, false, false)
1772 MINMAX(16, minnum
, true, true, false)
1773 MINMAX(16, minnummag
, true, true, true)
1774 MINMAX(16, max
, false, false, false)
1775 MINMAX(16, maxnum
, false, true, false)
1776 MINMAX(16, maxnummag
, false, true, true)
1778 MINMAX(32, min
, true, false, false)
1779 MINMAX(32, minnum
, true, true, false)
1780 MINMAX(32, minnummag
, true, true, true)
1781 MINMAX(32, max
, false, false, false)
1782 MINMAX(32, maxnum
, false, true, false)
1783 MINMAX(32, maxnummag
, false, true, true)
1785 MINMAX(64, min
, true, false, false)
1786 MINMAX(64, minnum
, true, true, false)
1787 MINMAX(64, minnummag
, true, true, true)
1788 MINMAX(64, max
, false, false, false)
1789 MINMAX(64, maxnum
, false, true, false)
1790 MINMAX(64, maxnummag
, false, true, true)
1794 /* Floating point compare */
1795 static int compare_floats(FloatParts a
, FloatParts b
, bool is_quiet
,
1798 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1800 a
.cls
== float_class_snan
||
1801 b
.cls
== float_class_snan
) {
1802 s
->float_exception_flags
|= float_flag_invalid
;
1804 return float_relation_unordered
;
1807 if (a
.cls
== float_class_zero
) {
1808 if (b
.cls
== float_class_zero
) {
1809 return float_relation_equal
;
1811 return b
.sign
? float_relation_greater
: float_relation_less
;
1812 } else if (b
.cls
== float_class_zero
) {
1813 return a
.sign
? float_relation_less
: float_relation_greater
;
1816 /* The only really important thing about infinity is its sign. If
1817 * both are infinities the sign marks the smallest of the two.
1819 if (a
.cls
== float_class_inf
) {
1820 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
1821 return float_relation_equal
;
1823 return a
.sign
? float_relation_less
: float_relation_greater
;
1824 } else if (b
.cls
== float_class_inf
) {
1825 return b
.sign
? float_relation_greater
: float_relation_less
;
1828 if (a
.sign
!= b
.sign
) {
1829 return a
.sign
? float_relation_less
: float_relation_greater
;
1832 if (a
.exp
== b
.exp
) {
1833 if (a
.frac
== b
.frac
) {
1834 return float_relation_equal
;
1837 return a
.frac
> b
.frac
?
1838 float_relation_less
: float_relation_greater
;
1840 return a
.frac
> b
.frac
?
1841 float_relation_greater
: float_relation_less
;
1845 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
1847 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
1852 #define COMPARE(sz) \
1853 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1856 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1857 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1858 return compare_floats(pa, pb, false, s); \
1860 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1863 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1864 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1865 return compare_floats(pa, pb, true, s); \
1874 /* Multiply A by 2 raised to the power N. */
1875 static FloatParts
scalbn_decomposed(FloatParts a
, int n
, float_status
*s
)
1877 if (unlikely(is_nan(a
.cls
))) {
1878 return return_nan(a
, s
);
1880 if (a
.cls
== float_class_normal
) {
1886 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
1888 FloatParts pa
= float16_unpack_canonical(a
, status
);
1889 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
1890 return float16_round_pack_canonical(pr
, status
);
1893 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
1895 FloatParts pa
= float32_unpack_canonical(a
, status
);
1896 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
1897 return float32_round_pack_canonical(pr
, status
);
1900 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
1902 FloatParts pa
= float64_unpack_canonical(a
, status
);
1903 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
1904 return float64_round_pack_canonical(pr
, status
);
1910 * The old softfloat code did an approximation step before zeroing in
1911 * on the final result. However for simpleness we just compute the
1912 * square root by iterating down from the implicit bit to enough extra
1913 * bits to ensure we get a correctly rounded result.
1915 * This does mean however the calculation is slower than before,
1916 * especially for 64 bit floats.
1919 static FloatParts
sqrt_float(FloatParts a
, float_status
*s
, const FloatFmt
*p
)
1921 uint64_t a_frac
, r_frac
, s_frac
;
1924 if (is_nan(a
.cls
)) {
1925 return return_nan(a
, s
);
1927 if (a
.cls
== float_class_zero
) {
1928 return a
; /* sqrt(+-0) = +-0 */
1931 s
->float_exception_flags
|= float_flag_invalid
;
1932 a
.cls
= float_class_dnan
;
1935 if (a
.cls
== float_class_inf
) {
1936 return a
; /* sqrt(+inf) = +inf */
1939 assert(a
.cls
== float_class_normal
);
1941 /* We need two overflow bits at the top. Adding room for that is a
1942 * right shift. If the exponent is odd, we can discard the low bit
1943 * by multiplying the fraction by 2; that's a left shift. Combine
1944 * those and we shift right if the exponent is even.
1952 /* Bit-by-bit computation of sqrt. */
1956 /* Iterate from implicit bit down to the 3 extra bits to compute a
1957 * properly rounded result. Remember we've inserted one more bit
1958 * at the top, so these positions are one less.
1960 bit
= DECOMPOSED_BINARY_POINT
- 1;
1961 last_bit
= MAX(p
->frac_shift
- 4, 0);
1963 uint64_t q
= 1ULL << bit
;
1964 uint64_t t_frac
= s_frac
+ q
;
1965 if (t_frac
<= a_frac
) {
1966 s_frac
= t_frac
+ q
;
1971 } while (--bit
>= last_bit
);
1973 /* Undo the right shift done above. If there is any remaining
1974 * fraction, the result is inexact. Set the sticky bit.
1976 a
.frac
= (r_frac
<< 1) + (a_frac
!= 0);
1981 float16
__attribute__((flatten
)) float16_sqrt(float16 a
, float_status
*status
)
1983 FloatParts pa
= float16_unpack_canonical(a
, status
);
1984 FloatParts pr
= sqrt_float(pa
, status
, &float16_params
);
1985 return float16_round_pack_canonical(pr
, status
);
1988 float32
__attribute__((flatten
)) float32_sqrt(float32 a
, float_status
*status
)
1990 FloatParts pa
= float32_unpack_canonical(a
, status
);
1991 FloatParts pr
= sqrt_float(pa
, status
, &float32_params
);
1992 return float32_round_pack_canonical(pr
, status
);
1995 float64
__attribute__((flatten
)) float64_sqrt(float64 a
, float_status
*status
)
1997 FloatParts pa
= float64_unpack_canonical(a
, status
);
1998 FloatParts pr
= sqrt_float(pa
, status
, &float64_params
);
1999 return float64_round_pack_canonical(pr
, status
);
2003 /*----------------------------------------------------------------------------
2004 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2005 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2006 | input. If `zSign' is 1, the input is negated before being converted to an
2007 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2008 | is simply rounded to an integer, with the inexact exception raised if the
2009 | input cannot be represented exactly as an integer. However, if the fixed-
2010 | point input is too large, the invalid exception is raised and the largest
2011 | positive or negative integer is returned.
2012 *----------------------------------------------------------------------------*/
2014 static int32_t roundAndPackInt32(flag zSign
, uint64_t absZ
, float_status
*status
)
2016 int8_t roundingMode
;
2017 flag roundNearestEven
;
2018 int8_t roundIncrement
, roundBits
;
2021 roundingMode
= status
->float_rounding_mode
;
2022 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2023 switch (roundingMode
) {
2024 case float_round_nearest_even
:
2025 case float_round_ties_away
:
2026 roundIncrement
= 0x40;
2028 case float_round_to_zero
:
2031 case float_round_up
:
2032 roundIncrement
= zSign
? 0 : 0x7f;
2034 case float_round_down
:
2035 roundIncrement
= zSign
? 0x7f : 0;
2040 roundBits
= absZ
& 0x7F;
2041 absZ
= ( absZ
+ roundIncrement
)>>7;
2042 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
2044 if ( zSign
) z
= - z
;
2045 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
2046 float_raise(float_flag_invalid
, status
);
2047 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
2050 status
->float_exception_flags
|= float_flag_inexact
;
2056 /*----------------------------------------------------------------------------
2057 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2058 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2059 | and returns the properly rounded 64-bit integer corresponding to the input.
2060 | If `zSign' is 1, the input is negated before being converted to an integer.
2061 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2062 | the inexact exception raised if the input cannot be represented exactly as
2063 | an integer. However, if the fixed-point input is too large, the invalid
2064 | exception is raised and the largest positive or negative integer is
2066 *----------------------------------------------------------------------------*/
2068 static int64_t roundAndPackInt64(flag zSign
, uint64_t absZ0
, uint64_t absZ1
,
2069 float_status
*status
)
2071 int8_t roundingMode
;
2072 flag roundNearestEven
, increment
;
2075 roundingMode
= status
->float_rounding_mode
;
2076 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2077 switch (roundingMode
) {
2078 case float_round_nearest_even
:
2079 case float_round_ties_away
:
2080 increment
= ((int64_t) absZ1
< 0);
2082 case float_round_to_zero
:
2085 case float_round_up
:
2086 increment
= !zSign
&& absZ1
;
2088 case float_round_down
:
2089 increment
= zSign
&& absZ1
;
2096 if ( absZ0
== 0 ) goto overflow
;
2097 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
2100 if ( zSign
) z
= - z
;
2101 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
2103 float_raise(float_flag_invalid
, status
);
2105 zSign
? (int64_t) LIT64( 0x8000000000000000 )
2106 : LIT64( 0x7FFFFFFFFFFFFFFF );
2109 status
->float_exception_flags
|= float_flag_inexact
;
2115 /*----------------------------------------------------------------------------
2116 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2117 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2118 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2119 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2120 | with the inexact exception raised if the input cannot be represented exactly
2121 | as an integer. However, if the fixed-point input is too large, the invalid
2122 | exception is raised and the largest unsigned integer is returned.
2123 *----------------------------------------------------------------------------*/
2125 static int64_t roundAndPackUint64(flag zSign
, uint64_t absZ0
,
2126 uint64_t absZ1
, float_status
*status
)
2128 int8_t roundingMode
;
2129 flag roundNearestEven
, increment
;
2131 roundingMode
= status
->float_rounding_mode
;
2132 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
2133 switch (roundingMode
) {
2134 case float_round_nearest_even
:
2135 case float_round_ties_away
:
2136 increment
= ((int64_t)absZ1
< 0);
2138 case float_round_to_zero
:
2141 case float_round_up
:
2142 increment
= !zSign
&& absZ1
;
2144 case float_round_down
:
2145 increment
= zSign
&& absZ1
;
2153 float_raise(float_flag_invalid
, status
);
2154 return LIT64(0xFFFFFFFFFFFFFFFF);
2156 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
2159 if (zSign
&& absZ0
) {
2160 float_raise(float_flag_invalid
, status
);
2165 status
->float_exception_flags
|= float_flag_inexact
;
2170 /*----------------------------------------------------------------------------
2171 | If `a' is denormal and we are in flush-to-zero mode then set the
2172 | input-denormal exception and return zero. Otherwise just return the value.
2173 *----------------------------------------------------------------------------*/
2174 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
2176 if (status
->flush_inputs_to_zero
) {
2177 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
2178 float_raise(float_flag_input_denormal
, status
);
2179 return make_float32(float32_val(a
) & 0x80000000);
2185 /*----------------------------------------------------------------------------
2186 | Normalizes the subnormal single-precision floating-point value represented
2187 | by the denormalized significand `aSig'. The normalized exponent and
2188 | significand are stored at the locations pointed to by `zExpPtr' and
2189 | `zSigPtr', respectively.
2190 *----------------------------------------------------------------------------*/
2193 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
2197 shiftCount
= countLeadingZeros32( aSig
) - 8;
2198 *zSigPtr
= aSig
<<shiftCount
;
2199 *zExpPtr
= 1 - shiftCount
;
2203 /*----------------------------------------------------------------------------
2204 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2205 | and significand `zSig', and returns the proper single-precision floating-
2206 | point value corresponding to the abstract input. Ordinarily, the abstract
2207 | value is simply rounded and packed into the single-precision format, with
2208 | the inexact exception raised if the abstract input cannot be represented
2209 | exactly. However, if the abstract value is too large, the overflow and
2210 | inexact exceptions are raised and an infinity or maximal finite value is
2211 | returned. If the abstract value is too small, the input value is rounded to
2212 | a subnormal number, and the underflow and inexact exceptions are raised if
2213 | the abstract input cannot be represented exactly as a subnormal single-
2214 | precision floating-point number.
2215 | The input significand `zSig' has its binary point between bits 30
2216 | and 29, which is 7 bits to the left of the usual location. This shifted
2217 | significand must be normalized or smaller. If `zSig' is not normalized,
2218 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2219 | and it must not require rounding. In the usual case that `zSig' is
2220 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2221 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2222 | Binary Floating-Point Arithmetic.
2223 *----------------------------------------------------------------------------*/
2225 static float32
roundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
2226 float_status
*status
)
2228 int8_t roundingMode
;
2229 flag roundNearestEven
;
2230 int8_t roundIncrement
, roundBits
;
2233 roundingMode
= status
->float_rounding_mode
;
2234 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2235 switch (roundingMode
) {
2236 case float_round_nearest_even
:
2237 case float_round_ties_away
:
2238 roundIncrement
= 0x40;
2240 case float_round_to_zero
:
2243 case float_round_up
:
2244 roundIncrement
= zSign
? 0 : 0x7f;
2246 case float_round_down
:
2247 roundIncrement
= zSign
? 0x7f : 0;
2253 roundBits
= zSig
& 0x7F;
2254 if ( 0xFD <= (uint16_t) zExp
) {
2255 if ( ( 0xFD < zExp
)
2256 || ( ( zExp
== 0xFD )
2257 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
2259 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2260 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
2263 if (status
->flush_to_zero
) {
2264 float_raise(float_flag_output_denormal
, status
);
2265 return packFloat32(zSign
, 0, 0);
2268 (status
->float_detect_tininess
2269 == float_tininess_before_rounding
)
2271 || ( zSig
+ roundIncrement
< 0x80000000 );
2272 shift32RightJamming( zSig
, - zExp
, &zSig
);
2274 roundBits
= zSig
& 0x7F;
2275 if (isTiny
&& roundBits
) {
2276 float_raise(float_flag_underflow
, status
);
2281 status
->float_exception_flags
|= float_flag_inexact
;
2283 zSig
= ( zSig
+ roundIncrement
)>>7;
2284 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
2285 if ( zSig
== 0 ) zExp
= 0;
2286 return packFloat32( zSign
, zExp
, zSig
);
2290 /*----------------------------------------------------------------------------
2291 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2292 | and significand `zSig', and returns the proper single-precision floating-
2293 | point value corresponding to the abstract input. This routine is just like
2294 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2295 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2296 | floating-point exponent.
2297 *----------------------------------------------------------------------------*/
2300 normalizeRoundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
2301 float_status
*status
)
2305 shiftCount
= countLeadingZeros32( zSig
) - 1;
2306 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
2311 /*----------------------------------------------------------------------------
2312 | If `a' is denormal and we are in flush-to-zero mode then set the
2313 | input-denormal exception and return zero. Otherwise just return the value.
2314 *----------------------------------------------------------------------------*/
2315 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
2317 if (status
->flush_inputs_to_zero
) {
2318 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
2319 float_raise(float_flag_input_denormal
, status
);
2320 return make_float64(float64_val(a
) & (1ULL << 63));
2326 /*----------------------------------------------------------------------------
2327 | Normalizes the subnormal double-precision floating-point value represented
2328 | by the denormalized significand `aSig'. The normalized exponent and
2329 | significand are stored at the locations pointed to by `zExpPtr' and
2330 | `zSigPtr', respectively.
2331 *----------------------------------------------------------------------------*/
2334 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
2338 shiftCount
= countLeadingZeros64( aSig
) - 11;
2339 *zSigPtr
= aSig
<<shiftCount
;
2340 *zExpPtr
= 1 - shiftCount
;
2344 /*----------------------------------------------------------------------------
2345 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2346 | double-precision floating-point value, returning the result. After being
2347 | shifted into the proper positions, the three fields are simply added
2348 | together to form the result. This means that any integer portion of `zSig'
2349 | will be added into the exponent. Since a properly normalized significand
2350 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2351 | than the desired result exponent whenever `zSig' is a complete, normalized
2353 *----------------------------------------------------------------------------*/
2355 static inline float64
packFloat64(flag zSign
, int zExp
, uint64_t zSig
)
2358 return make_float64(
2359 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
2363 /*----------------------------------------------------------------------------
2364 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2365 | and significand `zSig', and returns the proper double-precision floating-
2366 | point value corresponding to the abstract input. Ordinarily, the abstract
2367 | value is simply rounded and packed into the double-precision format, with
2368 | the inexact exception raised if the abstract input cannot be represented
2369 | exactly. However, if the abstract value is too large, the overflow and
2370 | inexact exceptions are raised and an infinity or maximal finite value is
2371 | returned. If the abstract value is too small, the input value is rounded to
2372 | a subnormal number, and the underflow and inexact exceptions are raised if
2373 | the abstract input cannot be represented exactly as a subnormal double-
2374 | precision floating-point number.
2375 | The input significand `zSig' has its binary point between bits 62
2376 | and 61, which is 10 bits to the left of the usual location. This shifted
2377 | significand must be normalized or smaller. If `zSig' is not normalized,
2378 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2379 | and it must not require rounding. In the usual case that `zSig' is
2380 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2381 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2382 | Binary Floating-Point Arithmetic.
2383 *----------------------------------------------------------------------------*/
2385 static float64
roundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
2386 float_status
*status
)
2388 int8_t roundingMode
;
2389 flag roundNearestEven
;
2390 int roundIncrement
, roundBits
;
2393 roundingMode
= status
->float_rounding_mode
;
2394 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2395 switch (roundingMode
) {
2396 case float_round_nearest_even
:
2397 case float_round_ties_away
:
2398 roundIncrement
= 0x200;
2400 case float_round_to_zero
:
2403 case float_round_up
:
2404 roundIncrement
= zSign
? 0 : 0x3ff;
2406 case float_round_down
:
2407 roundIncrement
= zSign
? 0x3ff : 0;
2409 case float_round_to_odd
:
2410 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
2415 roundBits
= zSig
& 0x3FF;
2416 if ( 0x7FD <= (uint16_t) zExp
) {
2417 if ( ( 0x7FD < zExp
)
2418 || ( ( zExp
== 0x7FD )
2419 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
2421 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
2422 roundIncrement
!= 0;
2423 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2424 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
2427 if (status
->flush_to_zero
) {
2428 float_raise(float_flag_output_denormal
, status
);
2429 return packFloat64(zSign
, 0, 0);
2432 (status
->float_detect_tininess
2433 == float_tininess_before_rounding
)
2435 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
2436 shift64RightJamming( zSig
, - zExp
, &zSig
);
2438 roundBits
= zSig
& 0x3FF;
2439 if (isTiny
&& roundBits
) {
2440 float_raise(float_flag_underflow
, status
);
2442 if (roundingMode
== float_round_to_odd
) {
2444 * For round-to-odd case, the roundIncrement depends on
2445 * zSig which just changed.
2447 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
2452 status
->float_exception_flags
|= float_flag_inexact
;
2454 zSig
= ( zSig
+ roundIncrement
)>>10;
2455 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
2456 if ( zSig
== 0 ) zExp
= 0;
2457 return packFloat64( zSign
, zExp
, zSig
);
2461 /*----------------------------------------------------------------------------
2462 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2463 | and significand `zSig', and returns the proper double-precision floating-
2464 | point value corresponding to the abstract input. This routine is just like
2465 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2466 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2467 | floating-point exponent.
2468 *----------------------------------------------------------------------------*/
2471 normalizeRoundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
2472 float_status
*status
)
2476 shiftCount
= countLeadingZeros64( zSig
) - 1;
2477 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
2482 /*----------------------------------------------------------------------------
2483 | Normalizes the subnormal extended double-precision floating-point value
2484 | represented by the denormalized significand `aSig'. The normalized exponent
2485 | and significand are stored at the locations pointed to by `zExpPtr' and
2486 | `zSigPtr', respectively.
2487 *----------------------------------------------------------------------------*/
2489 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
2494 shiftCount
= countLeadingZeros64( aSig
);
2495 *zSigPtr
= aSig
<<shiftCount
;
2496 *zExpPtr
= 1 - shiftCount
;
2499 /*----------------------------------------------------------------------------
2500 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2501 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2502 | and returns the proper extended double-precision floating-point value
2503 | corresponding to the abstract input. Ordinarily, the abstract value is
2504 | rounded and packed into the extended double-precision format, with the
2505 | inexact exception raised if the abstract input cannot be represented
2506 | exactly. However, if the abstract value is too large, the overflow and
2507 | inexact exceptions are raised and an infinity or maximal finite value is
2508 | returned. If the abstract value is too small, the input value is rounded to
2509 | a subnormal number, and the underflow and inexact exceptions are raised if
2510 | the abstract input cannot be represented exactly as a subnormal extended
2511 | double-precision floating-point number.
2512 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2513 | number of bits as single or double precision, respectively. Otherwise, the
2514 | result is rounded to the full precision of the extended double-precision
2516 | The input significand must be normalized or smaller. If the input
2517 | significand is not normalized, `zExp' must be 0; in that case, the result
2518 | returned is a subnormal number, and it must not require rounding. The
2519 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2520 | Floating-Point Arithmetic.
2521 *----------------------------------------------------------------------------*/
2523 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, flag zSign
,
2524 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
2525 float_status
*status
)
2527 int8_t roundingMode
;
2528 flag roundNearestEven
, increment
, isTiny
;
2529 int64_t roundIncrement
, roundMask
, roundBits
;
2531 roundingMode
= status
->float_rounding_mode
;
2532 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2533 if ( roundingPrecision
== 80 ) goto precision80
;
2534 if ( roundingPrecision
== 64 ) {
2535 roundIncrement
= LIT64( 0x0000000000000400 );
2536 roundMask
= LIT64( 0x00000000000007FF );
2538 else if ( roundingPrecision
== 32 ) {
2539 roundIncrement
= LIT64( 0x0000008000000000 );
2540 roundMask
= LIT64( 0x000000FFFFFFFFFF );
2545 zSig0
|= ( zSig1
!= 0 );
2546 switch (roundingMode
) {
2547 case float_round_nearest_even
:
2548 case float_round_ties_away
:
2550 case float_round_to_zero
:
2553 case float_round_up
:
2554 roundIncrement
= zSign
? 0 : roundMask
;
2556 case float_round_down
:
2557 roundIncrement
= zSign
? roundMask
: 0;
2562 roundBits
= zSig0
& roundMask
;
2563 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
2564 if ( ( 0x7FFE < zExp
)
2565 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
2570 if (status
->flush_to_zero
) {
2571 float_raise(float_flag_output_denormal
, status
);
2572 return packFloatx80(zSign
, 0, 0);
2575 (status
->float_detect_tininess
2576 == float_tininess_before_rounding
)
2578 || ( zSig0
<= zSig0
+ roundIncrement
);
2579 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
2581 roundBits
= zSig0
& roundMask
;
2582 if (isTiny
&& roundBits
) {
2583 float_raise(float_flag_underflow
, status
);
2586 status
->float_exception_flags
|= float_flag_inexact
;
2588 zSig0
+= roundIncrement
;
2589 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
2590 roundIncrement
= roundMask
+ 1;
2591 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
2592 roundMask
|= roundIncrement
;
2594 zSig0
&= ~ roundMask
;
2595 return packFloatx80( zSign
, zExp
, zSig0
);
2599 status
->float_exception_flags
|= float_flag_inexact
;
2601 zSig0
+= roundIncrement
;
2602 if ( zSig0
< roundIncrement
) {
2604 zSig0
= LIT64( 0x8000000000000000 );
2606 roundIncrement
= roundMask
+ 1;
2607 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
2608 roundMask
|= roundIncrement
;
2610 zSig0
&= ~ roundMask
;
2611 if ( zSig0
== 0 ) zExp
= 0;
2612 return packFloatx80( zSign
, zExp
, zSig0
);
2614 switch (roundingMode
) {
2615 case float_round_nearest_even
:
2616 case float_round_ties_away
:
2617 increment
= ((int64_t)zSig1
< 0);
2619 case float_round_to_zero
:
2622 case float_round_up
:
2623 increment
= !zSign
&& zSig1
;
2625 case float_round_down
:
2626 increment
= zSign
&& zSig1
;
2631 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
2632 if ( ( 0x7FFE < zExp
)
2633 || ( ( zExp
== 0x7FFE )
2634 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
2640 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2641 if ( ( roundingMode
== float_round_to_zero
)
2642 || ( zSign
&& ( roundingMode
== float_round_up
) )
2643 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
2645 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
2647 return packFloatx80(zSign
,
2648 floatx80_infinity_high
,
2649 floatx80_infinity_low
);
2653 (status
->float_detect_tininess
2654 == float_tininess_before_rounding
)
2657 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
2658 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
2660 if (isTiny
&& zSig1
) {
2661 float_raise(float_flag_underflow
, status
);
2664 status
->float_exception_flags
|= float_flag_inexact
;
2666 switch (roundingMode
) {
2667 case float_round_nearest_even
:
2668 case float_round_ties_away
:
2669 increment
= ((int64_t)zSig1
< 0);
2671 case float_round_to_zero
:
2674 case float_round_up
:
2675 increment
= !zSign
&& zSig1
;
2677 case float_round_down
:
2678 increment
= zSign
&& zSig1
;
2686 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
2687 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
2689 return packFloatx80( zSign
, zExp
, zSig0
);
2693 status
->float_exception_flags
|= float_flag_inexact
;
2699 zSig0
= LIT64( 0x8000000000000000 );
2702 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
2706 if ( zSig0
== 0 ) zExp
= 0;
2708 return packFloatx80( zSign
, zExp
, zSig0
);
2712 /*----------------------------------------------------------------------------
2713 | Takes an abstract floating-point value having sign `zSign', exponent
2714 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2715 | and returns the proper extended double-precision floating-point value
2716 | corresponding to the abstract input. This routine is just like
2717 | `roundAndPackFloatx80' except that the input significand does not have to be
2719 *----------------------------------------------------------------------------*/
2721 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
2722 flag zSign
, int32_t zExp
,
2723 uint64_t zSig0
, uint64_t zSig1
,
2724 float_status
*status
)
2733 shiftCount
= countLeadingZeros64( zSig0
);
2734 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
2736 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
2737 zSig0
, zSig1
, status
);
2741 /*----------------------------------------------------------------------------
2742 | Returns the least-significant 64 fraction bits of the quadruple-precision
2743 | floating-point value `a'.
2744 *----------------------------------------------------------------------------*/
2746 static inline uint64_t extractFloat128Frac1( float128 a
)
2753 /*----------------------------------------------------------------------------
2754 | Returns the most-significant 48 fraction bits of the quadruple-precision
2755 | floating-point value `a'.
2756 *----------------------------------------------------------------------------*/
2758 static inline uint64_t extractFloat128Frac0( float128 a
)
2761 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
2765 /*----------------------------------------------------------------------------
2766 | Returns the exponent bits of the quadruple-precision floating-point value
2768 *----------------------------------------------------------------------------*/
2770 static inline int32_t extractFloat128Exp( float128 a
)
2773 return ( a
.high
>>48 ) & 0x7FFF;
2777 /*----------------------------------------------------------------------------
2778 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2779 *----------------------------------------------------------------------------*/
2781 static inline flag
extractFloat128Sign( float128 a
)
2788 /*----------------------------------------------------------------------------
2789 | Normalizes the subnormal quadruple-precision floating-point value
2790 | represented by the denormalized significand formed by the concatenation of
2791 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2792 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2793 | significand are stored at the location pointed to by `zSig0Ptr', and the
2794 | least significant 64 bits of the normalized significand are stored at the
2795 | location pointed to by `zSig1Ptr'.
2796 *----------------------------------------------------------------------------*/
2799 normalizeFloat128Subnormal(
2810 shiftCount
= countLeadingZeros64( aSig1
) - 15;
2811 if ( shiftCount
< 0 ) {
2812 *zSig0Ptr
= aSig1
>>( - shiftCount
);
2813 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
2816 *zSig0Ptr
= aSig1
<<shiftCount
;
2819 *zExpPtr
= - shiftCount
- 63;
2822 shiftCount
= countLeadingZeros64( aSig0
) - 15;
2823 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
2824 *zExpPtr
= 1 - shiftCount
;
2829 /*----------------------------------------------------------------------------
2830 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2831 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2832 | floating-point value, returning the result. After being shifted into the
2833 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2834 | added together to form the most significant 32 bits of the result. This
2835 | means that any integer portion of `zSig0' will be added into the exponent.
2836 | Since a properly normalized significand will have an integer portion equal
2837 | to 1, the `zExp' input should be 1 less than the desired result exponent
2838 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2840 *----------------------------------------------------------------------------*/
2842 static inline float128
2843 packFloat128( flag zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
2848 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
2853 /*----------------------------------------------------------------------------
2854 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2855 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2856 | and `zSig2', and returns the proper quadruple-precision floating-point value
2857 | corresponding to the abstract input. Ordinarily, the abstract value is
2858 | simply rounded and packed into the quadruple-precision format, with the
2859 | inexact exception raised if the abstract input cannot be represented
2860 | exactly. However, if the abstract value is too large, the overflow and
2861 | inexact exceptions are raised and an infinity or maximal finite value is
2862 | returned. If the abstract value is too small, the input value is rounded to
2863 | a subnormal number, and the underflow and inexact exceptions are raised if
2864 | the abstract input cannot be represented exactly as a subnormal quadruple-
2865 | precision floating-point number.
2866 | The input significand must be normalized or smaller. If the input
2867 | significand is not normalized, `zExp' must be 0; in that case, the result
2868 | returned is a subnormal number, and it must not require rounding. In the
2869 | usual case that the input significand is normalized, `zExp' must be 1 less
2870 | than the ``true'' floating-point exponent. The handling of underflow and
2871 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2872 *----------------------------------------------------------------------------*/
2874 static float128
roundAndPackFloat128(flag zSign
, int32_t zExp
,
2875 uint64_t zSig0
, uint64_t zSig1
,
2876 uint64_t zSig2
, float_status
*status
)
2878 int8_t roundingMode
;
2879 flag roundNearestEven
, increment
, isTiny
;
2881 roundingMode
= status
->float_rounding_mode
;
2882 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2883 switch (roundingMode
) {
2884 case float_round_nearest_even
:
2885 case float_round_ties_away
:
2886 increment
= ((int64_t)zSig2
< 0);
2888 case float_round_to_zero
:
2891 case float_round_up
:
2892 increment
= !zSign
&& zSig2
;
2894 case float_round_down
:
2895 increment
= zSign
&& zSig2
;
2897 case float_round_to_odd
:
2898 increment
= !(zSig1
& 0x1) && zSig2
;
2903 if ( 0x7FFD <= (uint32_t) zExp
) {
2904 if ( ( 0x7FFD < zExp
)
2905 || ( ( zExp
== 0x7FFD )
2907 LIT64( 0x0001FFFFFFFFFFFF ),
2908 LIT64( 0xFFFFFFFFFFFFFFFF ),
2915 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2916 if ( ( roundingMode
== float_round_to_zero
)
2917 || ( zSign
&& ( roundingMode
== float_round_up
) )
2918 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
2919 || (roundingMode
== float_round_to_odd
)
2925 LIT64( 0x0000FFFFFFFFFFFF ),
2926 LIT64( 0xFFFFFFFFFFFFFFFF )
2929 return packFloat128( zSign
, 0x7FFF, 0, 0 );
2932 if (status
->flush_to_zero
) {
2933 float_raise(float_flag_output_denormal
, status
);
2934 return packFloat128(zSign
, 0, 0, 0);
2937 (status
->float_detect_tininess
2938 == float_tininess_before_rounding
)
2944 LIT64( 0x0001FFFFFFFFFFFF ),
2945 LIT64( 0xFFFFFFFFFFFFFFFF )
2947 shift128ExtraRightJamming(
2948 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
2950 if (isTiny
&& zSig2
) {
2951 float_raise(float_flag_underflow
, status
);
2953 switch (roundingMode
) {
2954 case float_round_nearest_even
:
2955 case float_round_ties_away
:
2956 increment
= ((int64_t)zSig2
< 0);
2958 case float_round_to_zero
:
2961 case float_round_up
:
2962 increment
= !zSign
&& zSig2
;
2964 case float_round_down
:
2965 increment
= zSign
&& zSig2
;
2967 case float_round_to_odd
:
2968 increment
= !(zSig1
& 0x1) && zSig2
;
2976 status
->float_exception_flags
|= float_flag_inexact
;
2979 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
2980 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
2983 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
2985 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
2989 /*----------------------------------------------------------------------------
2990 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2991 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2992 | returns the proper quadruple-precision floating-point value corresponding
2993 | to the abstract input. This routine is just like `roundAndPackFloat128'
2994 | except that the input significand has fewer bits and does not have to be
2995 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
2997 *----------------------------------------------------------------------------*/
2999 static float128
normalizeRoundAndPackFloat128(flag zSign
, int32_t zExp
,
3000 uint64_t zSig0
, uint64_t zSig1
,
3001 float_status
*status
)
3011 shiftCount
= countLeadingZeros64( zSig0
) - 15;
3012 if ( 0 <= shiftCount
) {
3014 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3017 shift128ExtraRightJamming(
3018 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
3021 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
3026 /*----------------------------------------------------------------------------
3027 | Returns the result of converting the 32-bit two's complement integer `a'
3028 | to the extended double-precision floating-point format. The conversion
3029 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3031 *----------------------------------------------------------------------------*/
3033 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
3040 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
3042 absA
= zSign
? - a
: a
;
3043 shiftCount
= countLeadingZeros32( absA
) + 32;
3045 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
3049 /*----------------------------------------------------------------------------
3050 | Returns the result of converting the 32-bit two's complement integer `a' to
3051 | the quadruple-precision floating-point format. The conversion is performed
3052 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3053 *----------------------------------------------------------------------------*/
3055 float128
int32_to_float128(int32_t a
, float_status
*status
)
3062 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
3064 absA
= zSign
? - a
: a
;
3065 shiftCount
= countLeadingZeros32( absA
) + 17;
3067 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
3071 /*----------------------------------------------------------------------------
3072 | Returns the result of converting the 64-bit two's complement integer `a'
3073 | to the extended double-precision floating-point format. The conversion
3074 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3076 *----------------------------------------------------------------------------*/
3078 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
3084 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
3086 absA
= zSign
? - a
: a
;
3087 shiftCount
= countLeadingZeros64( absA
);
3088 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
3092 /*----------------------------------------------------------------------------
3093 | Returns the result of converting the 64-bit two's complement integer `a' to
3094 | the quadruple-precision floating-point format. The conversion is performed
3095 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3096 *----------------------------------------------------------------------------*/
3098 float128
int64_to_float128(int64_t a
, float_status
*status
)
3104 uint64_t zSig0
, zSig1
;
3106 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
3108 absA
= zSign
? - a
: a
;
3109 shiftCount
= countLeadingZeros64( absA
) + 49;
3110 zExp
= 0x406E - shiftCount
;
3111 if ( 64 <= shiftCount
) {
3120 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3121 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
3125 /*----------------------------------------------------------------------------
3126 | Returns the result of converting the 64-bit unsigned integer `a'
3127 | to the quadruple-precision floating-point format. The conversion is performed
3128 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3129 *----------------------------------------------------------------------------*/
3131 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
3134 return float128_zero
;
3136 return normalizeRoundAndPackFloat128(0, 0x406E, a
, 0, status
);
3142 /*----------------------------------------------------------------------------
3143 | Returns the result of converting the single-precision floating-point value
3144 | `a' to the double-precision floating-point format. The conversion is
3145 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3147 *----------------------------------------------------------------------------*/
3149 float64
float32_to_float64(float32 a
, float_status
*status
)
3154 a
= float32_squash_input_denormal(a
, status
);
3156 aSig
= extractFloat32Frac( a
);
3157 aExp
= extractFloat32Exp( a
);
3158 aSign
= extractFloat32Sign( a
);
3159 if ( aExp
== 0xFF ) {
3161 return commonNaNToFloat64(float32ToCommonNaN(a
, status
), status
);
3163 return packFloat64( aSign
, 0x7FF, 0 );
3166 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
3167 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3170 return packFloat64( aSign
, aExp
+ 0x380, ( (uint64_t) aSig
)<<29 );
3174 /*----------------------------------------------------------------------------
3175 | Returns the result of converting the single-precision floating-point value
3176 | `a' to the extended double-precision floating-point format. The conversion
3177 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3179 *----------------------------------------------------------------------------*/
3181 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
3187 a
= float32_squash_input_denormal(a
, status
);
3188 aSig
= extractFloat32Frac( a
);
3189 aExp
= extractFloat32Exp( a
);
3190 aSign
= extractFloat32Sign( a
);
3191 if ( aExp
== 0xFF ) {
3193 return commonNaNToFloatx80(float32ToCommonNaN(a
, status
), status
);
3195 return packFloatx80(aSign
,
3196 floatx80_infinity_high
,
3197 floatx80_infinity_low
);
3200 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
3201 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3204 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
3208 /*----------------------------------------------------------------------------
3209 | Returns the result of converting the single-precision floating-point value
3210 | `a' to the double-precision floating-point format. The conversion is
3211 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3213 *----------------------------------------------------------------------------*/
3215 float128
float32_to_float128(float32 a
, float_status
*status
)
3221 a
= float32_squash_input_denormal(a
, status
);
3222 aSig
= extractFloat32Frac( a
);
3223 aExp
= extractFloat32Exp( a
);
3224 aSign
= extractFloat32Sign( a
);
3225 if ( aExp
== 0xFF ) {
3227 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
3229 return packFloat128( aSign
, 0x7FFF, 0, 0 );
3232 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
3233 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3236 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
3240 /*----------------------------------------------------------------------------
3241 | Returns the remainder of the single-precision floating-point value `a'
3242 | with respect to the corresponding value `b'. The operation is performed
3243 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3244 *----------------------------------------------------------------------------*/
3246 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
3249 int aExp
, bExp
, expDiff
;
3250 uint32_t aSig
, bSig
;
3252 uint64_t aSig64
, bSig64
, q64
;
3253 uint32_t alternateASig
;
3255 a
= float32_squash_input_denormal(a
, status
);
3256 b
= float32_squash_input_denormal(b
, status
);
3258 aSig
= extractFloat32Frac( a
);
3259 aExp
= extractFloat32Exp( a
);
3260 aSign
= extractFloat32Sign( a
);
3261 bSig
= extractFloat32Frac( b
);
3262 bExp
= extractFloat32Exp( b
);
3263 if ( aExp
== 0xFF ) {
3264 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
3265 return propagateFloat32NaN(a
, b
, status
);
3267 float_raise(float_flag_invalid
, status
);
3268 return float32_default_nan(status
);
3270 if ( bExp
== 0xFF ) {
3272 return propagateFloat32NaN(a
, b
, status
);
3278 float_raise(float_flag_invalid
, status
);
3279 return float32_default_nan(status
);
3281 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
3284 if ( aSig
== 0 ) return a
;
3285 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3287 expDiff
= aExp
- bExp
;
3290 if ( expDiff
< 32 ) {
3293 if ( expDiff
< 0 ) {
3294 if ( expDiff
< -1 ) return a
;
3297 q
= ( bSig
<= aSig
);
3298 if ( q
) aSig
-= bSig
;
3299 if ( 0 < expDiff
) {
3300 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
3303 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3311 if ( bSig
<= aSig
) aSig
-= bSig
;
3312 aSig64
= ( (uint64_t) aSig
)<<40;
3313 bSig64
= ( (uint64_t) bSig
)<<40;
3315 while ( 0 < expDiff
) {
3316 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
3317 q64
= ( 2 < q64
) ? q64
- 2 : 0;
3318 aSig64
= - ( ( bSig
* q64
)<<38 );
3322 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
3323 q64
= ( 2 < q64
) ? q64
- 2 : 0;
3324 q
= q64
>>( 64 - expDiff
);
3326 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
3329 alternateASig
= aSig
;
3332 } while ( 0 <= (int32_t) aSig
);
3333 sigMean
= aSig
+ alternateASig
;
3334 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3335 aSig
= alternateASig
;
3337 zSign
= ( (int32_t) aSig
< 0 );
3338 if ( zSign
) aSig
= - aSig
;
3339 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
3344 /*----------------------------------------------------------------------------
3345 | Returns the binary exponential of the single-precision floating-point value
3346 | `a'. The operation is performed according to the IEC/IEEE Standard for
3347 | Binary Floating-Point Arithmetic.
3349 | Uses the following identities:
3351 | 1. -------------------------------------------------------------------------
3355 | 2. -------------------------------------------------------------------------
3358 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3360 *----------------------------------------------------------------------------*/
3362 static const float64 float32_exp2_coefficients
[15] =
3364 const_float64( 0x3ff0000000000000ll
), /* 1 */
3365 const_float64( 0x3fe0000000000000ll
), /* 2 */
3366 const_float64( 0x3fc5555555555555ll
), /* 3 */
3367 const_float64( 0x3fa5555555555555ll
), /* 4 */
3368 const_float64( 0x3f81111111111111ll
), /* 5 */
3369 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
3370 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
3371 const_float64( 0x3efa01a01a01a01all
), /* 8 */
3372 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
3373 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
3374 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
3375 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
3376 const_float64( 0x3de6124613a86d09ll
), /* 13 */
3377 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
3378 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
3381 float32
float32_exp2(float32 a
, float_status
*status
)
3388 a
= float32_squash_input_denormal(a
, status
);
3390 aSig
= extractFloat32Frac( a
);
3391 aExp
= extractFloat32Exp( a
);
3392 aSign
= extractFloat32Sign( a
);
3394 if ( aExp
== 0xFF) {
3396 return propagateFloat32NaN(a
, float32_zero
, status
);
3398 return (aSign
) ? float32_zero
: a
;
3401 if (aSig
== 0) return float32_one
;
3404 float_raise(float_flag_inexact
, status
);
3406 /* ******************************* */
3407 /* using float64 for approximation */
3408 /* ******************************* */
3409 x
= float32_to_float64(a
, status
);
3410 x
= float64_mul(x
, float64_ln2
, status
);
3414 for (i
= 0 ; i
< 15 ; i
++) {
3417 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
3418 r
= float64_add(r
, f
, status
);
3420 xn
= float64_mul(xn
, x
, status
);
3423 return float64_to_float32(r
, status
);
3426 /*----------------------------------------------------------------------------
3427 | Returns the binary log of the single-precision floating-point value `a'.
3428 | The operation is performed according to the IEC/IEEE Standard for Binary
3429 | Floating-Point Arithmetic.
3430 *----------------------------------------------------------------------------*/
3431 float32
float32_log2(float32 a
, float_status
*status
)
3435 uint32_t aSig
, zSig
, i
;
3437 a
= float32_squash_input_denormal(a
, status
);
3438 aSig
= extractFloat32Frac( a
);
3439 aExp
= extractFloat32Exp( a
);
3440 aSign
= extractFloat32Sign( a
);
3443 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
3444 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3447 float_raise(float_flag_invalid
, status
);
3448 return float32_default_nan(status
);
3450 if ( aExp
== 0xFF ) {
3452 return propagateFloat32NaN(a
, float32_zero
, status
);
3462 for (i
= 1 << 22; i
> 0; i
>>= 1) {
3463 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
3464 if ( aSig
& 0x01000000 ) {
3473 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
3476 /*----------------------------------------------------------------------------
3477 | Returns 1 if the single-precision floating-point value `a' is equal to
3478 | the corresponding value `b', and 0 otherwise. The invalid exception is
3479 | raised if either operand is a NaN. Otherwise, the comparison is performed
3480 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3481 *----------------------------------------------------------------------------*/
3483 int float32_eq(float32 a
, float32 b
, float_status
*status
)
3486 a
= float32_squash_input_denormal(a
, status
);
3487 b
= float32_squash_input_denormal(b
, status
);
3489 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3490 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3492 float_raise(float_flag_invalid
, status
);
3495 av
= float32_val(a
);
3496 bv
= float32_val(b
);
3497 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3500 /*----------------------------------------------------------------------------
3501 | Returns 1 if the single-precision floating-point value `a' is less than
3502 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3503 | exception is raised if either operand is a NaN. The comparison is performed
3504 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3505 *----------------------------------------------------------------------------*/
3507 int float32_le(float32 a
, float32 b
, float_status
*status
)
3511 a
= float32_squash_input_denormal(a
, status
);
3512 b
= float32_squash_input_denormal(b
, status
);
3514 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3515 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3517 float_raise(float_flag_invalid
, status
);
3520 aSign
= extractFloat32Sign( a
);
3521 bSign
= extractFloat32Sign( b
);
3522 av
= float32_val(a
);
3523 bv
= float32_val(b
);
3524 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3525 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3529 /*----------------------------------------------------------------------------
3530 | Returns 1 if the single-precision floating-point value `a' is less than
3531 | the corresponding value `b', and 0 otherwise. The invalid exception is
3532 | raised if either operand is a NaN. The comparison is performed according
3533 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3534 *----------------------------------------------------------------------------*/
3536 int float32_lt(float32 a
, float32 b
, float_status
*status
)
3540 a
= float32_squash_input_denormal(a
, status
);
3541 b
= float32_squash_input_denormal(b
, status
);
3543 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3544 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3546 float_raise(float_flag_invalid
, status
);
3549 aSign
= extractFloat32Sign( a
);
3550 bSign
= extractFloat32Sign( b
);
3551 av
= float32_val(a
);
3552 bv
= float32_val(b
);
3553 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
3554 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3558 /*----------------------------------------------------------------------------
3559 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3560 | be compared, and 0 otherwise. The invalid exception is raised if either
3561 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3562 | Standard for Binary Floating-Point Arithmetic.
3563 *----------------------------------------------------------------------------*/
3565 int float32_unordered(float32 a
, float32 b
, float_status
*status
)
3567 a
= float32_squash_input_denormal(a
, status
);
3568 b
= float32_squash_input_denormal(b
, status
);
3570 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3571 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3573 float_raise(float_flag_invalid
, status
);
3579 /*----------------------------------------------------------------------------
3580 | Returns 1 if the single-precision floating-point value `a' is equal to
3581 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3582 | exception. The comparison is performed according to the IEC/IEEE Standard
3583 | for Binary Floating-Point Arithmetic.
3584 *----------------------------------------------------------------------------*/
3586 int float32_eq_quiet(float32 a
, float32 b
, float_status
*status
)
3588 a
= float32_squash_input_denormal(a
, status
);
3589 b
= float32_squash_input_denormal(b
, status
);
3591 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3592 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3594 if (float32_is_signaling_nan(a
, status
)
3595 || float32_is_signaling_nan(b
, status
)) {
3596 float_raise(float_flag_invalid
, status
);
3600 return ( float32_val(a
) == float32_val(b
) ) ||
3601 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
3604 /*----------------------------------------------------------------------------
3605 | Returns 1 if the single-precision floating-point value `a' is less than or
3606 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3607 | cause an exception. Otherwise, the comparison is performed according to the
3608 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3609 *----------------------------------------------------------------------------*/
3611 int float32_le_quiet(float32 a
, float32 b
, float_status
*status
)
3615 a
= float32_squash_input_denormal(a
, status
);
3616 b
= float32_squash_input_denormal(b
, status
);
3618 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3619 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3621 if (float32_is_signaling_nan(a
, status
)
3622 || float32_is_signaling_nan(b
, status
)) {
3623 float_raise(float_flag_invalid
, status
);
3627 aSign
= extractFloat32Sign( a
);
3628 bSign
= extractFloat32Sign( b
);
3629 av
= float32_val(a
);
3630 bv
= float32_val(b
);
3631 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3632 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3636 /*----------------------------------------------------------------------------
3637 | Returns 1 if the single-precision floating-point value `a' is less than
3638 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3639 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3640 | Standard for Binary Floating-Point Arithmetic.
3641 *----------------------------------------------------------------------------*/
3643 int float32_lt_quiet(float32 a
, float32 b
, float_status
*status
)
3647 a
= float32_squash_input_denormal(a
, status
);
3648 b
= float32_squash_input_denormal(b
, status
);
3650 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3651 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3653 if (float32_is_signaling_nan(a
, status
)
3654 || float32_is_signaling_nan(b
, status
)) {
3655 float_raise(float_flag_invalid
, status
);
3659 aSign
= extractFloat32Sign( a
);
3660 bSign
= extractFloat32Sign( b
);
3661 av
= float32_val(a
);
3662 bv
= float32_val(b
);
3663 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
3664 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3668 /*----------------------------------------------------------------------------
3669 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3670 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3671 | comparison is performed according to the IEC/IEEE Standard for Binary
3672 | Floating-Point Arithmetic.
3673 *----------------------------------------------------------------------------*/
3675 int float32_unordered_quiet(float32 a
, float32 b
, float_status
*status
)
3677 a
= float32_squash_input_denormal(a
, status
);
3678 b
= float32_squash_input_denormal(b
, status
);
3680 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3681 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3683 if (float32_is_signaling_nan(a
, status
)
3684 || float32_is_signaling_nan(b
, status
)) {
3685 float_raise(float_flag_invalid
, status
);
3693 /*----------------------------------------------------------------------------
3694 | Returns the result of converting the double-precision floating-point value
3695 | `a' to the single-precision floating-point format. The conversion is
3696 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3698 *----------------------------------------------------------------------------*/
3700 float32
float64_to_float32(float64 a
, float_status
*status
)
3706 a
= float64_squash_input_denormal(a
, status
);
3708 aSig
= extractFloat64Frac( a
);
3709 aExp
= extractFloat64Exp( a
);
3710 aSign
= extractFloat64Sign( a
);
3711 if ( aExp
== 0x7FF ) {
3713 return commonNaNToFloat32(float64ToCommonNaN(a
, status
), status
);
3715 return packFloat32( aSign
, 0xFF, 0 );
3717 shift64RightJamming( aSig
, 22, &aSig
);
3719 if ( aExp
|| zSig
) {
3723 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
3728 /*----------------------------------------------------------------------------
3729 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3730 | half-precision floating-point value, returning the result. After being
3731 | shifted into the proper positions, the three fields are simply added
3732 | together to form the result. This means that any integer portion of `zSig'
3733 | will be added into the exponent. Since a properly normalized significand
3734 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3735 | than the desired result exponent whenever `zSig' is a complete, normalized
3737 *----------------------------------------------------------------------------*/
3738 static float16
packFloat16(flag zSign
, int zExp
, uint16_t zSig
)
3740 return make_float16(
3741 (((uint32_t)zSign
) << 15) + (((uint32_t)zExp
) << 10) + zSig
);
3744 /*----------------------------------------------------------------------------
3745 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3746 | and significand `zSig', and returns the proper half-precision floating-
3747 | point value corresponding to the abstract input. Ordinarily, the abstract
3748 | value is simply rounded and packed into the half-precision format, with
3749 | the inexact exception raised if the abstract input cannot be represented
3750 | exactly. However, if the abstract value is too large, the overflow and
3751 | inexact exceptions are raised and an infinity or maximal finite value is
3752 | returned. If the abstract value is too small, the input value is rounded to
3753 | a subnormal number, and the underflow and inexact exceptions are raised if
3754 | the abstract input cannot be represented exactly as a subnormal half-
3755 | precision floating-point number.
3756 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3757 | ARM-style "alternative representation", which omits the NaN and Inf
3758 | encodings in order to raise the maximum representable exponent by one.
3759 | The input significand `zSig' has its binary point between bits 22
3760 | and 23, which is 13 bits to the left of the usual location. This shifted
3761 | significand must be normalized or smaller. If `zSig' is not normalized,
3762 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3763 | and it must not require rounding. In the usual case that `zSig' is
3764 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3765 | Note the slightly odd position of the binary point in zSig compared with the
3766 | other roundAndPackFloat functions. This should probably be fixed if we
3767 | need to implement more float16 routines than just conversion.
3768 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3769 | Binary Floating-Point Arithmetic.
3770 *----------------------------------------------------------------------------*/
3772 static float16
roundAndPackFloat16(flag zSign
, int zExp
,
3773 uint32_t zSig
, flag ieee
,
3774 float_status
*status
)
3776 int maxexp
= ieee
? 29 : 30;
3779 bool rounding_bumps_exp
;
3780 bool is_tiny
= false;
3782 /* Calculate the mask of bits of the mantissa which are not
3783 * representable in half-precision and will be lost.
3786 /* Will be denormal in halfprec */
3792 /* Normal number in halfprec */
3796 switch (status
->float_rounding_mode
) {
3797 case float_round_nearest_even
:
3798 increment
= (mask
+ 1) >> 1;
3799 if ((zSig
& mask
) == increment
) {
3800 increment
= zSig
& (increment
<< 1);
3803 case float_round_ties_away
:
3804 increment
= (mask
+ 1) >> 1;
3806 case float_round_up
:
3807 increment
= zSign
? 0 : mask
;
3809 case float_round_down
:
3810 increment
= zSign
? mask
: 0;
3812 default: /* round_to_zero */
3817 rounding_bumps_exp
= (zSig
+ increment
>= 0x01000000);
3819 if (zExp
> maxexp
|| (zExp
== maxexp
&& rounding_bumps_exp
)) {
3821 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3822 return packFloat16(zSign
, 0x1f, 0);
3824 float_raise(float_flag_invalid
, status
);
3825 return packFloat16(zSign
, 0x1f, 0x3ff);
3830 /* Note that flush-to-zero does not affect half-precision results */
3832 (status
->float_detect_tininess
== float_tininess_before_rounding
)
3834 || (!rounding_bumps_exp
);
3837 float_raise(float_flag_inexact
, status
);
3839 float_raise(float_flag_underflow
, status
);
3844 if (rounding_bumps_exp
) {
3850 return packFloat16(zSign
, 0, 0);
3856 return packFloat16(zSign
, zExp
, zSig
>> 13);
3859 /*----------------------------------------------------------------------------
3860 | If `a' is denormal and we are in flush-to-zero mode then set the
3861 | input-denormal exception and return zero. Otherwise just return the value.
3862 *----------------------------------------------------------------------------*/
3863 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3865 if (status
->flush_inputs_to_zero
) {
3866 if (extractFloat16Exp(a
) == 0 && extractFloat16Frac(a
) != 0) {
3867 float_raise(float_flag_input_denormal
, status
);
3868 return make_float16(float16_val(a
) & 0x8000);
3874 static void normalizeFloat16Subnormal(uint32_t aSig
, int *zExpPtr
,
3877 int8_t shiftCount
= countLeadingZeros32(aSig
) - 21;
3878 *zSigPtr
= aSig
<< shiftCount
;
3879 *zExpPtr
= 1 - shiftCount
;
3882 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3883 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3885 float32
float16_to_float32(float16 a
, flag ieee
, float_status
*status
)
3891 aSign
= extractFloat16Sign(a
);
3892 aExp
= extractFloat16Exp(a
);
3893 aSig
= extractFloat16Frac(a
);
3895 if (aExp
== 0x1f && ieee
) {
3897 return commonNaNToFloat32(float16ToCommonNaN(a
, status
), status
);
3899 return packFloat32(aSign
, 0xff, 0);
3903 return packFloat32(aSign
, 0, 0);
3906 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3909 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
3912 float16
float32_to_float16(float32 a
, flag ieee
, float_status
*status
)
3918 a
= float32_squash_input_denormal(a
, status
);
3920 aSig
= extractFloat32Frac( a
);
3921 aExp
= extractFloat32Exp( a
);
3922 aSign
= extractFloat32Sign( a
);
3923 if ( aExp
== 0xFF ) {
3925 /* Input is a NaN */
3927 float_raise(float_flag_invalid
, status
);
3928 return packFloat16(aSign
, 0, 0);
3930 return commonNaNToFloat16(
3931 float32ToCommonNaN(a
, status
), status
);
3935 float_raise(float_flag_invalid
, status
);
3936 return packFloat16(aSign
, 0x1f, 0x3ff);
3938 return packFloat16(aSign
, 0x1f, 0);
3940 if (aExp
== 0 && aSig
== 0) {
3941 return packFloat16(aSign
, 0, 0);
3943 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3944 * even if the input is denormal; however this is harmless because
3945 * the largest possible single-precision denormal is still smaller
3946 * than the smallest representable half-precision denormal, and so we
3947 * will end up ignoring aSig and returning via the "always return zero"
3953 return roundAndPackFloat16(aSign
, aExp
, aSig
, ieee
, status
);
3956 float64
float16_to_float64(float16 a
, flag ieee
, float_status
*status
)
3962 aSign
= extractFloat16Sign(a
);
3963 aExp
= extractFloat16Exp(a
);
3964 aSig
= extractFloat16Frac(a
);
3966 if (aExp
== 0x1f && ieee
) {
3968 return commonNaNToFloat64(
3969 float16ToCommonNaN(a
, status
), status
);
3971 return packFloat64(aSign
, 0x7ff, 0);
3975 return packFloat64(aSign
, 0, 0);
3978 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3981 return packFloat64(aSign
, aExp
+ 0x3f0, ((uint64_t)aSig
) << 42);
3984 float16
float64_to_float16(float64 a
, flag ieee
, float_status
*status
)
3991 a
= float64_squash_input_denormal(a
, status
);
3993 aSig
= extractFloat64Frac(a
);
3994 aExp
= extractFloat64Exp(a
);
3995 aSign
= extractFloat64Sign(a
);
3996 if (aExp
== 0x7FF) {
3998 /* Input is a NaN */
4000 float_raise(float_flag_invalid
, status
);
4001 return packFloat16(aSign
, 0, 0);
4003 return commonNaNToFloat16(
4004 float64ToCommonNaN(a
, status
), status
);
4008 float_raise(float_flag_invalid
, status
);
4009 return packFloat16(aSign
, 0x1f, 0x3ff);
4011 return packFloat16(aSign
, 0x1f, 0);
4013 shift64RightJamming(aSig
, 29, &aSig
);
4015 if (aExp
== 0 && zSig
== 0) {
4016 return packFloat16(aSign
, 0, 0);
4018 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4019 * even if the input is denormal; however this is harmless because
4020 * the largest possible single-precision denormal is still smaller
4021 * than the smallest representable half-precision denormal, and so we
4022 * will end up ignoring aSig and returning via the "always return zero"
4028 return roundAndPackFloat16(aSign
, aExp
, zSig
, ieee
, status
);
4031 /*----------------------------------------------------------------------------
4032 | Returns the result of converting the double-precision floating-point value
4033 | `a' to the extended double-precision floating-point format. The conversion
4034 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4036 *----------------------------------------------------------------------------*/
4038 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
4044 a
= float64_squash_input_denormal(a
, status
);
4045 aSig
= extractFloat64Frac( a
);
4046 aExp
= extractFloat64Exp( a
);
4047 aSign
= extractFloat64Sign( a
);
4048 if ( aExp
== 0x7FF ) {
4050 return commonNaNToFloatx80(float64ToCommonNaN(a
, status
), status
);
4052 return packFloatx80(aSign
,
4053 floatx80_infinity_high
,
4054 floatx80_infinity_low
);
4057 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4058 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4062 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
4066 /*----------------------------------------------------------------------------
4067 | Returns the result of converting the double-precision floating-point value
4068 | `a' to the quadruple-precision floating-point format. The conversion is
4069 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4071 *----------------------------------------------------------------------------*/
4073 float128
float64_to_float128(float64 a
, float_status
*status
)
4077 uint64_t aSig
, zSig0
, zSig1
;
4079 a
= float64_squash_input_denormal(a
, status
);
4080 aSig
= extractFloat64Frac( a
);
4081 aExp
= extractFloat64Exp( a
);
4082 aSign
= extractFloat64Sign( a
);
4083 if ( aExp
== 0x7FF ) {
4085 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
4087 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4090 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4091 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4094 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
4095 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
4100 /*----------------------------------------------------------------------------
4101 | Returns the remainder of the double-precision floating-point value `a'
4102 | with respect to the corresponding value `b'. The operation is performed
4103 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4104 *----------------------------------------------------------------------------*/
4106 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
4109 int aExp
, bExp
, expDiff
;
4110 uint64_t aSig
, bSig
;
4111 uint64_t q
, alternateASig
;
4114 a
= float64_squash_input_denormal(a
, status
);
4115 b
= float64_squash_input_denormal(b
, status
);
4116 aSig
= extractFloat64Frac( a
);
4117 aExp
= extractFloat64Exp( a
);
4118 aSign
= extractFloat64Sign( a
);
4119 bSig
= extractFloat64Frac( b
);
4120 bExp
= extractFloat64Exp( b
);
4121 if ( aExp
== 0x7FF ) {
4122 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4123 return propagateFloat64NaN(a
, b
, status
);
4125 float_raise(float_flag_invalid
, status
);
4126 return float64_default_nan(status
);
4128 if ( bExp
== 0x7FF ) {
4130 return propagateFloat64NaN(a
, b
, status
);
4136 float_raise(float_flag_invalid
, status
);
4137 return float64_default_nan(status
);
4139 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4142 if ( aSig
== 0 ) return a
;
4143 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4145 expDiff
= aExp
- bExp
;
4146 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
4147 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4148 if ( expDiff
< 0 ) {
4149 if ( expDiff
< -1 ) return a
;
4152 q
= ( bSig
<= aSig
);
4153 if ( q
) aSig
-= bSig
;
4155 while ( 0 < expDiff
) {
4156 q
= estimateDiv128To64( aSig
, 0, bSig
);
4157 q
= ( 2 < q
) ? q
- 2 : 0;
4158 aSig
= - ( ( bSig
>>2 ) * q
);
4162 if ( 0 < expDiff
) {
4163 q
= estimateDiv128To64( aSig
, 0, bSig
);
4164 q
= ( 2 < q
) ? q
- 2 : 0;
4167 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4174 alternateASig
= aSig
;
4177 } while ( 0 <= (int64_t) aSig
);
4178 sigMean
= aSig
+ alternateASig
;
4179 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4180 aSig
= alternateASig
;
4182 zSign
= ( (int64_t) aSig
< 0 );
4183 if ( zSign
) aSig
= - aSig
;
4184 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
4188 /*----------------------------------------------------------------------------
4189 | Returns the binary log of the double-precision floating-point value `a'.
4190 | The operation is performed according to the IEC/IEEE Standard for Binary
4191 | Floating-Point Arithmetic.
4192 *----------------------------------------------------------------------------*/
4193 float64
float64_log2(float64 a
, float_status
*status
)
4197 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
4198 a
= float64_squash_input_denormal(a
, status
);
4200 aSig
= extractFloat64Frac( a
);
4201 aExp
= extractFloat64Exp( a
);
4202 aSign
= extractFloat64Sign( a
);
4205 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
4206 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4209 float_raise(float_flag_invalid
, status
);
4210 return float64_default_nan(status
);
4212 if ( aExp
== 0x7FF ) {
4214 return propagateFloat64NaN(a
, float64_zero
, status
);
4220 aSig
|= LIT64( 0x0010000000000000 );
4222 zSig
= (uint64_t)aExp
<< 52;
4223 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
4224 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
4225 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
4226 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
4234 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
4237 /*----------------------------------------------------------------------------
4238 | Returns 1 if the double-precision floating-point value `a' is equal to the
4239 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4240 | if either operand is a NaN. Otherwise, the comparison is performed
4241 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4242 *----------------------------------------------------------------------------*/
4244 int float64_eq(float64 a
, float64 b
, float_status
*status
)
4247 a
= float64_squash_input_denormal(a
, status
);
4248 b
= float64_squash_input_denormal(b
, status
);
4250 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4251 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4253 float_raise(float_flag_invalid
, status
);
4256 av
= float64_val(a
);
4257 bv
= float64_val(b
);
4258 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4262 /*----------------------------------------------------------------------------
4263 | Returns 1 if the double-precision floating-point value `a' is less than or
4264 | equal to the corresponding value `b', and 0 otherwise. The invalid
4265 | exception is raised if either operand is a NaN. The comparison is performed
4266 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4267 *----------------------------------------------------------------------------*/
4269 int float64_le(float64 a
, float64 b
, float_status
*status
)
4273 a
= float64_squash_input_denormal(a
, status
);
4274 b
= float64_squash_input_denormal(b
, status
);
4276 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4277 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4279 float_raise(float_flag_invalid
, status
);
4282 aSign
= extractFloat64Sign( a
);
4283 bSign
= extractFloat64Sign( b
);
4284 av
= float64_val(a
);
4285 bv
= float64_val(b
);
4286 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4287 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4291 /*----------------------------------------------------------------------------
4292 | Returns 1 if the double-precision floating-point value `a' is less than
4293 | the corresponding value `b', and 0 otherwise. The invalid exception is
4294 | raised if either operand is a NaN. The comparison is performed according
4295 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4296 *----------------------------------------------------------------------------*/
4298 int float64_lt(float64 a
, float64 b
, float_status
*status
)
4303 a
= float64_squash_input_denormal(a
, status
);
4304 b
= float64_squash_input_denormal(b
, status
);
4305 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4306 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4308 float_raise(float_flag_invalid
, status
);
4311 aSign
= extractFloat64Sign( a
);
4312 bSign
= extractFloat64Sign( b
);
4313 av
= float64_val(a
);
4314 bv
= float64_val(b
);
4315 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4316 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4320 /*----------------------------------------------------------------------------
4321 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4322 | be compared, and 0 otherwise. The invalid exception is raised if either
4323 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4324 | Standard for Binary Floating-Point Arithmetic.
4325 *----------------------------------------------------------------------------*/
4327 int float64_unordered(float64 a
, float64 b
, float_status
*status
)
4329 a
= float64_squash_input_denormal(a
, status
);
4330 b
= float64_squash_input_denormal(b
, status
);
4332 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4333 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4335 float_raise(float_flag_invalid
, status
);
4341 /*----------------------------------------------------------------------------
4342 | Returns 1 if the double-precision floating-point value `a' is equal to the
4343 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4344 | exception.The comparison is performed according to the IEC/IEEE Standard
4345 | for Binary Floating-Point Arithmetic.
4346 *----------------------------------------------------------------------------*/
4348 int float64_eq_quiet(float64 a
, float64 b
, float_status
*status
)
4351 a
= float64_squash_input_denormal(a
, status
);
4352 b
= float64_squash_input_denormal(b
, status
);
4354 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4355 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4357 if (float64_is_signaling_nan(a
, status
)
4358 || float64_is_signaling_nan(b
, status
)) {
4359 float_raise(float_flag_invalid
, status
);
4363 av
= float64_val(a
);
4364 bv
= float64_val(b
);
4365 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4369 /*----------------------------------------------------------------------------
4370 | Returns 1 if the double-precision floating-point value `a' is less than or
4371 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4372 | cause an exception. Otherwise, the comparison is performed according to the
4373 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4374 *----------------------------------------------------------------------------*/
4376 int float64_le_quiet(float64 a
, float64 b
, float_status
*status
)
4380 a
= float64_squash_input_denormal(a
, status
);
4381 b
= float64_squash_input_denormal(b
, status
);
4383 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4384 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4386 if (float64_is_signaling_nan(a
, status
)
4387 || float64_is_signaling_nan(b
, status
)) {
4388 float_raise(float_flag_invalid
, status
);
4392 aSign
= extractFloat64Sign( a
);
4393 bSign
= extractFloat64Sign( b
);
4394 av
= float64_val(a
);
4395 bv
= float64_val(b
);
4396 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4397 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4401 /*----------------------------------------------------------------------------
4402 | Returns 1 if the double-precision floating-point value `a' is less than
4403 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4404 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4405 | Standard for Binary Floating-Point Arithmetic.
4406 *----------------------------------------------------------------------------*/
4408 int float64_lt_quiet(float64 a
, float64 b
, float_status
*status
)
4412 a
= float64_squash_input_denormal(a
, status
);
4413 b
= float64_squash_input_denormal(b
, status
);
4415 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4416 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4418 if (float64_is_signaling_nan(a
, status
)
4419 || float64_is_signaling_nan(b
, status
)) {
4420 float_raise(float_flag_invalid
, status
);
4424 aSign
= extractFloat64Sign( a
);
4425 bSign
= extractFloat64Sign( b
);
4426 av
= float64_val(a
);
4427 bv
= float64_val(b
);
4428 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4429 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4433 /*----------------------------------------------------------------------------
4434 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4435 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4436 | comparison is performed according to the IEC/IEEE Standard for Binary
4437 | Floating-Point Arithmetic.
4438 *----------------------------------------------------------------------------*/
4440 int float64_unordered_quiet(float64 a
, float64 b
, float_status
*status
)
4442 a
= float64_squash_input_denormal(a
, status
);
4443 b
= float64_squash_input_denormal(b
, status
);
4445 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4446 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4448 if (float64_is_signaling_nan(a
, status
)
4449 || float64_is_signaling_nan(b
, status
)) {
4450 float_raise(float_flag_invalid
, status
);
4457 /*----------------------------------------------------------------------------
4458 | Returns the result of converting the extended double-precision floating-
4459 | point value `a' to the 32-bit two's complement integer format. The
4460 | conversion is performed according to the IEC/IEEE Standard for Binary
4461 | Floating-Point Arithmetic---which means in particular that the conversion
4462 | is rounded according to the current rounding mode. If `a' is a NaN, the
4463 | largest positive integer is returned. Otherwise, if the conversion
4464 | overflows, the largest integer with the same sign as `a' is returned.
4465 *----------------------------------------------------------------------------*/
4467 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
4470 int32_t aExp
, shiftCount
;
4473 if (floatx80_invalid_encoding(a
)) {
4474 float_raise(float_flag_invalid
, status
);
4477 aSig
= extractFloatx80Frac( a
);
4478 aExp
= extractFloatx80Exp( a
);
4479 aSign
= extractFloatx80Sign( a
);
4480 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4481 shiftCount
= 0x4037 - aExp
;
4482 if ( shiftCount
<= 0 ) shiftCount
= 1;
4483 shift64RightJamming( aSig
, shiftCount
, &aSig
);
4484 return roundAndPackInt32(aSign
, aSig
, status
);
4488 /*----------------------------------------------------------------------------
4489 | Returns the result of converting the extended double-precision floating-
4490 | point value `a' to the 32-bit two's complement integer format. The
4491 | conversion is performed according to the IEC/IEEE Standard for Binary
4492 | Floating-Point Arithmetic, except that the conversion is always rounded
4493 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4494 | Otherwise, if the conversion overflows, the largest integer with the same
4495 | sign as `a' is returned.
4496 *----------------------------------------------------------------------------*/
4498 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
4501 int32_t aExp
, shiftCount
;
4502 uint64_t aSig
, savedASig
;
4505 if (floatx80_invalid_encoding(a
)) {
4506 float_raise(float_flag_invalid
, status
);
4509 aSig
= extractFloatx80Frac( a
);
4510 aExp
= extractFloatx80Exp( a
);
4511 aSign
= extractFloatx80Sign( a
);
4512 if ( 0x401E < aExp
) {
4513 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4516 else if ( aExp
< 0x3FFF ) {
4518 status
->float_exception_flags
|= float_flag_inexact
;
4522 shiftCount
= 0x403E - aExp
;
4524 aSig
>>= shiftCount
;
4526 if ( aSign
) z
= - z
;
4527 if ( ( z
< 0 ) ^ aSign
) {
4529 float_raise(float_flag_invalid
, status
);
4530 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4532 if ( ( aSig
<<shiftCount
) != savedASig
) {
4533 status
->float_exception_flags
|= float_flag_inexact
;
4539 /*----------------------------------------------------------------------------
4540 | Returns the result of converting the extended double-precision floating-
4541 | point value `a' to the 64-bit two's complement integer format. The
4542 | conversion is performed according to the IEC/IEEE Standard for Binary
4543 | Floating-Point Arithmetic---which means in particular that the conversion
4544 | is rounded according to the current rounding mode. If `a' is a NaN,
4545 | the largest positive integer is returned. Otherwise, if the conversion
4546 | overflows, the largest integer with the same sign as `a' is returned.
4547 *----------------------------------------------------------------------------*/
4549 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
4552 int32_t aExp
, shiftCount
;
4553 uint64_t aSig
, aSigExtra
;
4555 if (floatx80_invalid_encoding(a
)) {
4556 float_raise(float_flag_invalid
, status
);
4559 aSig
= extractFloatx80Frac( a
);
4560 aExp
= extractFloatx80Exp( a
);
4561 aSign
= extractFloatx80Sign( a
);
4562 shiftCount
= 0x403E - aExp
;
4563 if ( shiftCount
<= 0 ) {
4565 float_raise(float_flag_invalid
, status
);
4566 if (!aSign
|| floatx80_is_any_nan(a
)) {
4567 return LIT64( 0x7FFFFFFFFFFFFFFF );
4569 return (int64_t) LIT64( 0x8000000000000000 );
4574 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
4576 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
4580 /*----------------------------------------------------------------------------
4581 | Returns the result of converting the extended double-precision floating-
4582 | point value `a' to the 64-bit two's complement integer format. The
4583 | conversion is performed according to the IEC/IEEE Standard for Binary
4584 | Floating-Point Arithmetic, except that the conversion is always rounded
4585 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4586 | Otherwise, if the conversion overflows, the largest integer with the same
4587 | sign as `a' is returned.
4588 *----------------------------------------------------------------------------*/
4590 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
4593 int32_t aExp
, shiftCount
;
4597 if (floatx80_invalid_encoding(a
)) {
4598 float_raise(float_flag_invalid
, status
);
4601 aSig
= extractFloatx80Frac( a
);
4602 aExp
= extractFloatx80Exp( a
);
4603 aSign
= extractFloatx80Sign( a
);
4604 shiftCount
= aExp
- 0x403E;
4605 if ( 0 <= shiftCount
) {
4606 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
4607 if ( ( a
.high
!= 0xC03E ) || aSig
) {
4608 float_raise(float_flag_invalid
, status
);
4609 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
4610 return LIT64( 0x7FFFFFFFFFFFFFFF );
4613 return (int64_t) LIT64( 0x8000000000000000 );
4615 else if ( aExp
< 0x3FFF ) {
4617 status
->float_exception_flags
|= float_flag_inexact
;
4621 z
= aSig
>>( - shiftCount
);
4622 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
4623 status
->float_exception_flags
|= float_flag_inexact
;
4625 if ( aSign
) z
= - z
;
4630 /*----------------------------------------------------------------------------
4631 | Returns the result of converting the extended double-precision floating-
4632 | point value `a' to the single-precision floating-point format. The
4633 | conversion is performed according to the IEC/IEEE Standard for Binary
4634 | Floating-Point Arithmetic.
4635 *----------------------------------------------------------------------------*/
4637 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
4643 if (floatx80_invalid_encoding(a
)) {
4644 float_raise(float_flag_invalid
, status
);
4645 return float32_default_nan(status
);
4647 aSig
= extractFloatx80Frac( a
);
4648 aExp
= extractFloatx80Exp( a
);
4649 aSign
= extractFloatx80Sign( a
);
4650 if ( aExp
== 0x7FFF ) {
4651 if ( (uint64_t) ( aSig
<<1 ) ) {
4652 return commonNaNToFloat32(floatx80ToCommonNaN(a
, status
), status
);
4654 return packFloat32( aSign
, 0xFF, 0 );
4656 shift64RightJamming( aSig
, 33, &aSig
);
4657 if ( aExp
|| aSig
) aExp
-= 0x3F81;
4658 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
4662 /*----------------------------------------------------------------------------
4663 | Returns the result of converting the extended double-precision floating-
4664 | point value `a' to the double-precision floating-point format. The
4665 | conversion is performed according to the IEC/IEEE Standard for Binary
4666 | Floating-Point Arithmetic.
4667 *----------------------------------------------------------------------------*/
4669 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
4673 uint64_t aSig
, zSig
;
4675 if (floatx80_invalid_encoding(a
)) {
4676 float_raise(float_flag_invalid
, status
);
4677 return float64_default_nan(status
);
4679 aSig
= extractFloatx80Frac( a
);
4680 aExp
= extractFloatx80Exp( a
);
4681 aSign
= extractFloatx80Sign( a
);
4682 if ( aExp
== 0x7FFF ) {
4683 if ( (uint64_t) ( aSig
<<1 ) ) {
4684 return commonNaNToFloat64(floatx80ToCommonNaN(a
, status
), status
);
4686 return packFloat64( aSign
, 0x7FF, 0 );
4688 shift64RightJamming( aSig
, 1, &zSig
);
4689 if ( aExp
|| aSig
) aExp
-= 0x3C01;
4690 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
4694 /*----------------------------------------------------------------------------
4695 | Returns the result of converting the extended double-precision floating-
4696 | point value `a' to the quadruple-precision floating-point format. The
4697 | conversion is performed according to the IEC/IEEE Standard for Binary
4698 | Floating-Point Arithmetic.
4699 *----------------------------------------------------------------------------*/
4701 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
4705 uint64_t aSig
, zSig0
, zSig1
;
4707 if (floatx80_invalid_encoding(a
)) {
4708 float_raise(float_flag_invalid
, status
);
4709 return float128_default_nan(status
);
4711 aSig
= extractFloatx80Frac( a
);
4712 aExp
= extractFloatx80Exp( a
);
4713 aSign
= extractFloatx80Sign( a
);
4714 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
4715 return commonNaNToFloat128(floatx80ToCommonNaN(a
, status
), status
);
4717 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
4718 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
4722 /*----------------------------------------------------------------------------
4723 | Rounds the extended double-precision floating-point value `a'
4724 | to the precision provided by floatx80_rounding_precision and returns the
4725 | result as an extended double-precision floating-point value.
4726 | The operation is performed according to the IEC/IEEE Standard for Binary
4727 | Floating-Point Arithmetic.
4728 *----------------------------------------------------------------------------*/
4730 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
4732 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
4733 extractFloatx80Sign(a
),
4734 extractFloatx80Exp(a
),
4735 extractFloatx80Frac(a
), 0, status
);
4738 /*----------------------------------------------------------------------------
4739 | Rounds the extended double-precision floating-point value `a' to an integer,
4740 | and returns the result as an extended quadruple-precision floating-point
4741 | value. The operation is performed according to the IEC/IEEE Standard for
4742 | Binary Floating-Point Arithmetic.
4743 *----------------------------------------------------------------------------*/
4745 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
4749 uint64_t lastBitMask
, roundBitsMask
;
4752 if (floatx80_invalid_encoding(a
)) {
4753 float_raise(float_flag_invalid
, status
);
4754 return floatx80_default_nan(status
);
4756 aExp
= extractFloatx80Exp( a
);
4757 if ( 0x403E <= aExp
) {
4758 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
4759 return propagateFloatx80NaN(a
, a
, status
);
4763 if ( aExp
< 0x3FFF ) {
4765 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
4768 status
->float_exception_flags
|= float_flag_inexact
;
4769 aSign
= extractFloatx80Sign( a
);
4770 switch (status
->float_rounding_mode
) {
4771 case float_round_nearest_even
:
4772 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
4775 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
4778 case float_round_ties_away
:
4779 if (aExp
== 0x3FFE) {
4780 return packFloatx80(aSign
, 0x3FFF, LIT64(0x8000000000000000));
4783 case float_round_down
:
4786 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4787 : packFloatx80( 0, 0, 0 );
4788 case float_round_up
:
4790 aSign
? packFloatx80( 1, 0, 0 )
4791 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4793 return packFloatx80( aSign
, 0, 0 );
4796 lastBitMask
<<= 0x403E - aExp
;
4797 roundBitsMask
= lastBitMask
- 1;
4799 switch (status
->float_rounding_mode
) {
4800 case float_round_nearest_even
:
4801 z
.low
+= lastBitMask
>>1;
4802 if ((z
.low
& roundBitsMask
) == 0) {
4803 z
.low
&= ~lastBitMask
;
4806 case float_round_ties_away
:
4807 z
.low
+= lastBitMask
>> 1;
4809 case float_round_to_zero
:
4811 case float_round_up
:
4812 if (!extractFloatx80Sign(z
)) {
4813 z
.low
+= roundBitsMask
;
4816 case float_round_down
:
4817 if (extractFloatx80Sign(z
)) {
4818 z
.low
+= roundBitsMask
;
4824 z
.low
&= ~ roundBitsMask
;
4827 z
.low
= LIT64( 0x8000000000000000 );
4829 if (z
.low
!= a
.low
) {
4830 status
->float_exception_flags
|= float_flag_inexact
;
4836 /*----------------------------------------------------------------------------
4837 | Returns the result of adding the absolute values of the extended double-
4838 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4839 | negated before being returned. `zSign' is ignored if the result is a NaN.
4840 | The addition is performed according to the IEC/IEEE Standard for Binary
4841 | Floating-Point Arithmetic.
4842 *----------------------------------------------------------------------------*/
4844 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
4845 float_status
*status
)
4847 int32_t aExp
, bExp
, zExp
;
4848 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4851 aSig
= extractFloatx80Frac( a
);
4852 aExp
= extractFloatx80Exp( a
);
4853 bSig
= extractFloatx80Frac( b
);
4854 bExp
= extractFloatx80Exp( b
);
4855 expDiff
= aExp
- bExp
;
4856 if ( 0 < expDiff
) {
4857 if ( aExp
== 0x7FFF ) {
4858 if ((uint64_t)(aSig
<< 1)) {
4859 return propagateFloatx80NaN(a
, b
, status
);
4863 if ( bExp
== 0 ) --expDiff
;
4864 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4867 else if ( expDiff
< 0 ) {
4868 if ( bExp
== 0x7FFF ) {
4869 if ((uint64_t)(bSig
<< 1)) {
4870 return propagateFloatx80NaN(a
, b
, status
);
4872 return packFloatx80(zSign
,
4873 floatx80_infinity_high
,
4874 floatx80_infinity_low
);
4876 if ( aExp
== 0 ) ++expDiff
;
4877 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4881 if ( aExp
== 0x7FFF ) {
4882 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4883 return propagateFloatx80NaN(a
, b
, status
);
4888 zSig0
= aSig
+ bSig
;
4890 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
4896 zSig0
= aSig
+ bSig
;
4897 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
4899 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4900 zSig0
|= LIT64( 0x8000000000000000 );
4903 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
4904 zSign
, zExp
, zSig0
, zSig1
, status
);
4907 /*----------------------------------------------------------------------------
4908 | Returns the result of subtracting the absolute values of the extended
4909 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4910 | difference is negated before being returned. `zSign' is ignored if the
4911 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4912 | Standard for Binary Floating-Point Arithmetic.
4913 *----------------------------------------------------------------------------*/
4915 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
4916 float_status
*status
)
4918 int32_t aExp
, bExp
, zExp
;
4919 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4922 aSig
= extractFloatx80Frac( a
);
4923 aExp
= extractFloatx80Exp( a
);
4924 bSig
= extractFloatx80Frac( b
);
4925 bExp
= extractFloatx80Exp( b
);
4926 expDiff
= aExp
- bExp
;
4927 if ( 0 < expDiff
) goto aExpBigger
;
4928 if ( expDiff
< 0 ) goto bExpBigger
;
4929 if ( aExp
== 0x7FFF ) {
4930 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4931 return propagateFloatx80NaN(a
, b
, status
);
4933 float_raise(float_flag_invalid
, status
);
4934 return floatx80_default_nan(status
);
4941 if ( bSig
< aSig
) goto aBigger
;
4942 if ( aSig
< bSig
) goto bBigger
;
4943 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
4945 if ( bExp
== 0x7FFF ) {
4946 if ((uint64_t)(bSig
<< 1)) {
4947 return propagateFloatx80NaN(a
, b
, status
);
4949 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
4950 floatx80_infinity_low
);
4952 if ( aExp
== 0 ) ++expDiff
;
4953 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4955 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
4958 goto normalizeRoundAndPack
;
4960 if ( aExp
== 0x7FFF ) {
4961 if ((uint64_t)(aSig
<< 1)) {
4962 return propagateFloatx80NaN(a
, b
, status
);
4966 if ( bExp
== 0 ) --expDiff
;
4967 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4969 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
4971 normalizeRoundAndPack
:
4972 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
4973 zSign
, zExp
, zSig0
, zSig1
, status
);
4976 /*----------------------------------------------------------------------------
4977 | Returns the result of adding the extended double-precision floating-point
4978 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4979 | Standard for Binary Floating-Point Arithmetic.
4980 *----------------------------------------------------------------------------*/
4982 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
4986 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
4987 float_raise(float_flag_invalid
, status
);
4988 return floatx80_default_nan(status
);
4990 aSign
= extractFloatx80Sign( a
);
4991 bSign
= extractFloatx80Sign( b
);
4992 if ( aSign
== bSign
) {
4993 return addFloatx80Sigs(a
, b
, aSign
, status
);
4996 return subFloatx80Sigs(a
, b
, aSign
, status
);
5001 /*----------------------------------------------------------------------------
5002 | Returns the result of subtracting the extended double-precision floating-
5003 | point values `a' and `b'. The operation is performed according to the
5004 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5005 *----------------------------------------------------------------------------*/
5007 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5011 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5012 float_raise(float_flag_invalid
, status
);
5013 return floatx80_default_nan(status
);
5015 aSign
= extractFloatx80Sign( a
);
5016 bSign
= extractFloatx80Sign( b
);
5017 if ( aSign
== bSign
) {
5018 return subFloatx80Sigs(a
, b
, aSign
, status
);
5021 return addFloatx80Sigs(a
, b
, aSign
, status
);
5026 /*----------------------------------------------------------------------------
5027 | Returns the result of multiplying the extended double-precision floating-
5028 | point values `a' and `b'. The operation is performed according to the
5029 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5030 *----------------------------------------------------------------------------*/
5032 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5034 flag aSign
, bSign
, zSign
;
5035 int32_t aExp
, bExp
, zExp
;
5036 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5038 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5039 float_raise(float_flag_invalid
, status
);
5040 return floatx80_default_nan(status
);
5042 aSig
= extractFloatx80Frac( a
);
5043 aExp
= extractFloatx80Exp( a
);
5044 aSign
= extractFloatx80Sign( a
);
5045 bSig
= extractFloatx80Frac( b
);
5046 bExp
= extractFloatx80Exp( b
);
5047 bSign
= extractFloatx80Sign( b
);
5048 zSign
= aSign
^ bSign
;
5049 if ( aExp
== 0x7FFF ) {
5050 if ( (uint64_t) ( aSig
<<1 )
5051 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5052 return propagateFloatx80NaN(a
, b
, status
);
5054 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5055 return packFloatx80(zSign
, floatx80_infinity_high
,
5056 floatx80_infinity_low
);
5058 if ( bExp
== 0x7FFF ) {
5059 if ((uint64_t)(bSig
<< 1)) {
5060 return propagateFloatx80NaN(a
, b
, status
);
5062 if ( ( aExp
| aSig
) == 0 ) {
5064 float_raise(float_flag_invalid
, status
);
5065 return floatx80_default_nan(status
);
5067 return packFloatx80(zSign
, floatx80_infinity_high
,
5068 floatx80_infinity_low
);
5071 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5072 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5075 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5076 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5078 zExp
= aExp
+ bExp
- 0x3FFE;
5079 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5080 if ( 0 < (int64_t) zSig0
) {
5081 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5084 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5085 zSign
, zExp
, zSig0
, zSig1
, status
);
5088 /*----------------------------------------------------------------------------
5089 | Returns the result of dividing the extended double-precision floating-point
5090 | value `a' by the corresponding value `b'. The operation is performed
5091 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5092 *----------------------------------------------------------------------------*/
5094 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
5096 flag aSign
, bSign
, zSign
;
5097 int32_t aExp
, bExp
, zExp
;
5098 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5099 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
5101 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5102 float_raise(float_flag_invalid
, status
);
5103 return floatx80_default_nan(status
);
5105 aSig
= extractFloatx80Frac( a
);
5106 aExp
= extractFloatx80Exp( a
);
5107 aSign
= extractFloatx80Sign( a
);
5108 bSig
= extractFloatx80Frac( b
);
5109 bExp
= extractFloatx80Exp( b
);
5110 bSign
= extractFloatx80Sign( b
);
5111 zSign
= aSign
^ bSign
;
5112 if ( aExp
== 0x7FFF ) {
5113 if ((uint64_t)(aSig
<< 1)) {
5114 return propagateFloatx80NaN(a
, b
, status
);
5116 if ( bExp
== 0x7FFF ) {
5117 if ((uint64_t)(bSig
<< 1)) {
5118 return propagateFloatx80NaN(a
, b
, status
);
5122 return packFloatx80(zSign
, floatx80_infinity_high
,
5123 floatx80_infinity_low
);
5125 if ( bExp
== 0x7FFF ) {
5126 if ((uint64_t)(bSig
<< 1)) {
5127 return propagateFloatx80NaN(a
, b
, status
);
5129 return packFloatx80( zSign
, 0, 0 );
5133 if ( ( aExp
| aSig
) == 0 ) {
5135 float_raise(float_flag_invalid
, status
);
5136 return floatx80_default_nan(status
);
5138 float_raise(float_flag_divbyzero
, status
);
5139 return packFloatx80(zSign
, floatx80_infinity_high
,
5140 floatx80_infinity_low
);
5142 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5145 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5146 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5148 zExp
= aExp
- bExp
+ 0x3FFE;
5150 if ( bSig
<= aSig
) {
5151 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
5154 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
5155 mul64To128( bSig
, zSig0
, &term0
, &term1
);
5156 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
5157 while ( (int64_t) rem0
< 0 ) {
5159 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
5161 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
5162 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
5163 mul64To128( bSig
, zSig1
, &term1
, &term2
);
5164 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5165 while ( (int64_t) rem1
< 0 ) {
5167 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
5169 zSig1
|= ( ( rem1
| rem2
) != 0 );
5171 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5172 zSign
, zExp
, zSig0
, zSig1
, status
);
5175 /*----------------------------------------------------------------------------
5176 | Returns the remainder of the extended double-precision floating-point value
5177 | `a' with respect to the corresponding value `b'. The operation is performed
5178 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5179 *----------------------------------------------------------------------------*/
5181 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
5184 int32_t aExp
, bExp
, expDiff
;
5185 uint64_t aSig0
, aSig1
, bSig
;
5186 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
5188 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5189 float_raise(float_flag_invalid
, status
);
5190 return floatx80_default_nan(status
);
5192 aSig0
= extractFloatx80Frac( a
);
5193 aExp
= extractFloatx80Exp( a
);
5194 aSign
= extractFloatx80Sign( a
);
5195 bSig
= extractFloatx80Frac( b
);
5196 bExp
= extractFloatx80Exp( b
);
5197 if ( aExp
== 0x7FFF ) {
5198 if ( (uint64_t) ( aSig0
<<1 )
5199 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5200 return propagateFloatx80NaN(a
, b
, status
);
5204 if ( bExp
== 0x7FFF ) {
5205 if ((uint64_t)(bSig
<< 1)) {
5206 return propagateFloatx80NaN(a
, b
, status
);
5213 float_raise(float_flag_invalid
, status
);
5214 return floatx80_default_nan(status
);
5216 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5219 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
5220 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5222 bSig
|= LIT64( 0x8000000000000000 );
5224 expDiff
= aExp
- bExp
;
5226 if ( expDiff
< 0 ) {
5227 if ( expDiff
< -1 ) return a
;
5228 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
5231 q
= ( bSig
<= aSig0
);
5232 if ( q
) aSig0
-= bSig
;
5234 while ( 0 < expDiff
) {
5235 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5236 q
= ( 2 < q
) ? q
- 2 : 0;
5237 mul64To128( bSig
, q
, &term0
, &term1
);
5238 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5239 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
5243 if ( 0 < expDiff
) {
5244 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5245 q
= ( 2 < q
) ? q
- 2 : 0;
5247 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
5248 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5249 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
5250 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
5252 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5259 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
5260 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5261 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5264 aSig0
= alternateASig0
;
5265 aSig1
= alternateASig1
;
5269 normalizeRoundAndPackFloatx80(
5270 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
5274 /*----------------------------------------------------------------------------
5275 | Returns the square root of the extended double-precision floating-point
5276 | value `a'. The operation is performed according to the IEC/IEEE Standard
5277 | for Binary Floating-Point Arithmetic.
5278 *----------------------------------------------------------------------------*/
5280 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
5284 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
5285 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5287 if (floatx80_invalid_encoding(a
)) {
5288 float_raise(float_flag_invalid
, status
);
5289 return floatx80_default_nan(status
);
5291 aSig0
= extractFloatx80Frac( a
);
5292 aExp
= extractFloatx80Exp( a
);
5293 aSign
= extractFloatx80Sign( a
);
5294 if ( aExp
== 0x7FFF ) {
5295 if ((uint64_t)(aSig0
<< 1)) {
5296 return propagateFloatx80NaN(a
, a
, status
);
5298 if ( ! aSign
) return a
;
5302 if ( ( aExp
| aSig0
) == 0 ) return a
;
5304 float_raise(float_flag_invalid
, status
);
5305 return floatx80_default_nan(status
);
5308 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
5309 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5311 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
5312 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
5313 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
5314 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5315 doubleZSig0
= zSig0
<<1;
5316 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5317 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5318 while ( (int64_t) rem0
< 0 ) {
5321 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5323 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5324 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5325 if ( zSig1
== 0 ) zSig1
= 1;
5326 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5327 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5328 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5329 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5330 while ( (int64_t) rem1
< 0 ) {
5332 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5334 term2
|= doubleZSig0
;
5335 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5337 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5339 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
5340 zSig0
|= doubleZSig0
;
5341 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5342 0, zExp
, zSig0
, zSig1
, status
);
5345 /*----------------------------------------------------------------------------
5346 | Returns 1 if the extended double-precision floating-point value `a' is equal
5347 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5348 | raised if either operand is a NaN. Otherwise, the comparison is performed
5349 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5350 *----------------------------------------------------------------------------*/
5352 int floatx80_eq(floatx80 a
, floatx80 b
, float_status
*status
)
5355 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5356 || (extractFloatx80Exp(a
) == 0x7FFF
5357 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5358 || (extractFloatx80Exp(b
) == 0x7FFF
5359 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5361 float_raise(float_flag_invalid
, status
);
5366 && ( ( a
.high
== b
.high
)
5368 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5373 /*----------------------------------------------------------------------------
5374 | Returns 1 if the extended double-precision floating-point value `a' is
5375 | less than or equal to the corresponding value `b', and 0 otherwise. The
5376 | invalid exception is raised if either operand is a NaN. The comparison is
5377 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5379 *----------------------------------------------------------------------------*/
5381 int floatx80_le(floatx80 a
, floatx80 b
, float_status
*status
)
5385 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5386 || (extractFloatx80Exp(a
) == 0x7FFF
5387 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5388 || (extractFloatx80Exp(b
) == 0x7FFF
5389 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5391 float_raise(float_flag_invalid
, status
);
5394 aSign
= extractFloatx80Sign( a
);
5395 bSign
= extractFloatx80Sign( b
);
5396 if ( aSign
!= bSign
) {
5399 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5403 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5404 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5408 /*----------------------------------------------------------------------------
5409 | Returns 1 if the extended double-precision floating-point value `a' is
5410 | less than the corresponding value `b', and 0 otherwise. The invalid
5411 | exception is raised if either operand is a NaN. The comparison is performed
5412 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5413 *----------------------------------------------------------------------------*/
5415 int floatx80_lt(floatx80 a
, floatx80 b
, float_status
*status
)
5419 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5420 || (extractFloatx80Exp(a
) == 0x7FFF
5421 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5422 || (extractFloatx80Exp(b
) == 0x7FFF
5423 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5425 float_raise(float_flag_invalid
, status
);
5428 aSign
= extractFloatx80Sign( a
);
5429 bSign
= extractFloatx80Sign( b
);
5430 if ( aSign
!= bSign
) {
5433 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5437 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5438 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5442 /*----------------------------------------------------------------------------
5443 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5444 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5445 | either operand is a NaN. The comparison is performed according to the
5446 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5447 *----------------------------------------------------------------------------*/
5448 int floatx80_unordered(floatx80 a
, floatx80 b
, float_status
*status
)
5450 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5451 || (extractFloatx80Exp(a
) == 0x7FFF
5452 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5453 || (extractFloatx80Exp(b
) == 0x7FFF
5454 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5456 float_raise(float_flag_invalid
, status
);
5462 /*----------------------------------------------------------------------------
5463 | Returns 1 if the extended double-precision floating-point value `a' is
5464 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5465 | cause an exception. The comparison is performed according to the IEC/IEEE
5466 | Standard for Binary Floating-Point Arithmetic.
5467 *----------------------------------------------------------------------------*/
5469 int floatx80_eq_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5472 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5473 float_raise(float_flag_invalid
, status
);
5476 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5477 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5478 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5479 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5481 if (floatx80_is_signaling_nan(a
, status
)
5482 || floatx80_is_signaling_nan(b
, status
)) {
5483 float_raise(float_flag_invalid
, status
);
5489 && ( ( a
.high
== b
.high
)
5491 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5496 /*----------------------------------------------------------------------------
5497 | Returns 1 if the extended double-precision floating-point value `a' is less
5498 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5499 | do not cause an exception. Otherwise, the comparison is performed according
5500 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5501 *----------------------------------------------------------------------------*/
5503 int floatx80_le_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5507 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5508 float_raise(float_flag_invalid
, status
);
5511 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5512 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5513 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5514 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5516 if (floatx80_is_signaling_nan(a
, status
)
5517 || floatx80_is_signaling_nan(b
, status
)) {
5518 float_raise(float_flag_invalid
, status
);
5522 aSign
= extractFloatx80Sign( a
);
5523 bSign
= extractFloatx80Sign( b
);
5524 if ( aSign
!= bSign
) {
5527 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5531 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5532 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5536 /*----------------------------------------------------------------------------
5537 | Returns 1 if the extended double-precision floating-point value `a' is less
5538 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5539 | an exception. Otherwise, the comparison is performed according to the
5540 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5541 *----------------------------------------------------------------------------*/
5543 int floatx80_lt_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5547 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5548 float_raise(float_flag_invalid
, status
);
5551 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5552 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5553 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5554 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5556 if (floatx80_is_signaling_nan(a
, status
)
5557 || floatx80_is_signaling_nan(b
, status
)) {
5558 float_raise(float_flag_invalid
, status
);
5562 aSign
= extractFloatx80Sign( a
);
5563 bSign
= extractFloatx80Sign( b
);
5564 if ( aSign
!= bSign
) {
5567 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5571 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5572 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5576 /*----------------------------------------------------------------------------
5577 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5578 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5579 | The comparison is performed according to the IEC/IEEE Standard for Binary
5580 | Floating-Point Arithmetic.
5581 *----------------------------------------------------------------------------*/
5582 int floatx80_unordered_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5584 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5585 float_raise(float_flag_invalid
, status
);
5588 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5589 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5590 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5591 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5593 if (floatx80_is_signaling_nan(a
, status
)
5594 || floatx80_is_signaling_nan(b
, status
)) {
5595 float_raise(float_flag_invalid
, status
);
5602 /*----------------------------------------------------------------------------
5603 | Returns the result of converting the quadruple-precision floating-point
5604 | value `a' to the 32-bit two's complement integer format. The conversion
5605 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5606 | Arithmetic---which means in particular that the conversion is rounded
5607 | according to the current rounding mode. If `a' is a NaN, the largest
5608 | positive integer is returned. Otherwise, if the conversion overflows, the
5609 | largest integer with the same sign as `a' is returned.
5610 *----------------------------------------------------------------------------*/
5612 int32_t float128_to_int32(float128 a
, float_status
*status
)
5615 int32_t aExp
, shiftCount
;
5616 uint64_t aSig0
, aSig1
;
5618 aSig1
= extractFloat128Frac1( a
);
5619 aSig0
= extractFloat128Frac0( a
);
5620 aExp
= extractFloat128Exp( a
);
5621 aSign
= extractFloat128Sign( a
);
5622 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
5623 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5624 aSig0
|= ( aSig1
!= 0 );
5625 shiftCount
= 0x4028 - aExp
;
5626 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
5627 return roundAndPackInt32(aSign
, aSig0
, status
);
5631 /*----------------------------------------------------------------------------
5632 | Returns the result of converting the quadruple-precision floating-point
5633 | value `a' to the 32-bit two's complement integer format. The conversion
5634 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5635 | Arithmetic, except that the conversion is always rounded toward zero. If
5636 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5637 | conversion overflows, the largest integer with the same sign as `a' is
5639 *----------------------------------------------------------------------------*/
5641 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
5644 int32_t aExp
, shiftCount
;
5645 uint64_t aSig0
, aSig1
, savedASig
;
5648 aSig1
= extractFloat128Frac1( a
);
5649 aSig0
= extractFloat128Frac0( a
);
5650 aExp
= extractFloat128Exp( a
);
5651 aSign
= extractFloat128Sign( a
);
5652 aSig0
|= ( aSig1
!= 0 );
5653 if ( 0x401E < aExp
) {
5654 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
5657 else if ( aExp
< 0x3FFF ) {
5658 if (aExp
|| aSig0
) {
5659 status
->float_exception_flags
|= float_flag_inexact
;
5663 aSig0
|= LIT64( 0x0001000000000000 );
5664 shiftCount
= 0x402F - aExp
;
5666 aSig0
>>= shiftCount
;
5668 if ( aSign
) z
= - z
;
5669 if ( ( z
< 0 ) ^ aSign
) {
5671 float_raise(float_flag_invalid
, status
);
5672 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5674 if ( ( aSig0
<<shiftCount
) != savedASig
) {
5675 status
->float_exception_flags
|= float_flag_inexact
;
5681 /*----------------------------------------------------------------------------
5682 | Returns the result of converting the quadruple-precision floating-point
5683 | value `a' to the 64-bit two's complement integer format. The conversion
5684 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5685 | Arithmetic---which means in particular that the conversion is rounded
5686 | according to the current rounding mode. If `a' is a NaN, the largest
5687 | positive integer is returned. Otherwise, if the conversion overflows, the
5688 | largest integer with the same sign as `a' is returned.
5689 *----------------------------------------------------------------------------*/
5691 int64_t float128_to_int64(float128 a
, float_status
*status
)
5694 int32_t aExp
, shiftCount
;
5695 uint64_t aSig0
, aSig1
;
5697 aSig1
= extractFloat128Frac1( a
);
5698 aSig0
= extractFloat128Frac0( a
);
5699 aExp
= extractFloat128Exp( a
);
5700 aSign
= extractFloat128Sign( a
);
5701 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5702 shiftCount
= 0x402F - aExp
;
5703 if ( shiftCount
<= 0 ) {
5704 if ( 0x403E < aExp
) {
5705 float_raise(float_flag_invalid
, status
);
5707 || ( ( aExp
== 0x7FFF )
5708 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
5711 return LIT64( 0x7FFFFFFFFFFFFFFF );
5713 return (int64_t) LIT64( 0x8000000000000000 );
5715 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
5718 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5720 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
5724 /*----------------------------------------------------------------------------
5725 | Returns the result of converting the quadruple-precision floating-point
5726 | value `a' to the 64-bit two's complement integer format. The conversion
5727 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5728 | Arithmetic, except that the conversion is always rounded toward zero.
5729 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5730 | the conversion overflows, the largest integer with the same sign as `a' is
5732 *----------------------------------------------------------------------------*/
5734 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
5737 int32_t aExp
, shiftCount
;
5738 uint64_t aSig0
, aSig1
;
5741 aSig1
= extractFloat128Frac1( a
);
5742 aSig0
= extractFloat128Frac0( a
);
5743 aExp
= extractFloat128Exp( a
);
5744 aSign
= extractFloat128Sign( a
);
5745 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5746 shiftCount
= aExp
- 0x402F;
5747 if ( 0 < shiftCount
) {
5748 if ( 0x403E <= aExp
) {
5749 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
5750 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
5751 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
5753 status
->float_exception_flags
|= float_flag_inexact
;
5757 float_raise(float_flag_invalid
, status
);
5758 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
5759 return LIT64( 0x7FFFFFFFFFFFFFFF );
5762 return (int64_t) LIT64( 0x8000000000000000 );
5764 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
5765 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
5766 status
->float_exception_flags
|= float_flag_inexact
;
5770 if ( aExp
< 0x3FFF ) {
5771 if ( aExp
| aSig0
| aSig1
) {
5772 status
->float_exception_flags
|= float_flag_inexact
;
5776 z
= aSig0
>>( - shiftCount
);
5778 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
5779 status
->float_exception_flags
|= float_flag_inexact
;
5782 if ( aSign
) z
= - z
;
5787 /*----------------------------------------------------------------------------
5788 | Returns the result of converting the quadruple-precision floating-point value
5789 | `a' to the 64-bit unsigned integer format. The conversion is
5790 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5791 | Arithmetic---which means in particular that the conversion is rounded
5792 | according to the current rounding mode. If `a' is a NaN, the largest
5793 | positive integer is returned. If the conversion overflows, the
5794 | largest unsigned integer is returned. If 'a' is negative, the value is
5795 | rounded and zero is returned; negative values that do not round to zero
5796 | will raise the inexact exception.
5797 *----------------------------------------------------------------------------*/
5799 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
5804 uint64_t aSig0
, aSig1
;
5806 aSig0
= extractFloat128Frac0(a
);
5807 aSig1
= extractFloat128Frac1(a
);
5808 aExp
= extractFloat128Exp(a
);
5809 aSign
= extractFloat128Sign(a
);
5810 if (aSign
&& (aExp
> 0x3FFE)) {
5811 float_raise(float_flag_invalid
, status
);
5812 if (float128_is_any_nan(a
)) {
5813 return LIT64(0xFFFFFFFFFFFFFFFF);
5819 aSig0
|= LIT64(0x0001000000000000);
5821 shiftCount
= 0x402F - aExp
;
5822 if (shiftCount
<= 0) {
5823 if (0x403E < aExp
) {
5824 float_raise(float_flag_invalid
, status
);
5825 return LIT64(0xFFFFFFFFFFFFFFFF);
5827 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
5829 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5831 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
5834 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
5837 signed char current_rounding_mode
= status
->float_rounding_mode
;
5839 set_float_rounding_mode(float_round_to_zero
, status
);
5840 v
= float128_to_uint64(a
, status
);
5841 set_float_rounding_mode(current_rounding_mode
, status
);
5846 /*----------------------------------------------------------------------------
5847 | Returns the result of converting the quadruple-precision floating-point
5848 | value `a' to the 32-bit unsigned integer format. The conversion
5849 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5850 | Arithmetic except that the conversion is always rounded toward zero.
5851 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5852 | if the conversion overflows, the largest unsigned integer is returned.
5853 | If 'a' is negative, the value is rounded and zero is returned; negative
5854 | values that do not round to zero will raise the inexact exception.
5855 *----------------------------------------------------------------------------*/
5857 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
5861 int old_exc_flags
= get_float_exception_flags(status
);
5863 v
= float128_to_uint64_round_to_zero(a
, status
);
5864 if (v
> 0xffffffff) {
5869 set_float_exception_flags(old_exc_flags
, status
);
5870 float_raise(float_flag_invalid
, status
);
5874 /*----------------------------------------------------------------------------
5875 | Returns the result of converting the quadruple-precision floating-point
5876 | value `a' to the single-precision floating-point format. The conversion
5877 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5879 *----------------------------------------------------------------------------*/
5881 float32
float128_to_float32(float128 a
, float_status
*status
)
5885 uint64_t aSig0
, aSig1
;
5888 aSig1
= extractFloat128Frac1( a
);
5889 aSig0
= extractFloat128Frac0( a
);
5890 aExp
= extractFloat128Exp( a
);
5891 aSign
= extractFloat128Sign( a
);
5892 if ( aExp
== 0x7FFF ) {
5893 if ( aSig0
| aSig1
) {
5894 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
5896 return packFloat32( aSign
, 0xFF, 0 );
5898 aSig0
|= ( aSig1
!= 0 );
5899 shift64RightJamming( aSig0
, 18, &aSig0
);
5901 if ( aExp
|| zSig
) {
5905 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
5909 /*----------------------------------------------------------------------------
5910 | Returns the result of converting the quadruple-precision floating-point
5911 | value `a' to the double-precision floating-point format. The conversion
5912 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5914 *----------------------------------------------------------------------------*/
5916 float64
float128_to_float64(float128 a
, float_status
*status
)
5920 uint64_t aSig0
, aSig1
;
5922 aSig1
= extractFloat128Frac1( a
);
5923 aSig0
= extractFloat128Frac0( a
);
5924 aExp
= extractFloat128Exp( a
);
5925 aSign
= extractFloat128Sign( a
);
5926 if ( aExp
== 0x7FFF ) {
5927 if ( aSig0
| aSig1
) {
5928 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
5930 return packFloat64( aSign
, 0x7FF, 0 );
5932 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5933 aSig0
|= ( aSig1
!= 0 );
5934 if ( aExp
|| aSig0
) {
5935 aSig0
|= LIT64( 0x4000000000000000 );
5938 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
5942 /*----------------------------------------------------------------------------
5943 | Returns the result of converting the quadruple-precision floating-point
5944 | value `a' to the extended double-precision floating-point format. The
5945 | conversion is performed according to the IEC/IEEE Standard for Binary
5946 | Floating-Point Arithmetic.
5947 *----------------------------------------------------------------------------*/
5949 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
5953 uint64_t aSig0
, aSig1
;
5955 aSig1
= extractFloat128Frac1( a
);
5956 aSig0
= extractFloat128Frac0( a
);
5957 aExp
= extractFloat128Exp( a
);
5958 aSign
= extractFloat128Sign( a
);
5959 if ( aExp
== 0x7FFF ) {
5960 if ( aSig0
| aSig1
) {
5961 return commonNaNToFloatx80(float128ToCommonNaN(a
, status
), status
);
5963 return packFloatx80(aSign
, floatx80_infinity_high
,
5964 floatx80_infinity_low
);
5967 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
5968 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5971 aSig0
|= LIT64( 0x0001000000000000 );
5973 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
5974 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
5978 /*----------------------------------------------------------------------------
5979 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5980 | returns the result as a quadruple-precision floating-point value. The
5981 | operation is performed according to the IEC/IEEE Standard for Binary
5982 | Floating-Point Arithmetic.
5983 *----------------------------------------------------------------------------*/
5985 float128
float128_round_to_int(float128 a
, float_status
*status
)
5989 uint64_t lastBitMask
, roundBitsMask
;
5992 aExp
= extractFloat128Exp( a
);
5993 if ( 0x402F <= aExp
) {
5994 if ( 0x406F <= aExp
) {
5995 if ( ( aExp
== 0x7FFF )
5996 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
5998 return propagateFloat128NaN(a
, a
, status
);
6003 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6004 roundBitsMask
= lastBitMask
- 1;
6006 switch (status
->float_rounding_mode
) {
6007 case float_round_nearest_even
:
6008 if ( lastBitMask
) {
6009 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6010 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6013 if ( (int64_t) z
.low
< 0 ) {
6015 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6019 case float_round_ties_away
:
6021 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6023 if ((int64_t) z
.low
< 0) {
6028 case float_round_to_zero
:
6030 case float_round_up
:
6031 if (!extractFloat128Sign(z
)) {
6032 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6035 case float_round_down
:
6036 if (extractFloat128Sign(z
)) {
6037 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6043 z
.low
&= ~ roundBitsMask
;
6046 if ( aExp
< 0x3FFF ) {
6047 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6048 status
->float_exception_flags
|= float_flag_inexact
;
6049 aSign
= extractFloat128Sign( a
);
6050 switch (status
->float_rounding_mode
) {
6051 case float_round_nearest_even
:
6052 if ( ( aExp
== 0x3FFE )
6053 && ( extractFloat128Frac0( a
)
6054 | extractFloat128Frac1( a
) )
6056 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6059 case float_round_ties_away
:
6060 if (aExp
== 0x3FFE) {
6061 return packFloat128(aSign
, 0x3FFF, 0, 0);
6064 case float_round_down
:
6066 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6067 : packFloat128( 0, 0, 0, 0 );
6068 case float_round_up
:
6070 aSign
? packFloat128( 1, 0, 0, 0 )
6071 : packFloat128( 0, 0x3FFF, 0, 0 );
6073 return packFloat128( aSign
, 0, 0, 0 );
6076 lastBitMask
<<= 0x402F - aExp
;
6077 roundBitsMask
= lastBitMask
- 1;
6080 switch (status
->float_rounding_mode
) {
6081 case float_round_nearest_even
:
6082 z
.high
+= lastBitMask
>>1;
6083 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6084 z
.high
&= ~ lastBitMask
;
6087 case float_round_ties_away
:
6088 z
.high
+= lastBitMask
>>1;
6090 case float_round_to_zero
:
6092 case float_round_up
:
6093 if (!extractFloat128Sign(z
)) {
6094 z
.high
|= ( a
.low
!= 0 );
6095 z
.high
+= roundBitsMask
;
6098 case float_round_down
:
6099 if (extractFloat128Sign(z
)) {
6100 z
.high
|= (a
.low
!= 0);
6101 z
.high
+= roundBitsMask
;
6107 z
.high
&= ~ roundBitsMask
;
6109 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6110 status
->float_exception_flags
|= float_flag_inexact
;
6116 /*----------------------------------------------------------------------------
6117 | Returns the result of adding the absolute values of the quadruple-precision
6118 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6119 | before being returned. `zSign' is ignored if the result is a NaN.
6120 | The addition is performed according to the IEC/IEEE Standard for Binary
6121 | Floating-Point Arithmetic.
6122 *----------------------------------------------------------------------------*/
6124 static float128
addFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6125 float_status
*status
)
6127 int32_t aExp
, bExp
, zExp
;
6128 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6131 aSig1
= extractFloat128Frac1( a
);
6132 aSig0
= extractFloat128Frac0( a
);
6133 aExp
= extractFloat128Exp( a
);
6134 bSig1
= extractFloat128Frac1( b
);
6135 bSig0
= extractFloat128Frac0( b
);
6136 bExp
= extractFloat128Exp( b
);
6137 expDiff
= aExp
- bExp
;
6138 if ( 0 < expDiff
) {
6139 if ( aExp
== 0x7FFF ) {
6140 if (aSig0
| aSig1
) {
6141 return propagateFloat128NaN(a
, b
, status
);
6149 bSig0
|= LIT64( 0x0001000000000000 );
6151 shift128ExtraRightJamming(
6152 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6155 else if ( expDiff
< 0 ) {
6156 if ( bExp
== 0x7FFF ) {
6157 if (bSig0
| bSig1
) {
6158 return propagateFloat128NaN(a
, b
, status
);
6160 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6166 aSig0
|= LIT64( 0x0001000000000000 );
6168 shift128ExtraRightJamming(
6169 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6173 if ( aExp
== 0x7FFF ) {
6174 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6175 return propagateFloat128NaN(a
, b
, status
);
6179 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6181 if (status
->flush_to_zero
) {
6182 if (zSig0
| zSig1
) {
6183 float_raise(float_flag_output_denormal
, status
);
6185 return packFloat128(zSign
, 0, 0, 0);
6187 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6190 zSig0
|= LIT64( 0x0002000000000000 );
6194 aSig0
|= LIT64( 0x0001000000000000 );
6195 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6197 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
6200 shift128ExtraRightJamming(
6201 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6203 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6207 /*----------------------------------------------------------------------------
6208 | Returns the result of subtracting the absolute values of the quadruple-
6209 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6210 | difference is negated before being returned. `zSign' is ignored if the
6211 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6212 | Standard for Binary Floating-Point Arithmetic.
6213 *----------------------------------------------------------------------------*/
6215 static float128
subFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6216 float_status
*status
)
6218 int32_t aExp
, bExp
, zExp
;
6219 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6222 aSig1
= extractFloat128Frac1( a
);
6223 aSig0
= extractFloat128Frac0( a
);
6224 aExp
= extractFloat128Exp( a
);
6225 bSig1
= extractFloat128Frac1( b
);
6226 bSig0
= extractFloat128Frac0( b
);
6227 bExp
= extractFloat128Exp( b
);
6228 expDiff
= aExp
- bExp
;
6229 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6230 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6231 if ( 0 < expDiff
) goto aExpBigger
;
6232 if ( expDiff
< 0 ) goto bExpBigger
;
6233 if ( aExp
== 0x7FFF ) {
6234 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6235 return propagateFloat128NaN(a
, b
, status
);
6237 float_raise(float_flag_invalid
, status
);
6238 return float128_default_nan(status
);
6244 if ( bSig0
< aSig0
) goto aBigger
;
6245 if ( aSig0
< bSig0
) goto bBigger
;
6246 if ( bSig1
< aSig1
) goto aBigger
;
6247 if ( aSig1
< bSig1
) goto bBigger
;
6248 return packFloat128(status
->float_rounding_mode
== float_round_down
,
6251 if ( bExp
== 0x7FFF ) {
6252 if (bSig0
| bSig1
) {
6253 return propagateFloat128NaN(a
, b
, status
);
6255 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
6261 aSig0
|= LIT64( 0x4000000000000000 );
6263 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6264 bSig0
|= LIT64( 0x4000000000000000 );
6266 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6269 goto normalizeRoundAndPack
;
6271 if ( aExp
== 0x7FFF ) {
6272 if (aSig0
| aSig1
) {
6273 return propagateFloat128NaN(a
, b
, status
);
6281 bSig0
|= LIT64( 0x4000000000000000 );
6283 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
6284 aSig0
|= LIT64( 0x4000000000000000 );
6286 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6288 normalizeRoundAndPack
:
6290 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
6295 /*----------------------------------------------------------------------------
6296 | Returns the result of adding the quadruple-precision floating-point values
6297 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6298 | for Binary Floating-Point Arithmetic.
6299 *----------------------------------------------------------------------------*/
6301 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
6305 aSign
= extractFloat128Sign( a
);
6306 bSign
= extractFloat128Sign( b
);
6307 if ( aSign
== bSign
) {
6308 return addFloat128Sigs(a
, b
, aSign
, status
);
6311 return subFloat128Sigs(a
, b
, aSign
, status
);
6316 /*----------------------------------------------------------------------------
6317 | Returns the result of subtracting the quadruple-precision floating-point
6318 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6319 | Standard for Binary Floating-Point Arithmetic.
6320 *----------------------------------------------------------------------------*/
6322 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
6326 aSign
= extractFloat128Sign( a
);
6327 bSign
= extractFloat128Sign( b
);
6328 if ( aSign
== bSign
) {
6329 return subFloat128Sigs(a
, b
, aSign
, status
);
6332 return addFloat128Sigs(a
, b
, aSign
, status
);
6337 /*----------------------------------------------------------------------------
6338 | Returns the result of multiplying the quadruple-precision floating-point
6339 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6340 | Standard for Binary Floating-Point Arithmetic.
6341 *----------------------------------------------------------------------------*/
6343 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
6345 flag aSign
, bSign
, zSign
;
6346 int32_t aExp
, bExp
, zExp
;
6347 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
6349 aSig1
= extractFloat128Frac1( a
);
6350 aSig0
= extractFloat128Frac0( a
);
6351 aExp
= extractFloat128Exp( a
);
6352 aSign
= extractFloat128Sign( a
);
6353 bSig1
= extractFloat128Frac1( b
);
6354 bSig0
= extractFloat128Frac0( b
);
6355 bExp
= extractFloat128Exp( b
);
6356 bSign
= extractFloat128Sign( b
);
6357 zSign
= aSign
^ bSign
;
6358 if ( aExp
== 0x7FFF ) {
6359 if ( ( aSig0
| aSig1
)
6360 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6361 return propagateFloat128NaN(a
, b
, status
);
6363 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
6364 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6366 if ( bExp
== 0x7FFF ) {
6367 if (bSig0
| bSig1
) {
6368 return propagateFloat128NaN(a
, b
, status
);
6370 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6372 float_raise(float_flag_invalid
, status
);
6373 return float128_default_nan(status
);
6375 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6378 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6379 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6382 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6383 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6385 zExp
= aExp
+ bExp
- 0x4000;
6386 aSig0
|= LIT64( 0x0001000000000000 );
6387 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
6388 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
6389 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6390 zSig2
|= ( zSig3
!= 0 );
6391 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
6392 shift128ExtraRightJamming(
6393 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6396 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6400 /*----------------------------------------------------------------------------
6401 | Returns the result of dividing the quadruple-precision floating-point value
6402 | `a' by the corresponding value `b'. The operation is performed according to
6403 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6404 *----------------------------------------------------------------------------*/
6406 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
6408 flag aSign
, bSign
, zSign
;
6409 int32_t aExp
, bExp
, zExp
;
6410 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6411 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6413 aSig1
= extractFloat128Frac1( a
);
6414 aSig0
= extractFloat128Frac0( a
);
6415 aExp
= extractFloat128Exp( a
);
6416 aSign
= extractFloat128Sign( a
);
6417 bSig1
= extractFloat128Frac1( b
);
6418 bSig0
= extractFloat128Frac0( b
);
6419 bExp
= extractFloat128Exp( b
);
6420 bSign
= extractFloat128Sign( b
);
6421 zSign
= aSign
^ bSign
;
6422 if ( aExp
== 0x7FFF ) {
6423 if (aSig0
| aSig1
) {
6424 return propagateFloat128NaN(a
, b
, status
);
6426 if ( bExp
== 0x7FFF ) {
6427 if (bSig0
| bSig1
) {
6428 return propagateFloat128NaN(a
, b
, status
);
6432 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6434 if ( bExp
== 0x7FFF ) {
6435 if (bSig0
| bSig1
) {
6436 return propagateFloat128NaN(a
, b
, status
);
6438 return packFloat128( zSign
, 0, 0, 0 );
6441 if ( ( bSig0
| bSig1
) == 0 ) {
6442 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6444 float_raise(float_flag_invalid
, status
);
6445 return float128_default_nan(status
);
6447 float_raise(float_flag_divbyzero
, status
);
6448 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6450 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6453 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6454 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6456 zExp
= aExp
- bExp
+ 0x3FFD;
6458 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
6460 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6461 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
6462 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
6465 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6466 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
6467 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
6468 while ( (int64_t) rem0
< 0 ) {
6470 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
6472 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
6473 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
6474 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
6475 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
6476 while ( (int64_t) rem1
< 0 ) {
6478 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
6480 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6482 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
6483 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6487 /*----------------------------------------------------------------------------
6488 | Returns the remainder of the quadruple-precision floating-point value `a'
6489 | with respect to the corresponding value `b'. The operation is performed
6490 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6491 *----------------------------------------------------------------------------*/
6493 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
6496 int32_t aExp
, bExp
, expDiff
;
6497 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6498 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6501 aSig1
= extractFloat128Frac1( a
);
6502 aSig0
= extractFloat128Frac0( a
);
6503 aExp
= extractFloat128Exp( a
);
6504 aSign
= extractFloat128Sign( a
);
6505 bSig1
= extractFloat128Frac1( b
);
6506 bSig0
= extractFloat128Frac0( b
);
6507 bExp
= extractFloat128Exp( b
);
6508 if ( aExp
== 0x7FFF ) {
6509 if ( ( aSig0
| aSig1
)
6510 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6511 return propagateFloat128NaN(a
, b
, status
);
6515 if ( bExp
== 0x7FFF ) {
6516 if (bSig0
| bSig1
) {
6517 return propagateFloat128NaN(a
, b
, status
);
6522 if ( ( bSig0
| bSig1
) == 0 ) {
6524 float_raise(float_flag_invalid
, status
);
6525 return float128_default_nan(status
);
6527 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6530 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6531 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6533 expDiff
= aExp
- bExp
;
6534 if ( expDiff
< -1 ) return a
;
6536 aSig0
| LIT64( 0x0001000000000000 ),
6538 15 - ( expDiff
< 0 ),
6543 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6544 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6545 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6547 while ( 0 < expDiff
) {
6548 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6549 q
= ( 4 < q
) ? q
- 4 : 0;
6550 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6551 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6552 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6553 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6556 if ( -64 < expDiff
) {
6557 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6558 q
= ( 4 < q
) ? q
- 4 : 0;
6560 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6562 if ( expDiff
< 0 ) {
6563 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6566 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6568 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6569 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6572 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6573 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6576 alternateASig0
= aSig0
;
6577 alternateASig1
= aSig1
;
6579 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6580 } while ( 0 <= (int64_t) aSig0
);
6582 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6583 if ( ( sigMean0
< 0 )
6584 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6585 aSig0
= alternateASig0
;
6586 aSig1
= alternateASig1
;
6588 zSign
= ( (int64_t) aSig0
< 0 );
6589 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6590 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
6594 /*----------------------------------------------------------------------------
6595 | Returns the square root of the quadruple-precision floating-point value `a'.
6596 | The operation is performed according to the IEC/IEEE Standard for Binary
6597 | Floating-Point Arithmetic.
6598 *----------------------------------------------------------------------------*/
6600 float128
float128_sqrt(float128 a
, float_status
*status
)
6604 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6605 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6607 aSig1
= extractFloat128Frac1( a
);
6608 aSig0
= extractFloat128Frac0( a
);
6609 aExp
= extractFloat128Exp( a
);
6610 aSign
= extractFloat128Sign( a
);
6611 if ( aExp
== 0x7FFF ) {
6612 if (aSig0
| aSig1
) {
6613 return propagateFloat128NaN(a
, a
, status
);
6615 if ( ! aSign
) return a
;
6619 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6621 float_raise(float_flag_invalid
, status
);
6622 return float128_default_nan(status
);
6625 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6626 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6628 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6629 aSig0
|= LIT64( 0x0001000000000000 );
6630 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6631 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6632 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6633 doubleZSig0
= zSig0
<<1;
6634 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6635 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6636 while ( (int64_t) rem0
< 0 ) {
6639 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6641 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6642 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
6643 if ( zSig1
== 0 ) zSig1
= 1;
6644 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6645 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6646 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6647 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6648 while ( (int64_t) rem1
< 0 ) {
6650 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6652 term2
|= doubleZSig0
;
6653 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6655 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6657 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
6658 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
6662 /*----------------------------------------------------------------------------
6663 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6664 | the corresponding value `b', and 0 otherwise. The invalid exception is
6665 | raised if either operand is a NaN. Otherwise, the comparison is performed
6666 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6667 *----------------------------------------------------------------------------*/
6669 int float128_eq(float128 a
, float128 b
, float_status
*status
)
6672 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6673 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6674 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6675 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6677 float_raise(float_flag_invalid
, status
);
6682 && ( ( a
.high
== b
.high
)
6684 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6689 /*----------------------------------------------------------------------------
6690 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6691 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6692 | exception is raised if either operand is a NaN. The comparison is performed
6693 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6694 *----------------------------------------------------------------------------*/
6696 int float128_le(float128 a
, float128 b
, float_status
*status
)
6700 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6701 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6702 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6703 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6705 float_raise(float_flag_invalid
, status
);
6708 aSign
= extractFloat128Sign( a
);
6709 bSign
= extractFloat128Sign( b
);
6710 if ( aSign
!= bSign
) {
6713 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6717 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6718 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6722 /*----------------------------------------------------------------------------
6723 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6724 | the corresponding value `b', and 0 otherwise. The invalid exception is
6725 | raised if either operand is a NaN. The comparison is performed according
6726 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6727 *----------------------------------------------------------------------------*/
6729 int float128_lt(float128 a
, float128 b
, float_status
*status
)
6733 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6734 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6735 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6736 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6738 float_raise(float_flag_invalid
, status
);
6741 aSign
= extractFloat128Sign( a
);
6742 bSign
= extractFloat128Sign( b
);
6743 if ( aSign
!= bSign
) {
6746 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6750 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6751 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6755 /*----------------------------------------------------------------------------
6756 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6757 | be compared, and 0 otherwise. The invalid exception is raised if either
6758 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6759 | Standard for Binary Floating-Point Arithmetic.
6760 *----------------------------------------------------------------------------*/
6762 int float128_unordered(float128 a
, float128 b
, float_status
*status
)
6764 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6765 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6766 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6767 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6769 float_raise(float_flag_invalid
, status
);
6775 /*----------------------------------------------------------------------------
6776 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6777 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6778 | exception. The comparison is performed according to the IEC/IEEE Standard
6779 | for Binary Floating-Point Arithmetic.
6780 *----------------------------------------------------------------------------*/
6782 int float128_eq_quiet(float128 a
, float128 b
, float_status
*status
)
6785 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6786 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6787 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6788 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6790 if (float128_is_signaling_nan(a
, status
)
6791 || float128_is_signaling_nan(b
, status
)) {
6792 float_raise(float_flag_invalid
, status
);
6798 && ( ( a
.high
== b
.high
)
6800 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6805 /*----------------------------------------------------------------------------
6806 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6807 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6808 | cause an exception. Otherwise, the comparison is performed according to the
6809 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6810 *----------------------------------------------------------------------------*/
6812 int float128_le_quiet(float128 a
, float128 b
, float_status
*status
)
6816 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6817 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6818 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6819 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6821 if (float128_is_signaling_nan(a
, status
)
6822 || float128_is_signaling_nan(b
, status
)) {
6823 float_raise(float_flag_invalid
, status
);
6827 aSign
= extractFloat128Sign( a
);
6828 bSign
= extractFloat128Sign( b
);
6829 if ( aSign
!= bSign
) {
6832 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6836 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6837 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6841 /*----------------------------------------------------------------------------
6842 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6843 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6844 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6845 | Standard for Binary Floating-Point Arithmetic.
6846 *----------------------------------------------------------------------------*/
6848 int float128_lt_quiet(float128 a
, float128 b
, float_status
*status
)
6852 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6853 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6854 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6855 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6857 if (float128_is_signaling_nan(a
, status
)
6858 || float128_is_signaling_nan(b
, status
)) {
6859 float_raise(float_flag_invalid
, status
);
6863 aSign
= extractFloat128Sign( a
);
6864 bSign
= extractFloat128Sign( b
);
6865 if ( aSign
!= bSign
) {
6868 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6872 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6873 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6877 /*----------------------------------------------------------------------------
6878 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6879 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6880 | comparison is performed according to the IEC/IEEE Standard for Binary
6881 | Floating-Point Arithmetic.
6882 *----------------------------------------------------------------------------*/
6884 int float128_unordered_quiet(float128 a
, float128 b
, float_status
*status
)
6886 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6887 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6888 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6889 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6891 if (float128_is_signaling_nan(a
, status
)
6892 || float128_is_signaling_nan(b
, status
)) {
6893 float_raise(float_flag_invalid
, status
);
6900 static inline int floatx80_compare_internal(floatx80 a
, floatx80 b
,
6901 int is_quiet
, float_status
*status
)
6905 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6906 float_raise(float_flag_invalid
, status
);
6907 return float_relation_unordered
;
6909 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
6910 ( extractFloatx80Frac( a
)<<1 ) ) ||
6911 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
6912 ( extractFloatx80Frac( b
)<<1 ) )) {
6914 floatx80_is_signaling_nan(a
, status
) ||
6915 floatx80_is_signaling_nan(b
, status
)) {
6916 float_raise(float_flag_invalid
, status
);
6918 return float_relation_unordered
;
6920 aSign
= extractFloatx80Sign( a
);
6921 bSign
= extractFloatx80Sign( b
);
6922 if ( aSign
!= bSign
) {
6924 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
6925 ( ( a
.low
| b
.low
) == 0 ) ) {
6927 return float_relation_equal
;
6929 return 1 - (2 * aSign
);
6932 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6933 return float_relation_equal
;
6935 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6940 int floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
6942 return floatx80_compare_internal(a
, b
, 0, status
);
6945 int floatx80_compare_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6947 return floatx80_compare_internal(a
, b
, 1, status
);
6950 static inline int float128_compare_internal(float128 a
, float128 b
,
6951 int is_quiet
, float_status
*status
)
6955 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
6956 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
6957 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
6958 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
6960 float128_is_signaling_nan(a
, status
) ||
6961 float128_is_signaling_nan(b
, status
)) {
6962 float_raise(float_flag_invalid
, status
);
6964 return float_relation_unordered
;
6966 aSign
= extractFloat128Sign( a
);
6967 bSign
= extractFloat128Sign( b
);
6968 if ( aSign
!= bSign
) {
6969 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
6971 return float_relation_equal
;
6973 return 1 - (2 * aSign
);
6976 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6977 return float_relation_equal
;
6979 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6984 int float128_compare(float128 a
, float128 b
, float_status
*status
)
6986 return float128_compare_internal(a
, b
, 0, status
);
6989 int float128_compare_quiet(float128 a
, float128 b
, float_status
*status
)
6991 return float128_compare_internal(a
, b
, 1, status
);
6994 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7000 if (floatx80_invalid_encoding(a
)) {
7001 float_raise(float_flag_invalid
, status
);
7002 return floatx80_default_nan(status
);
7004 aSig
= extractFloatx80Frac( a
);
7005 aExp
= extractFloatx80Exp( a
);
7006 aSign
= extractFloatx80Sign( a
);
7008 if ( aExp
== 0x7FFF ) {
7010 return propagateFloatx80NaN(a
, a
, status
);
7024 } else if (n
< -0x10000) {
7029 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7030 aSign
, aExp
, aSig
, 0, status
);
7033 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7037 uint64_t aSig0
, aSig1
;
7039 aSig1
= extractFloat128Frac1( a
);
7040 aSig0
= extractFloat128Frac0( a
);
7041 aExp
= extractFloat128Exp( a
);
7042 aSign
= extractFloat128Sign( a
);
7043 if ( aExp
== 0x7FFF ) {
7044 if ( aSig0
| aSig1
) {
7045 return propagateFloat128NaN(a
, a
, status
);
7050 aSig0
|= LIT64( 0x0001000000000000 );
7051 } else if (aSig0
== 0 && aSig1
== 0) {
7059 } else if (n
< -0x10000) {
7064 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1