4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a
)
114 return float16_val(a
) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int extractFloat16Exp(float16 a
)
123 return (float16_val(a
) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag
extractFloat16Sign(float16 a
)
132 return float16_val(a
)>>15;
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
139 static inline uint32_t extractFloat32Frac(float32 a
)
141 return float32_val(a
) & 0x007FFFFF;
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
148 static inline int extractFloat32Exp(float32 a
)
150 return (float32_val(a
) >> 23) & 0xFF;
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
157 static inline flag
extractFloat32Sign(float32 a
)
159 return float32_val(a
) >> 31;
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
166 static inline uint64_t extractFloat64Frac(float64 a
)
168 return float64_val(a
) & LIT64(0x000FFFFFFFFFFFFF);
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 static inline int extractFloat64Exp(float64 a
)
177 return (float64_val(a
) >> 52) & 0x7FF;
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
184 static inline flag
extractFloat64Sign(float64 a
)
186 return float64_val(a
) >> 63;
190 * Classify a floating point number. Everything above float_class_qnan
191 * is a NaN so cls >= float_class_qnan is any NaN.
194 typedef enum __attribute__ ((__packed__
)) {
195 float_class_unclassified
,
199 float_class_qnan
, /* all NaNs from here */
202 float_class_msnan
, /* maybe silenced */
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
223 #define DECOMPOSED_BINARY_POINT (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
227 /* Structure holding all of the relevant parameters for a format.
228 * exp_size: the size of the exponent field
229 * exp_bias: the offset applied to the exponent field
230 * exp_max: the maximum normalised exponent
231 * frac_size: the size of the fraction field
232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233 * The following are computed based the size of fraction
234 * frac_lsb: least significant bit of fraction
235 * fram_lsbm1: the bit bellow the least significant bit (for rounding)
236 * round_mask/roundeven_mask: masks used for rounding
247 uint64_t roundeven_mask
;
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F) \
253 .exp_bias = ((1 << E) - 1) >> 1, \
254 .exp_max = (1 << E) - 1, \
256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
262 static const FloatFmt float16_params
= {
266 static const FloatFmt float32_params
= {
270 static const FloatFmt float64_params
= {
274 /* Unpack a float to parts, but do not canonicalize. */
275 static inline FloatParts
unpack_raw(FloatFmt fmt
, uint64_t raw
)
277 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
279 return (FloatParts
) {
280 .cls
= float_class_unclassified
,
281 .sign
= extract64(raw
, sign_pos
, 1),
282 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
283 .frac
= extract64(raw
, 0, fmt
.frac_size
),
287 static inline FloatParts
float16_unpack_raw(float16 f
)
289 return unpack_raw(float16_params
, f
);
292 static inline FloatParts
float32_unpack_raw(float32 f
)
294 return unpack_raw(float32_params
, f
);
297 static inline FloatParts
float64_unpack_raw(float64 f
)
299 return unpack_raw(float64_params
, f
);
302 /* Pack a float from parts, but do not canonicalize. */
303 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts p
)
305 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
306 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
307 return deposit64(ret
, sign_pos
, 1, p
.sign
);
310 static inline float16
float16_pack_raw(FloatParts p
)
312 return make_float16(pack_raw(float16_params
, p
));
315 static inline float32
float32_pack_raw(FloatParts p
)
317 return make_float32(pack_raw(float32_params
, p
));
320 static inline float64
float64_pack_raw(FloatParts p
)
322 return make_float64(pack_raw(float64_params
, p
));
325 /* Canonicalize EXP and FRAC, setting CLS. */
326 static FloatParts
canonicalize(FloatParts part
, const FloatFmt
*parm
,
327 float_status
*status
)
329 if (part
.exp
== parm
->exp_max
) {
330 if (part
.frac
== 0) {
331 part
.cls
= float_class_inf
;
333 #ifdef NO_SIGNALING_NANS
334 part
.cls
= float_class_qnan
;
336 int64_t msb
= part
.frac
<< (parm
->frac_shift
+ 2);
337 if ((msb
< 0) == status
->snan_bit_is_one
) {
338 part
.cls
= float_class_snan
;
340 part
.cls
= float_class_qnan
;
344 } else if (part
.exp
== 0) {
345 if (likely(part
.frac
== 0)) {
346 part
.cls
= float_class_zero
;
347 } else if (status
->flush_inputs_to_zero
) {
348 float_raise(float_flag_input_denormal
, status
);
349 part
.cls
= float_class_zero
;
352 int shift
= clz64(part
.frac
) - 1;
353 part
.cls
= float_class_normal
;
354 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
358 part
.cls
= float_class_normal
;
359 part
.exp
-= parm
->exp_bias
;
360 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
365 /* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
371 static FloatParts
round_canonical(FloatParts p
, float_status
*s
,
372 const FloatFmt
*parm
)
374 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
375 const uint64_t round_mask
= parm
->round_mask
;
376 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
377 const int exp_max
= parm
->exp_max
;
378 const int frac_shift
= parm
->frac_shift
;
387 case float_class_normal
:
388 switch (s
->float_rounding_mode
) {
389 case float_round_nearest_even
:
390 overflow_norm
= false;
391 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
393 case float_round_ties_away
:
394 overflow_norm
= false;
397 case float_round_to_zero
:
398 overflow_norm
= true;
402 inc
= p
.sign
? 0 : round_mask
;
403 overflow_norm
= p
.sign
;
405 case float_round_down
:
406 inc
= p
.sign
? round_mask
: 0;
407 overflow_norm
= !p
.sign
;
410 g_assert_not_reached();
413 exp
+= parm
->exp_bias
;
414 if (likely(exp
> 0)) {
415 if (frac
& round_mask
) {
416 flags
|= float_flag_inexact
;
418 if (frac
& DECOMPOSED_OVERFLOW_BIT
) {
425 if (unlikely(exp
>= exp_max
)) {
426 flags
|= float_flag_overflow
| float_flag_inexact
;
431 p
.cls
= float_class_inf
;
435 } else if (s
->flush_to_zero
) {
436 flags
|= float_flag_output_denormal
;
437 p
.cls
= float_class_zero
;
440 bool is_tiny
= (s
->float_detect_tininess
441 == float_tininess_before_rounding
)
443 || !((frac
+ inc
) & DECOMPOSED_OVERFLOW_BIT
);
445 shift64RightJamming(frac
, 1 - exp
, &frac
);
446 if (frac
& round_mask
) {
447 /* Need to recompute round-to-even. */
448 if (s
->float_rounding_mode
== float_round_nearest_even
) {
449 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
452 flags
|= float_flag_inexact
;
456 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
459 if (is_tiny
&& (flags
& float_flag_inexact
)) {
460 flags
|= float_flag_underflow
;
462 if (exp
== 0 && frac
== 0) {
463 p
.cls
= float_class_zero
;
468 case float_class_zero
:
474 case float_class_inf
:
480 case float_class_qnan
:
481 case float_class_snan
:
486 g_assert_not_reached();
489 float_raise(flags
, s
);
495 static FloatParts
float16_unpack_canonical(float16 f
, float_status
*s
)
497 return canonicalize(float16_unpack_raw(f
), &float16_params
, s
);
500 static float16
float16_round_pack_canonical(FloatParts p
, float_status
*s
)
503 case float_class_dnan
:
504 return float16_default_nan(s
);
505 case float_class_msnan
:
506 return float16_maybe_silence_nan(float16_pack_raw(p
), s
);
508 p
= round_canonical(p
, s
, &float16_params
);
509 return float16_pack_raw(p
);
513 static FloatParts
float32_unpack_canonical(float32 f
, float_status
*s
)
515 return canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
518 static float32
float32_round_pack_canonical(FloatParts p
, float_status
*s
)
521 case float_class_dnan
:
522 return float32_default_nan(s
);
523 case float_class_msnan
:
524 return float32_maybe_silence_nan(float32_pack_raw(p
), s
);
526 p
= round_canonical(p
, s
, &float32_params
);
527 return float32_pack_raw(p
);
531 static FloatParts
float64_unpack_canonical(float64 f
, float_status
*s
)
533 return canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
536 static float64
float64_round_pack_canonical(FloatParts p
, float_status
*s
)
539 case float_class_dnan
:
540 return float64_default_nan(s
);
541 case float_class_msnan
:
542 return float64_maybe_silence_nan(float64_pack_raw(p
), s
);
544 p
= round_canonical(p
, s
, &float64_params
);
545 return float64_pack_raw(p
);
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c
)
552 return unlikely(c
>= float_class_qnan
);
554 static bool is_snan(FloatClass c
)
556 return c
== float_class_snan
;
558 static bool is_qnan(FloatClass c
)
560 return c
== float_class_qnan
;
563 static FloatParts
return_nan(FloatParts a
, float_status
*s
)
566 case float_class_snan
:
567 s
->float_exception_flags
|= float_flag_invalid
;
568 a
.cls
= float_class_msnan
;
570 case float_class_qnan
:
571 if (s
->default_nan_mode
) {
572 a
.cls
= float_class_dnan
;
577 g_assert_not_reached();
582 static FloatParts
pick_nan(FloatParts a
, FloatParts b
, float_status
*s
)
584 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
585 s
->float_exception_flags
|= float_flag_invalid
;
588 if (s
->default_nan_mode
) {
589 a
.cls
= float_class_dnan
;
591 if (pickNaN(is_qnan(a
.cls
), is_snan(a
.cls
),
592 is_qnan(b
.cls
), is_snan(b
.cls
),
594 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
))) {
597 a
.cls
= float_class_msnan
;
602 static FloatParts
pick_nan_muladd(FloatParts a
, FloatParts b
, FloatParts c
,
603 bool inf_zero
, float_status
*s
)
605 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
606 s
->float_exception_flags
|= float_flag_invalid
;
609 if (s
->default_nan_mode
) {
610 a
.cls
= float_class_dnan
;
612 switch (pickNaNMulAdd(is_qnan(a
.cls
), is_snan(a
.cls
),
613 is_qnan(b
.cls
), is_snan(b
.cls
),
614 is_qnan(c
.cls
), is_snan(c
.cls
),
625 a
.cls
= float_class_dnan
;
628 g_assert_not_reached();
631 a
.cls
= float_class_msnan
;
637 * Returns the result of adding or subtracting the values of the
638 * floating-point values `a' and `b'. The operation is performed
639 * according to the IEC/IEEE Standard for Binary Floating-Point
643 static FloatParts
addsub_floats(FloatParts a
, FloatParts b
, bool subtract
,
646 bool a_sign
= a
.sign
;
647 bool b_sign
= b
.sign
^ subtract
;
649 if (a_sign
!= b_sign
) {
652 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
653 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
654 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
655 a
.frac
= a
.frac
- b
.frac
;
657 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
658 a
.frac
= b
.frac
- a
.frac
;
664 a
.cls
= float_class_zero
;
665 a
.sign
= s
->float_rounding_mode
== float_round_down
;
667 int shift
= clz64(a
.frac
) - 1;
668 a
.frac
= a
.frac
<< shift
;
669 a
.exp
= a
.exp
- shift
;
674 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
675 return pick_nan(a
, b
, s
);
677 if (a
.cls
== float_class_inf
) {
678 if (b
.cls
== float_class_inf
) {
679 float_raise(float_flag_invalid
, s
);
680 a
.cls
= float_class_dnan
;
684 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
685 a
.sign
= s
->float_rounding_mode
== float_round_down
;
688 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
692 if (b
.cls
== float_class_zero
) {
697 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
699 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
700 } else if (a
.exp
< b
.exp
) {
701 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
705 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
711 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
712 return pick_nan(a
, b
, s
);
714 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
717 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
722 g_assert_not_reached();
726 * Returns the result of adding or subtracting the floating-point
727 * values `a' and `b'. The operation is performed according to the
728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
731 float16
__attribute__((flatten
)) float16_add(float16 a
, float16 b
,
732 float_status
*status
)
734 FloatParts pa
= float16_unpack_canonical(a
, status
);
735 FloatParts pb
= float16_unpack_canonical(b
, status
);
736 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
738 return float16_round_pack_canonical(pr
, status
);
741 float32
__attribute__((flatten
)) float32_add(float32 a
, float32 b
,
742 float_status
*status
)
744 FloatParts pa
= float32_unpack_canonical(a
, status
);
745 FloatParts pb
= float32_unpack_canonical(b
, status
);
746 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
748 return float32_round_pack_canonical(pr
, status
);
751 float64
__attribute__((flatten
)) float64_add(float64 a
, float64 b
,
752 float_status
*status
)
754 FloatParts pa
= float64_unpack_canonical(a
, status
);
755 FloatParts pb
= float64_unpack_canonical(b
, status
);
756 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
758 return float64_round_pack_canonical(pr
, status
);
761 float16
__attribute__((flatten
)) float16_sub(float16 a
, float16 b
,
762 float_status
*status
)
764 FloatParts pa
= float16_unpack_canonical(a
, status
);
765 FloatParts pb
= float16_unpack_canonical(b
, status
);
766 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
768 return float16_round_pack_canonical(pr
, status
);
771 float32
__attribute__((flatten
)) float32_sub(float32 a
, float32 b
,
772 float_status
*status
)
774 FloatParts pa
= float32_unpack_canonical(a
, status
);
775 FloatParts pb
= float32_unpack_canonical(b
, status
);
776 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
778 return float32_round_pack_canonical(pr
, status
);
781 float64
__attribute__((flatten
)) float64_sub(float64 a
, float64 b
,
782 float_status
*status
)
784 FloatParts pa
= float64_unpack_canonical(a
, status
);
785 FloatParts pb
= float64_unpack_canonical(b
, status
);
786 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
788 return float64_round_pack_canonical(pr
, status
);
792 * Returns the result of multiplying the floating-point values `a' and
793 * `b'. The operation is performed according to the IEC/IEEE Standard
794 * for Binary Floating-Point Arithmetic.
797 static FloatParts
mul_floats(FloatParts a
, FloatParts b
, float_status
*s
)
799 bool sign
= a
.sign
^ b
.sign
;
801 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
803 int exp
= a
.exp
+ b
.exp
;
805 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
806 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
807 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
808 shift64RightJamming(lo
, 1, &lo
);
818 /* handle all the NaN cases */
819 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
820 return pick_nan(a
, b
, s
);
822 /* Inf * Zero == NaN */
823 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
824 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
825 s
->float_exception_flags
|= float_flag_invalid
;
826 a
.cls
= float_class_dnan
;
830 /* Multiply by 0 or Inf */
831 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
835 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
839 g_assert_not_reached();
842 float16
__attribute__((flatten
)) float16_mul(float16 a
, float16 b
,
843 float_status
*status
)
845 FloatParts pa
= float16_unpack_canonical(a
, status
);
846 FloatParts pb
= float16_unpack_canonical(b
, status
);
847 FloatParts pr
= mul_floats(pa
, pb
, status
);
849 return float16_round_pack_canonical(pr
, status
);
852 float32
__attribute__((flatten
)) float32_mul(float32 a
, float32 b
,
853 float_status
*status
)
855 FloatParts pa
= float32_unpack_canonical(a
, status
);
856 FloatParts pb
= float32_unpack_canonical(b
, status
);
857 FloatParts pr
= mul_floats(pa
, pb
, status
);
859 return float32_round_pack_canonical(pr
, status
);
862 float64
__attribute__((flatten
)) float64_mul(float64 a
, float64 b
,
863 float_status
*status
)
865 FloatParts pa
= float64_unpack_canonical(a
, status
);
866 FloatParts pb
= float64_unpack_canonical(b
, status
);
867 FloatParts pr
= mul_floats(pa
, pb
, status
);
869 return float64_round_pack_canonical(pr
, status
);
873 * Returns the result of multiplying the floating-point values `a' and
874 * `b' then adding 'c', with no intermediate rounding step after the
875 * multiplication. The operation is performed according to the
876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877 * The flags argument allows the caller to select negation of the
878 * addend, the intermediate product, or the final result. (The
879 * difference between this and having the caller do a separate
880 * negation is that negating externally will flip the sign bit on
884 static FloatParts
muladd_floats(FloatParts a
, FloatParts b
, FloatParts c
,
885 int flags
, float_status
*s
)
887 bool inf_zero
= ((1 << a
.cls
) | (1 << b
.cls
)) ==
888 ((1 << float_class_inf
) | (1 << float_class_zero
));
890 bool sign_flip
= flags
& float_muladd_negate_result
;
895 /* It is implementation-defined whether the cases of (0,inf,qnan)
896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897 * they return if they do), so we have to hand this information
898 * off to the target-specific pick-a-NaN routine.
900 if (is_nan(a
.cls
) || is_nan(b
.cls
) || is_nan(c
.cls
)) {
901 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
905 s
->float_exception_flags
|= float_flag_invalid
;
906 a
.cls
= float_class_dnan
;
910 if (flags
& float_muladd_negate_c
) {
914 p_sign
= a
.sign
^ b
.sign
;
916 if (flags
& float_muladd_negate_product
) {
920 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_inf
) {
921 p_class
= float_class_inf
;
922 } else if (a
.cls
== float_class_zero
|| b
.cls
== float_class_zero
) {
923 p_class
= float_class_zero
;
925 p_class
= float_class_normal
;
928 if (c
.cls
== float_class_inf
) {
929 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
930 s
->float_exception_flags
|= float_flag_invalid
;
931 a
.cls
= float_class_dnan
;
933 a
.cls
= float_class_inf
;
934 a
.sign
= c
.sign
^ sign_flip
;
939 if (p_class
== float_class_inf
) {
940 a
.cls
= float_class_inf
;
941 a
.sign
= p_sign
^ sign_flip
;
945 if (p_class
== float_class_zero
) {
946 if (c
.cls
== float_class_zero
) {
947 if (p_sign
!= c
.sign
) {
948 p_sign
= s
->float_rounding_mode
== float_round_down
;
951 } else if (flags
& float_muladd_halve_result
) {
958 /* a & b should be normals now... */
959 assert(a
.cls
== float_class_normal
&&
960 b
.cls
== float_class_normal
);
962 p_exp
= a
.exp
+ b
.exp
;
964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
967 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
968 /* binary point now at bit 124 */
970 /* check for overflow */
971 if (hi
& (1ULL << (DECOMPOSED_BINARY_POINT
* 2 + 1 - 64))) {
972 shift128RightJamming(hi
, lo
, 1, &hi
, &lo
);
977 if (c
.cls
== float_class_zero
) {
978 /* move binary point back to 62 */
979 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
981 int exp_diff
= p_exp
- c
.exp
;
982 if (p_sign
== c
.sign
) {
985 shift128RightJamming(hi
, lo
,
986 DECOMPOSED_BINARY_POINT
- exp_diff
,
992 /* shift c to the same binary point as the product (124) */
995 shift128RightJamming(c_hi
, c_lo
,
998 add128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
999 /* move binary point back to 62 */
1000 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1003 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1004 shift64RightJamming(lo
, 1, &lo
);
1010 uint64_t c_hi
, c_lo
;
1011 /* make C binary point match product at bit 124 */
1015 if (exp_diff
<= 0) {
1016 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1019 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1020 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1022 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1027 shift128RightJamming(c_hi
, c_lo
,
1030 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1033 if (hi
== 0 && lo
== 0) {
1034 a
.cls
= float_class_zero
;
1035 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1036 a
.sign
^= sign_flip
;
1043 shift
= clz64(lo
) + 64;
1045 /* Normalizing to a binary point of 124 is the
1046 correct adjust for the exponent. However since we're
1047 shifting, we might as well put the binary point back
1048 at 62 where we really want it. Therefore shift as
1049 if we're leaving 1 bit at the top of the word, but
1050 adjust the exponent as if we're leaving 3 bits. */
1053 lo
= lo
<< (shift
- 64);
1055 hi
= (hi
<< shift
) | (lo
>> (64 - shift
));
1056 lo
= hi
| ((lo
<< shift
) != 0);
1063 if (flags
& float_muladd_halve_result
) {
1067 /* finally prepare our result */
1068 a
.cls
= float_class_normal
;
1069 a
.sign
= p_sign
^ sign_flip
;
1076 float16
__attribute__((flatten
)) float16_muladd(float16 a
, float16 b
, float16 c
,
1077 int flags
, float_status
*status
)
1079 FloatParts pa
= float16_unpack_canonical(a
, status
);
1080 FloatParts pb
= float16_unpack_canonical(b
, status
);
1081 FloatParts pc
= float16_unpack_canonical(c
, status
);
1082 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1084 return float16_round_pack_canonical(pr
, status
);
1087 float32
__attribute__((flatten
)) float32_muladd(float32 a
, float32 b
, float32 c
,
1088 int flags
, float_status
*status
)
1090 FloatParts pa
= float32_unpack_canonical(a
, status
);
1091 FloatParts pb
= float32_unpack_canonical(b
, status
);
1092 FloatParts pc
= float32_unpack_canonical(c
, status
);
1093 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1095 return float32_round_pack_canonical(pr
, status
);
1098 float64
__attribute__((flatten
)) float64_muladd(float64 a
, float64 b
, float64 c
,
1099 int flags
, float_status
*status
)
1101 FloatParts pa
= float64_unpack_canonical(a
, status
);
1102 FloatParts pb
= float64_unpack_canonical(b
, status
);
1103 FloatParts pc
= float64_unpack_canonical(c
, status
);
1104 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1106 return float64_round_pack_canonical(pr
, status
);
1110 * Returns the result of dividing the floating-point value `a' by the
1111 * corresponding value `b'. The operation is performed according to
1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 static FloatParts
div_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1117 bool sign
= a
.sign
^ b
.sign
;
1119 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1120 uint64_t temp_lo
, temp_hi
;
1121 int exp
= a
.exp
- b
.exp
;
1122 if (a
.frac
< b
.frac
) {
1124 shortShift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1,
1125 &temp_hi
, &temp_lo
);
1127 shortShift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
,
1128 &temp_hi
, &temp_lo
);
1130 /* LSB of quot is set if inexact which roundandpack will use
1131 * to set flags. Yet again we re-use a for the result */
1132 a
.frac
= div128To64(temp_lo
, temp_hi
, b
.frac
);
1137 /* handle all the NaN cases */
1138 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1139 return pick_nan(a
, b
, s
);
1141 /* 0/0 or Inf/Inf */
1144 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1145 s
->float_exception_flags
|= float_flag_invalid
;
1146 a
.cls
= float_class_dnan
;
1150 if (b
.cls
== float_class_zero
) {
1151 s
->float_exception_flags
|= float_flag_divbyzero
;
1152 a
.cls
= float_class_inf
;
1156 /* Inf / x or 0 / x */
1157 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1162 if (b
.cls
== float_class_inf
) {
1163 a
.cls
= float_class_zero
;
1167 g_assert_not_reached();
1170 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1172 FloatParts pa
= float16_unpack_canonical(a
, status
);
1173 FloatParts pb
= float16_unpack_canonical(b
, status
);
1174 FloatParts pr
= div_floats(pa
, pb
, status
);
1176 return float16_round_pack_canonical(pr
, status
);
1179 float32
float32_div(float32 a
, float32 b
, float_status
*status
)
1181 FloatParts pa
= float32_unpack_canonical(a
, status
);
1182 FloatParts pb
= float32_unpack_canonical(b
, status
);
1183 FloatParts pr
= div_floats(pa
, pb
, status
);
1185 return float32_round_pack_canonical(pr
, status
);
1188 float64
float64_div(float64 a
, float64 b
, float_status
*status
)
1190 FloatParts pa
= float64_unpack_canonical(a
, status
);
1191 FloatParts pb
= float64_unpack_canonical(b
, status
);
1192 FloatParts pr
= div_floats(pa
, pb
, status
);
1194 return float64_round_pack_canonical(pr
, status
);
1198 * Rounds the floating-point value `a' to an integer, and returns the
1199 * result as a floating-point value. The operation is performed
1200 * according to the IEC/IEEE Standard for Binary Floating-Point
1204 static FloatParts
round_to_int(FloatParts a
, int rounding_mode
, float_status
*s
)
1206 if (is_nan(a
.cls
)) {
1207 return return_nan(a
, s
);
1211 case float_class_zero
:
1212 case float_class_inf
:
1213 case float_class_qnan
:
1214 /* already "integral" */
1216 case float_class_normal
:
1217 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
1218 /* already integral */
1223 /* all fractional */
1224 s
->float_exception_flags
|= float_flag_inexact
;
1225 switch (rounding_mode
) {
1226 case float_round_nearest_even
:
1227 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
1229 case float_round_ties_away
:
1230 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
1232 case float_round_to_zero
:
1235 case float_round_up
:
1238 case float_round_down
:
1242 g_assert_not_reached();
1246 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
1249 a
.cls
= float_class_zero
;
1252 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
1253 uint64_t frac_lsbm1
= frac_lsb
>> 1;
1254 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
1255 uint64_t rnd_mask
= rnd_even_mask
>> 1;
1258 switch (rounding_mode
) {
1259 case float_round_nearest_even
:
1260 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
1262 case float_round_ties_away
:
1265 case float_round_to_zero
:
1268 case float_round_up
:
1269 inc
= a
.sign
? 0 : rnd_mask
;
1271 case float_round_down
:
1272 inc
= a
.sign
? rnd_mask
: 0;
1275 g_assert_not_reached();
1278 if (a
.frac
& rnd_mask
) {
1279 s
->float_exception_flags
|= float_flag_inexact
;
1281 a
.frac
&= ~rnd_mask
;
1282 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
1290 g_assert_not_reached();
1295 float16
float16_round_to_int(float16 a
, float_status
*s
)
1297 FloatParts pa
= float16_unpack_canonical(a
, s
);
1298 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, s
);
1299 return float16_round_pack_canonical(pr
, s
);
1302 float32
float32_round_to_int(float32 a
, float_status
*s
)
1304 FloatParts pa
= float32_unpack_canonical(a
, s
);
1305 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, s
);
1306 return float32_round_pack_canonical(pr
, s
);
1309 float64
float64_round_to_int(float64 a
, float_status
*s
)
1311 FloatParts pa
= float64_unpack_canonical(a
, s
);
1312 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, s
);
1313 return float64_round_pack_canonical(pr
, s
);
1316 float64
float64_trunc_to_int(float64 a
, float_status
*s
)
1318 FloatParts pa
= float64_unpack_canonical(a
, s
);
1319 FloatParts pr
= round_to_int(pa
, float_round_to_zero
, s
);
1320 return float64_round_pack_canonical(pr
, s
);
1324 * Returns the result of converting the floating-point value `a' to
1325 * the two's complement integer format. The conversion is performed
1326 * according to the IEC/IEEE Standard for Binary Floating-Point
1327 * Arithmetic---which means in particular that the conversion is
1328 * rounded according to the current rounding mode. If `a' is a NaN,
1329 * the largest positive integer is returned. Otherwise, if the
1330 * conversion overflows, the largest integer with the same sign as `a'
1334 static int64_t round_to_int_and_pack(FloatParts in
, int rmode
,
1335 int64_t min
, int64_t max
,
1339 int orig_flags
= get_float_exception_flags(s
);
1340 FloatParts p
= round_to_int(in
, rmode
, s
);
1343 case float_class_snan
:
1344 case float_class_qnan
:
1346 case float_class_inf
:
1347 return p
.sign
? min
: max
;
1348 case float_class_zero
:
1350 case float_class_normal
:
1351 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
1352 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
1353 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
1354 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
1359 if (r
< -(uint64_t) min
) {
1362 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1369 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1374 g_assert_not_reached();
1378 #define FLOAT_TO_INT(fsz, isz) \
1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1382 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1383 return round_to_int_and_pack(p, s->float_rounding_mode, \
1384 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1389 (float ## fsz a, float_status *s) \
1391 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1392 return round_to_int_and_pack(p, float_round_to_zero, \
1393 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1397 FLOAT_TO_INT(16, 16)
1398 FLOAT_TO_INT(16, 32)
1399 FLOAT_TO_INT(16, 64)
1401 FLOAT_TO_INT(32, 16)
1402 FLOAT_TO_INT(32, 32)
1403 FLOAT_TO_INT(32, 64)
1405 FLOAT_TO_INT(64, 16)
1406 FLOAT_TO_INT(64, 32)
1407 FLOAT_TO_INT(64, 64)
1412 * Returns the result of converting the floating-point value `a' to
1413 * the unsigned integer format. The conversion is performed according
1414 * to the IEC/IEEE Standard for Binary Floating-Point
1415 * Arithmetic---which means in particular that the conversion is
1416 * rounded according to the current rounding mode. If `a' is a NaN,
1417 * the largest unsigned integer is returned. Otherwise, if the
1418 * conversion overflows, the largest unsigned integer is returned. If
1419 * the 'a' is negative, the result is rounded and zero is returned;
1420 * values that do not round to zero will raise the inexact exception
1424 static uint64_t round_to_uint_and_pack(FloatParts in
, int rmode
, uint64_t max
,
1427 int orig_flags
= get_float_exception_flags(s
);
1428 FloatParts p
= round_to_int(in
, rmode
, s
);
1431 case float_class_snan
:
1432 case float_class_qnan
:
1433 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1435 case float_class_inf
:
1436 return p
.sign
? 0 : max
;
1437 case float_class_zero
:
1439 case float_class_normal
:
1443 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1447 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
1448 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
1449 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
1450 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
1452 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1456 /* For uint64 this will never trip, but if p.exp is too large
1457 * to shift a decomposed fraction we shall have exited via the
1461 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1468 g_assert_not_reached();
1472 #define FLOAT_TO_UINT(fsz, isz) \
1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1477 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1478 UINT ## isz ## _MAX, s); \
1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1482 (float ## fsz a, float_status *s) \
1484 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1485 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1486 UINT ## isz ## _MAX, s); \
1489 FLOAT_TO_UINT(16, 16)
1490 FLOAT_TO_UINT(16, 32)
1491 FLOAT_TO_UINT(16, 64)
1493 FLOAT_TO_UINT(32, 16)
1494 FLOAT_TO_UINT(32, 32)
1495 FLOAT_TO_UINT(32, 64)
1497 FLOAT_TO_UINT(64, 16)
1498 FLOAT_TO_UINT(64, 32)
1499 FLOAT_TO_UINT(64, 64)
1501 #undef FLOAT_TO_UINT
1504 * Integer to float conversions
1506 * Returns the result of converting the two's complement integer `a'
1507 * to the floating-point format. The conversion is performed according
1508 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1511 static FloatParts
int_to_float(int64_t a
, float_status
*status
)
1515 r
.cls
= float_class_zero
;
1517 } else if (a
== (1ULL << 63)) {
1518 r
.cls
= float_class_normal
;
1520 r
.frac
= DECOMPOSED_IMPLICIT_BIT
;
1531 int shift
= clz64(f
) - 1;
1532 r
.cls
= float_class_normal
;
1533 r
.exp
= (DECOMPOSED_BINARY_POINT
- shift
);
1534 r
.frac
= f
<< shift
;
1540 float16
int64_to_float16(int64_t a
, float_status
*status
)
1542 FloatParts pa
= int_to_float(a
, status
);
1543 return float16_round_pack_canonical(pa
, status
);
1546 float16
int32_to_float16(int32_t a
, float_status
*status
)
1548 return int64_to_float16(a
, status
);
1551 float16
int16_to_float16(int16_t a
, float_status
*status
)
1553 return int64_to_float16(a
, status
);
1556 float32
int64_to_float32(int64_t a
, float_status
*status
)
1558 FloatParts pa
= int_to_float(a
, status
);
1559 return float32_round_pack_canonical(pa
, status
);
1562 float32
int32_to_float32(int32_t a
, float_status
*status
)
1564 return int64_to_float32(a
, status
);
1567 float32
int16_to_float32(int16_t a
, float_status
*status
)
1569 return int64_to_float32(a
, status
);
1572 float64
int64_to_float64(int64_t a
, float_status
*status
)
1574 FloatParts pa
= int_to_float(a
, status
);
1575 return float64_round_pack_canonical(pa
, status
);
1578 float64
int32_to_float64(int32_t a
, float_status
*status
)
1580 return int64_to_float64(a
, status
);
1583 float64
int16_to_float64(int16_t a
, float_status
*status
)
1585 return int64_to_float64(a
, status
);
1590 * Unsigned Integer to float conversions
1592 * Returns the result of converting the unsigned integer `a' to the
1593 * floating-point format. The conversion is performed according to the
1594 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1597 static FloatParts
uint_to_float(uint64_t a
, float_status
*status
)
1599 FloatParts r
= { .sign
= false};
1602 r
.cls
= float_class_zero
;
1604 int spare_bits
= clz64(a
) - 1;
1605 r
.cls
= float_class_normal
;
1606 r
.exp
= DECOMPOSED_BINARY_POINT
- spare_bits
;
1607 if (spare_bits
< 0) {
1608 shift64RightJamming(a
, -spare_bits
, &a
);
1611 r
.frac
= a
<< spare_bits
;
1618 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
1620 FloatParts pa
= uint_to_float(a
, status
);
1621 return float16_round_pack_canonical(pa
, status
);
1624 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
1626 return uint64_to_float16(a
, status
);
1629 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
1631 return uint64_to_float16(a
, status
);
1634 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
1636 FloatParts pa
= uint_to_float(a
, status
);
1637 return float32_round_pack_canonical(pa
, status
);
1640 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
1642 return uint64_to_float32(a
, status
);
1645 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
1647 return uint64_to_float32(a
, status
);
1650 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
1652 FloatParts pa
= uint_to_float(a
, status
);
1653 return float64_round_pack_canonical(pa
, status
);
1656 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
1658 return uint64_to_float64(a
, status
);
1661 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
1663 return uint64_to_float64(a
, status
);
1667 /* min() and max() functions. These can't be implemented as
1668 * 'compare and pick one input' because that would mishandle
1669 * NaNs and +0 vs -0.
1671 * minnum() and maxnum() functions. These are similar to the min()
1672 * and max() functions but if one of the arguments is a QNaN and
1673 * the other is numerical then the numerical argument is returned.
1674 * SNaNs will get quietened before being returned.
1675 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1676 * and maxNum() operations. min() and max() are the typical min/max
1677 * semantics provided by many CPUs which predate that specification.
1679 * minnummag() and maxnummag() functions correspond to minNumMag()
1680 * and minNumMag() from the IEEE-754 2008.
1682 static FloatParts
minmax_floats(FloatParts a
, FloatParts b
, bool ismin
,
1683 bool ieee
, bool ismag
, float_status
*s
)
1685 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
1687 /* Takes two floating-point values `a' and `b', one of
1688 * which is a NaN, and returns the appropriate NaN
1689 * result. If either `a' or `b' is a signaling NaN,
1690 * the invalid exception is raised.
1692 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
1693 return pick_nan(a
, b
, s
);
1694 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
1696 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
1700 return pick_nan(a
, b
, s
);
1703 bool a_sign
, b_sign
;
1706 case float_class_normal
:
1709 case float_class_inf
:
1712 case float_class_zero
:
1716 g_assert_not_reached();
1720 case float_class_normal
:
1723 case float_class_inf
:
1726 case float_class_zero
:
1730 g_assert_not_reached();
1737 a_sign
= b_sign
= 0;
1740 if (a_sign
== b_sign
) {
1741 bool a_less
= a_exp
< b_exp
;
1742 if (a_exp
== b_exp
) {
1743 a_less
= a
.frac
< b
.frac
;
1745 return a_sign
^ a_less
^ ismin
? b
: a
;
1747 return a_sign
^ ismin
? b
: a
;
1752 #define MINMAX(sz, name, ismin, isiee, ismag) \
1753 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1756 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1757 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1758 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1760 return float ## sz ## _round_pack_canonical(pr, s); \
1763 MINMAX(16, min
, true, false, false)
1764 MINMAX(16, minnum
, true, true, false)
1765 MINMAX(16, minnummag
, true, true, true)
1766 MINMAX(16, max
, false, false, false)
1767 MINMAX(16, maxnum
, false, true, false)
1768 MINMAX(16, maxnummag
, false, true, true)
1770 MINMAX(32, min
, true, false, false)
1771 MINMAX(32, minnum
, true, true, false)
1772 MINMAX(32, minnummag
, true, true, true)
1773 MINMAX(32, max
, false, false, false)
1774 MINMAX(32, maxnum
, false, true, false)
1775 MINMAX(32, maxnummag
, false, true, true)
1777 MINMAX(64, min
, true, false, false)
1778 MINMAX(64, minnum
, true, true, false)
1779 MINMAX(64, minnummag
, true, true, true)
1780 MINMAX(64, max
, false, false, false)
1781 MINMAX(64, maxnum
, false, true, false)
1782 MINMAX(64, maxnummag
, false, true, true)
1786 /* Floating point compare */
1787 static int compare_floats(FloatParts a
, FloatParts b
, bool is_quiet
,
1790 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1792 a
.cls
== float_class_snan
||
1793 b
.cls
== float_class_snan
) {
1794 s
->float_exception_flags
|= float_flag_invalid
;
1796 return float_relation_unordered
;
1799 if (a
.cls
== float_class_zero
) {
1800 if (b
.cls
== float_class_zero
) {
1801 return float_relation_equal
;
1803 return b
.sign
? float_relation_greater
: float_relation_less
;
1804 } else if (b
.cls
== float_class_zero
) {
1805 return a
.sign
? float_relation_less
: float_relation_greater
;
1808 /* The only really important thing about infinity is its sign. If
1809 * both are infinities the sign marks the smallest of the two.
1811 if (a
.cls
== float_class_inf
) {
1812 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
1813 return float_relation_equal
;
1815 return a
.sign
? float_relation_less
: float_relation_greater
;
1816 } else if (b
.cls
== float_class_inf
) {
1817 return b
.sign
? float_relation_greater
: float_relation_less
;
1820 if (a
.sign
!= b
.sign
) {
1821 return a
.sign
? float_relation_less
: float_relation_greater
;
1824 if (a
.exp
== b
.exp
) {
1825 if (a
.frac
== b
.frac
) {
1826 return float_relation_equal
;
1829 return a
.frac
> b
.frac
?
1830 float_relation_less
: float_relation_greater
;
1832 return a
.frac
> b
.frac
?
1833 float_relation_greater
: float_relation_less
;
1837 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
1839 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
1844 #define COMPARE(sz) \
1845 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1848 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1849 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1850 return compare_floats(pa, pb, false, s); \
1852 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1855 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1856 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1857 return compare_floats(pa, pb, true, s); \
1866 /* Multiply A by 2 raised to the power N. */
1867 static FloatParts
scalbn_decomposed(FloatParts a
, int n
, float_status
*s
)
1869 if (unlikely(is_nan(a
.cls
))) {
1870 return return_nan(a
, s
);
1872 if (a
.cls
== float_class_normal
) {
1878 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
1880 FloatParts pa
= float16_unpack_canonical(a
, status
);
1881 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
1882 return float16_round_pack_canonical(pr
, status
);
1885 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
1887 FloatParts pa
= float32_unpack_canonical(a
, status
);
1888 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
1889 return float32_round_pack_canonical(pr
, status
);
1892 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
1894 FloatParts pa
= float64_unpack_canonical(a
, status
);
1895 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
1896 return float64_round_pack_canonical(pr
, status
);
1902 * The old softfloat code did an approximation step before zeroing in
1903 * on the final result. However for simpleness we just compute the
1904 * square root by iterating down from the implicit bit to enough extra
1905 * bits to ensure we get a correctly rounded result.
1907 * This does mean however the calculation is slower than before,
1908 * especially for 64 bit floats.
1911 static FloatParts
sqrt_float(FloatParts a
, float_status
*s
, const FloatFmt
*p
)
1913 uint64_t a_frac
, r_frac
, s_frac
;
1916 if (is_nan(a
.cls
)) {
1917 return return_nan(a
, s
);
1919 if (a
.cls
== float_class_zero
) {
1920 return a
; /* sqrt(+-0) = +-0 */
1923 s
->float_exception_flags
|= float_flag_invalid
;
1924 a
.cls
= float_class_dnan
;
1927 if (a
.cls
== float_class_inf
) {
1928 return a
; /* sqrt(+inf) = +inf */
1931 assert(a
.cls
== float_class_normal
);
1933 /* We need two overflow bits at the top. Adding room for that is a
1934 * right shift. If the exponent is odd, we can discard the low bit
1935 * by multiplying the fraction by 2; that's a left shift. Combine
1936 * those and we shift right if the exponent is even.
1944 /* Bit-by-bit computation of sqrt. */
1948 /* Iterate from implicit bit down to the 3 extra bits to compute a
1949 * properly rounded result. Remember we've inserted one more bit
1950 * at the top, so these positions are one less.
1952 bit
= DECOMPOSED_BINARY_POINT
- 1;
1953 last_bit
= MAX(p
->frac_shift
- 4, 0);
1955 uint64_t q
= 1ULL << bit
;
1956 uint64_t t_frac
= s_frac
+ q
;
1957 if (t_frac
<= a_frac
) {
1958 s_frac
= t_frac
+ q
;
1963 } while (--bit
>= last_bit
);
1965 /* Undo the right shift done above. If there is any remaining
1966 * fraction, the result is inexact. Set the sticky bit.
1968 a
.frac
= (r_frac
<< 1) + (a_frac
!= 0);
1973 float16
__attribute__((flatten
)) float16_sqrt(float16 a
, float_status
*status
)
1975 FloatParts pa
= float16_unpack_canonical(a
, status
);
1976 FloatParts pr
= sqrt_float(pa
, status
, &float16_params
);
1977 return float16_round_pack_canonical(pr
, status
);
1980 float32
__attribute__((flatten
)) float32_sqrt(float32 a
, float_status
*status
)
1982 FloatParts pa
= float32_unpack_canonical(a
, status
);
1983 FloatParts pr
= sqrt_float(pa
, status
, &float32_params
);
1984 return float32_round_pack_canonical(pr
, status
);
1987 float64
__attribute__((flatten
)) float64_sqrt(float64 a
, float_status
*status
)
1989 FloatParts pa
= float64_unpack_canonical(a
, status
);
1990 FloatParts pr
= sqrt_float(pa
, status
, &float64_params
);
1991 return float64_round_pack_canonical(pr
, status
);
1995 /*----------------------------------------------------------------------------
1996 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1997 | and 7, and returns the properly rounded 32-bit integer corresponding to the
1998 | input. If `zSign' is 1, the input is negated before being converted to an
1999 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2000 | is simply rounded to an integer, with the inexact exception raised if the
2001 | input cannot be represented exactly as an integer. However, if the fixed-
2002 | point input is too large, the invalid exception is raised and the largest
2003 | positive or negative integer is returned.
2004 *----------------------------------------------------------------------------*/
2006 static int32_t roundAndPackInt32(flag zSign
, uint64_t absZ
, float_status
*status
)
2008 int8_t roundingMode
;
2009 flag roundNearestEven
;
2010 int8_t roundIncrement
, roundBits
;
2013 roundingMode
= status
->float_rounding_mode
;
2014 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2015 switch (roundingMode
) {
2016 case float_round_nearest_even
:
2017 case float_round_ties_away
:
2018 roundIncrement
= 0x40;
2020 case float_round_to_zero
:
2023 case float_round_up
:
2024 roundIncrement
= zSign
? 0 : 0x7f;
2026 case float_round_down
:
2027 roundIncrement
= zSign
? 0x7f : 0;
2032 roundBits
= absZ
& 0x7F;
2033 absZ
= ( absZ
+ roundIncrement
)>>7;
2034 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
2036 if ( zSign
) z
= - z
;
2037 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
2038 float_raise(float_flag_invalid
, status
);
2039 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
2042 status
->float_exception_flags
|= float_flag_inexact
;
2048 /*----------------------------------------------------------------------------
2049 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2050 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2051 | and returns the properly rounded 64-bit integer corresponding to the input.
2052 | If `zSign' is 1, the input is negated before being converted to an integer.
2053 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2054 | the inexact exception raised if the input cannot be represented exactly as
2055 | an integer. However, if the fixed-point input is too large, the invalid
2056 | exception is raised and the largest positive or negative integer is
2058 *----------------------------------------------------------------------------*/
2060 static int64_t roundAndPackInt64(flag zSign
, uint64_t absZ0
, uint64_t absZ1
,
2061 float_status
*status
)
2063 int8_t roundingMode
;
2064 flag roundNearestEven
, increment
;
2067 roundingMode
= status
->float_rounding_mode
;
2068 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2069 switch (roundingMode
) {
2070 case float_round_nearest_even
:
2071 case float_round_ties_away
:
2072 increment
= ((int64_t) absZ1
< 0);
2074 case float_round_to_zero
:
2077 case float_round_up
:
2078 increment
= !zSign
&& absZ1
;
2080 case float_round_down
:
2081 increment
= zSign
&& absZ1
;
2088 if ( absZ0
== 0 ) goto overflow
;
2089 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
2092 if ( zSign
) z
= - z
;
2093 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
2095 float_raise(float_flag_invalid
, status
);
2097 zSign
? (int64_t) LIT64( 0x8000000000000000 )
2098 : LIT64( 0x7FFFFFFFFFFFFFFF );
2101 status
->float_exception_flags
|= float_flag_inexact
;
2107 /*----------------------------------------------------------------------------
2108 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2109 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2110 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2111 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2112 | with the inexact exception raised if the input cannot be represented exactly
2113 | as an integer. However, if the fixed-point input is too large, the invalid
2114 | exception is raised and the largest unsigned integer is returned.
2115 *----------------------------------------------------------------------------*/
2117 static int64_t roundAndPackUint64(flag zSign
, uint64_t absZ0
,
2118 uint64_t absZ1
, float_status
*status
)
2120 int8_t roundingMode
;
2121 flag roundNearestEven
, increment
;
2123 roundingMode
= status
->float_rounding_mode
;
2124 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
2125 switch (roundingMode
) {
2126 case float_round_nearest_even
:
2127 case float_round_ties_away
:
2128 increment
= ((int64_t)absZ1
< 0);
2130 case float_round_to_zero
:
2133 case float_round_up
:
2134 increment
= !zSign
&& absZ1
;
2136 case float_round_down
:
2137 increment
= zSign
&& absZ1
;
2145 float_raise(float_flag_invalid
, status
);
2146 return LIT64(0xFFFFFFFFFFFFFFFF);
2148 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
2151 if (zSign
&& absZ0
) {
2152 float_raise(float_flag_invalid
, status
);
2157 status
->float_exception_flags
|= float_flag_inexact
;
2162 /*----------------------------------------------------------------------------
2163 | If `a' is denormal and we are in flush-to-zero mode then set the
2164 | input-denormal exception and return zero. Otherwise just return the value.
2165 *----------------------------------------------------------------------------*/
2166 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
2168 if (status
->flush_inputs_to_zero
) {
2169 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
2170 float_raise(float_flag_input_denormal
, status
);
2171 return make_float32(float32_val(a
) & 0x80000000);
2177 /*----------------------------------------------------------------------------
2178 | Normalizes the subnormal single-precision floating-point value represented
2179 | by the denormalized significand `aSig'. The normalized exponent and
2180 | significand are stored at the locations pointed to by `zExpPtr' and
2181 | `zSigPtr', respectively.
2182 *----------------------------------------------------------------------------*/
2185 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
2189 shiftCount
= countLeadingZeros32( aSig
) - 8;
2190 *zSigPtr
= aSig
<<shiftCount
;
2191 *zExpPtr
= 1 - shiftCount
;
2195 /*----------------------------------------------------------------------------
2196 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2197 | and significand `zSig', and returns the proper single-precision floating-
2198 | point value corresponding to the abstract input. Ordinarily, the abstract
2199 | value is simply rounded and packed into the single-precision format, with
2200 | the inexact exception raised if the abstract input cannot be represented
2201 | exactly. However, if the abstract value is too large, the overflow and
2202 | inexact exceptions are raised and an infinity or maximal finite value is
2203 | returned. If the abstract value is too small, the input value is rounded to
2204 | a subnormal number, and the underflow and inexact exceptions are raised if
2205 | the abstract input cannot be represented exactly as a subnormal single-
2206 | precision floating-point number.
2207 | The input significand `zSig' has its binary point between bits 30
2208 | and 29, which is 7 bits to the left of the usual location. This shifted
2209 | significand must be normalized or smaller. If `zSig' is not normalized,
2210 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2211 | and it must not require rounding. In the usual case that `zSig' is
2212 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2213 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2214 | Binary Floating-Point Arithmetic.
2215 *----------------------------------------------------------------------------*/
2217 static float32
roundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
2218 float_status
*status
)
2220 int8_t roundingMode
;
2221 flag roundNearestEven
;
2222 int8_t roundIncrement
, roundBits
;
2225 roundingMode
= status
->float_rounding_mode
;
2226 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2227 switch (roundingMode
) {
2228 case float_round_nearest_even
:
2229 case float_round_ties_away
:
2230 roundIncrement
= 0x40;
2232 case float_round_to_zero
:
2235 case float_round_up
:
2236 roundIncrement
= zSign
? 0 : 0x7f;
2238 case float_round_down
:
2239 roundIncrement
= zSign
? 0x7f : 0;
2245 roundBits
= zSig
& 0x7F;
2246 if ( 0xFD <= (uint16_t) zExp
) {
2247 if ( ( 0xFD < zExp
)
2248 || ( ( zExp
== 0xFD )
2249 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
2251 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2252 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
2255 if (status
->flush_to_zero
) {
2256 float_raise(float_flag_output_denormal
, status
);
2257 return packFloat32(zSign
, 0, 0);
2260 (status
->float_detect_tininess
2261 == float_tininess_before_rounding
)
2263 || ( zSig
+ roundIncrement
< 0x80000000 );
2264 shift32RightJamming( zSig
, - zExp
, &zSig
);
2266 roundBits
= zSig
& 0x7F;
2267 if (isTiny
&& roundBits
) {
2268 float_raise(float_flag_underflow
, status
);
2273 status
->float_exception_flags
|= float_flag_inexact
;
2275 zSig
= ( zSig
+ roundIncrement
)>>7;
2276 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
2277 if ( zSig
== 0 ) zExp
= 0;
2278 return packFloat32( zSign
, zExp
, zSig
);
2282 /*----------------------------------------------------------------------------
2283 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2284 | and significand `zSig', and returns the proper single-precision floating-
2285 | point value corresponding to the abstract input. This routine is just like
2286 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2287 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2288 | floating-point exponent.
2289 *----------------------------------------------------------------------------*/
2292 normalizeRoundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
2293 float_status
*status
)
2297 shiftCount
= countLeadingZeros32( zSig
) - 1;
2298 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
2303 /*----------------------------------------------------------------------------
2304 | If `a' is denormal and we are in flush-to-zero mode then set the
2305 | input-denormal exception and return zero. Otherwise just return the value.
2306 *----------------------------------------------------------------------------*/
2307 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
2309 if (status
->flush_inputs_to_zero
) {
2310 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
2311 float_raise(float_flag_input_denormal
, status
);
2312 return make_float64(float64_val(a
) & (1ULL << 63));
2318 /*----------------------------------------------------------------------------
2319 | Normalizes the subnormal double-precision floating-point value represented
2320 | by the denormalized significand `aSig'. The normalized exponent and
2321 | significand are stored at the locations pointed to by `zExpPtr' and
2322 | `zSigPtr', respectively.
2323 *----------------------------------------------------------------------------*/
2326 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
2330 shiftCount
= countLeadingZeros64( aSig
) - 11;
2331 *zSigPtr
= aSig
<<shiftCount
;
2332 *zExpPtr
= 1 - shiftCount
;
2336 /*----------------------------------------------------------------------------
2337 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2338 | double-precision floating-point value, returning the result. After being
2339 | shifted into the proper positions, the three fields are simply added
2340 | together to form the result. This means that any integer portion of `zSig'
2341 | will be added into the exponent. Since a properly normalized significand
2342 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2343 | than the desired result exponent whenever `zSig' is a complete, normalized
2345 *----------------------------------------------------------------------------*/
2347 static inline float64
packFloat64(flag zSign
, int zExp
, uint64_t zSig
)
2350 return make_float64(
2351 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
2355 /*----------------------------------------------------------------------------
2356 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2357 | and significand `zSig', and returns the proper double-precision floating-
2358 | point value corresponding to the abstract input. Ordinarily, the abstract
2359 | value is simply rounded and packed into the double-precision format, with
2360 | the inexact exception raised if the abstract input cannot be represented
2361 | exactly. However, if the abstract value is too large, the overflow and
2362 | inexact exceptions are raised and an infinity or maximal finite value is
2363 | returned. If the abstract value is too small, the input value is rounded to
2364 | a subnormal number, and the underflow and inexact exceptions are raised if
2365 | the abstract input cannot be represented exactly as a subnormal double-
2366 | precision floating-point number.
2367 | The input significand `zSig' has its binary point between bits 62
2368 | and 61, which is 10 bits to the left of the usual location. This shifted
2369 | significand must be normalized or smaller. If `zSig' is not normalized,
2370 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2371 | and it must not require rounding. In the usual case that `zSig' is
2372 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2373 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2374 | Binary Floating-Point Arithmetic.
2375 *----------------------------------------------------------------------------*/
2377 static float64
roundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
2378 float_status
*status
)
2380 int8_t roundingMode
;
2381 flag roundNearestEven
;
2382 int roundIncrement
, roundBits
;
2385 roundingMode
= status
->float_rounding_mode
;
2386 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2387 switch (roundingMode
) {
2388 case float_round_nearest_even
:
2389 case float_round_ties_away
:
2390 roundIncrement
= 0x200;
2392 case float_round_to_zero
:
2395 case float_round_up
:
2396 roundIncrement
= zSign
? 0 : 0x3ff;
2398 case float_round_down
:
2399 roundIncrement
= zSign
? 0x3ff : 0;
2401 case float_round_to_odd
:
2402 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
2407 roundBits
= zSig
& 0x3FF;
2408 if ( 0x7FD <= (uint16_t) zExp
) {
2409 if ( ( 0x7FD < zExp
)
2410 || ( ( zExp
== 0x7FD )
2411 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
2413 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
2414 roundIncrement
!= 0;
2415 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2416 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
2419 if (status
->flush_to_zero
) {
2420 float_raise(float_flag_output_denormal
, status
);
2421 return packFloat64(zSign
, 0, 0);
2424 (status
->float_detect_tininess
2425 == float_tininess_before_rounding
)
2427 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
2428 shift64RightJamming( zSig
, - zExp
, &zSig
);
2430 roundBits
= zSig
& 0x3FF;
2431 if (isTiny
&& roundBits
) {
2432 float_raise(float_flag_underflow
, status
);
2434 if (roundingMode
== float_round_to_odd
) {
2436 * For round-to-odd case, the roundIncrement depends on
2437 * zSig which just changed.
2439 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
2444 status
->float_exception_flags
|= float_flag_inexact
;
2446 zSig
= ( zSig
+ roundIncrement
)>>10;
2447 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
2448 if ( zSig
== 0 ) zExp
= 0;
2449 return packFloat64( zSign
, zExp
, zSig
);
2453 /*----------------------------------------------------------------------------
2454 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2455 | and significand `zSig', and returns the proper double-precision floating-
2456 | point value corresponding to the abstract input. This routine is just like
2457 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2458 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2459 | floating-point exponent.
2460 *----------------------------------------------------------------------------*/
2463 normalizeRoundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
2464 float_status
*status
)
2468 shiftCount
= countLeadingZeros64( zSig
) - 1;
2469 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
2474 /*----------------------------------------------------------------------------
2475 | Normalizes the subnormal extended double-precision floating-point value
2476 | represented by the denormalized significand `aSig'. The normalized exponent
2477 | and significand are stored at the locations pointed to by `zExpPtr' and
2478 | `zSigPtr', respectively.
2479 *----------------------------------------------------------------------------*/
2481 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
2486 shiftCount
= countLeadingZeros64( aSig
);
2487 *zSigPtr
= aSig
<<shiftCount
;
2488 *zExpPtr
= 1 - shiftCount
;
2491 /*----------------------------------------------------------------------------
2492 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2493 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2494 | and returns the proper extended double-precision floating-point value
2495 | corresponding to the abstract input. Ordinarily, the abstract value is
2496 | rounded and packed into the extended double-precision format, with the
2497 | inexact exception raised if the abstract input cannot be represented
2498 | exactly. However, if the abstract value is too large, the overflow and
2499 | inexact exceptions are raised and an infinity or maximal finite value is
2500 | returned. If the abstract value is too small, the input value is rounded to
2501 | a subnormal number, and the underflow and inexact exceptions are raised if
2502 | the abstract input cannot be represented exactly as a subnormal extended
2503 | double-precision floating-point number.
2504 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2505 | number of bits as single or double precision, respectively. Otherwise, the
2506 | result is rounded to the full precision of the extended double-precision
2508 | The input significand must be normalized or smaller. If the input
2509 | significand is not normalized, `zExp' must be 0; in that case, the result
2510 | returned is a subnormal number, and it must not require rounding. The
2511 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2512 | Floating-Point Arithmetic.
2513 *----------------------------------------------------------------------------*/
2515 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, flag zSign
,
2516 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
2517 float_status
*status
)
2519 int8_t roundingMode
;
2520 flag roundNearestEven
, increment
, isTiny
;
2521 int64_t roundIncrement
, roundMask
, roundBits
;
2523 roundingMode
= status
->float_rounding_mode
;
2524 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2525 if ( roundingPrecision
== 80 ) goto precision80
;
2526 if ( roundingPrecision
== 64 ) {
2527 roundIncrement
= LIT64( 0x0000000000000400 );
2528 roundMask
= LIT64( 0x00000000000007FF );
2530 else if ( roundingPrecision
== 32 ) {
2531 roundIncrement
= LIT64( 0x0000008000000000 );
2532 roundMask
= LIT64( 0x000000FFFFFFFFFF );
2537 zSig0
|= ( zSig1
!= 0 );
2538 switch (roundingMode
) {
2539 case float_round_nearest_even
:
2540 case float_round_ties_away
:
2542 case float_round_to_zero
:
2545 case float_round_up
:
2546 roundIncrement
= zSign
? 0 : roundMask
;
2548 case float_round_down
:
2549 roundIncrement
= zSign
? roundMask
: 0;
2554 roundBits
= zSig0
& roundMask
;
2555 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
2556 if ( ( 0x7FFE < zExp
)
2557 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
2562 if (status
->flush_to_zero
) {
2563 float_raise(float_flag_output_denormal
, status
);
2564 return packFloatx80(zSign
, 0, 0);
2567 (status
->float_detect_tininess
2568 == float_tininess_before_rounding
)
2570 || ( zSig0
<= zSig0
+ roundIncrement
);
2571 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
2573 roundBits
= zSig0
& roundMask
;
2574 if (isTiny
&& roundBits
) {
2575 float_raise(float_flag_underflow
, status
);
2578 status
->float_exception_flags
|= float_flag_inexact
;
2580 zSig0
+= roundIncrement
;
2581 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
2582 roundIncrement
= roundMask
+ 1;
2583 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
2584 roundMask
|= roundIncrement
;
2586 zSig0
&= ~ roundMask
;
2587 return packFloatx80( zSign
, zExp
, zSig0
);
2591 status
->float_exception_flags
|= float_flag_inexact
;
2593 zSig0
+= roundIncrement
;
2594 if ( zSig0
< roundIncrement
) {
2596 zSig0
= LIT64( 0x8000000000000000 );
2598 roundIncrement
= roundMask
+ 1;
2599 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
2600 roundMask
|= roundIncrement
;
2602 zSig0
&= ~ roundMask
;
2603 if ( zSig0
== 0 ) zExp
= 0;
2604 return packFloatx80( zSign
, zExp
, zSig0
);
2606 switch (roundingMode
) {
2607 case float_round_nearest_even
:
2608 case float_round_ties_away
:
2609 increment
= ((int64_t)zSig1
< 0);
2611 case float_round_to_zero
:
2614 case float_round_up
:
2615 increment
= !zSign
&& zSig1
;
2617 case float_round_down
:
2618 increment
= zSign
&& zSig1
;
2623 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
2624 if ( ( 0x7FFE < zExp
)
2625 || ( ( zExp
== 0x7FFE )
2626 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
2632 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2633 if ( ( roundingMode
== float_round_to_zero
)
2634 || ( zSign
&& ( roundingMode
== float_round_up
) )
2635 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
2637 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
2639 return packFloatx80(zSign
,
2640 floatx80_infinity_high
,
2641 floatx80_infinity_low
);
2645 (status
->float_detect_tininess
2646 == float_tininess_before_rounding
)
2649 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
2650 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
2652 if (isTiny
&& zSig1
) {
2653 float_raise(float_flag_underflow
, status
);
2656 status
->float_exception_flags
|= float_flag_inexact
;
2658 switch (roundingMode
) {
2659 case float_round_nearest_even
:
2660 case float_round_ties_away
:
2661 increment
= ((int64_t)zSig1
< 0);
2663 case float_round_to_zero
:
2666 case float_round_up
:
2667 increment
= !zSign
&& zSig1
;
2669 case float_round_down
:
2670 increment
= zSign
&& zSig1
;
2678 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
2679 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
2681 return packFloatx80( zSign
, zExp
, zSig0
);
2685 status
->float_exception_flags
|= float_flag_inexact
;
2691 zSig0
= LIT64( 0x8000000000000000 );
2694 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
2698 if ( zSig0
== 0 ) zExp
= 0;
2700 return packFloatx80( zSign
, zExp
, zSig0
);
2704 /*----------------------------------------------------------------------------
2705 | Takes an abstract floating-point value having sign `zSign', exponent
2706 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2707 | and returns the proper extended double-precision floating-point value
2708 | corresponding to the abstract input. This routine is just like
2709 | `roundAndPackFloatx80' except that the input significand does not have to be
2711 *----------------------------------------------------------------------------*/
2713 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
2714 flag zSign
, int32_t zExp
,
2715 uint64_t zSig0
, uint64_t zSig1
,
2716 float_status
*status
)
2725 shiftCount
= countLeadingZeros64( zSig0
);
2726 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
2728 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
2729 zSig0
, zSig1
, status
);
2733 /*----------------------------------------------------------------------------
2734 | Returns the least-significant 64 fraction bits of the quadruple-precision
2735 | floating-point value `a'.
2736 *----------------------------------------------------------------------------*/
2738 static inline uint64_t extractFloat128Frac1( float128 a
)
2745 /*----------------------------------------------------------------------------
2746 | Returns the most-significant 48 fraction bits of the quadruple-precision
2747 | floating-point value `a'.
2748 *----------------------------------------------------------------------------*/
2750 static inline uint64_t extractFloat128Frac0( float128 a
)
2753 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
2757 /*----------------------------------------------------------------------------
2758 | Returns the exponent bits of the quadruple-precision floating-point value
2760 *----------------------------------------------------------------------------*/
2762 static inline int32_t extractFloat128Exp( float128 a
)
2765 return ( a
.high
>>48 ) & 0x7FFF;
2769 /*----------------------------------------------------------------------------
2770 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2771 *----------------------------------------------------------------------------*/
2773 static inline flag
extractFloat128Sign( float128 a
)
2780 /*----------------------------------------------------------------------------
2781 | Normalizes the subnormal quadruple-precision floating-point value
2782 | represented by the denormalized significand formed by the concatenation of
2783 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2784 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2785 | significand are stored at the location pointed to by `zSig0Ptr', and the
2786 | least significant 64 bits of the normalized significand are stored at the
2787 | location pointed to by `zSig1Ptr'.
2788 *----------------------------------------------------------------------------*/
2791 normalizeFloat128Subnormal(
2802 shiftCount
= countLeadingZeros64( aSig1
) - 15;
2803 if ( shiftCount
< 0 ) {
2804 *zSig0Ptr
= aSig1
>>( - shiftCount
);
2805 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
2808 *zSig0Ptr
= aSig1
<<shiftCount
;
2811 *zExpPtr
= - shiftCount
- 63;
2814 shiftCount
= countLeadingZeros64( aSig0
) - 15;
2815 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
2816 *zExpPtr
= 1 - shiftCount
;
2821 /*----------------------------------------------------------------------------
2822 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2823 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2824 | floating-point value, returning the result. After being shifted into the
2825 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2826 | added together to form the most significant 32 bits of the result. This
2827 | means that any integer portion of `zSig0' will be added into the exponent.
2828 | Since a properly normalized significand will have an integer portion equal
2829 | to 1, the `zExp' input should be 1 less than the desired result exponent
2830 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2832 *----------------------------------------------------------------------------*/
2834 static inline float128
2835 packFloat128( flag zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
2840 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
2845 /*----------------------------------------------------------------------------
2846 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2847 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2848 | and `zSig2', and returns the proper quadruple-precision floating-point value
2849 | corresponding to the abstract input. Ordinarily, the abstract value is
2850 | simply rounded and packed into the quadruple-precision format, with the
2851 | inexact exception raised if the abstract input cannot be represented
2852 | exactly. However, if the abstract value is too large, the overflow and
2853 | inexact exceptions are raised and an infinity or maximal finite value is
2854 | returned. If the abstract value is too small, the input value is rounded to
2855 | a subnormal number, and the underflow and inexact exceptions are raised if
2856 | the abstract input cannot be represented exactly as a subnormal quadruple-
2857 | precision floating-point number.
2858 | The input significand must be normalized or smaller. If the input
2859 | significand is not normalized, `zExp' must be 0; in that case, the result
2860 | returned is a subnormal number, and it must not require rounding. In the
2861 | usual case that the input significand is normalized, `zExp' must be 1 less
2862 | than the ``true'' floating-point exponent. The handling of underflow and
2863 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864 *----------------------------------------------------------------------------*/
2866 static float128
roundAndPackFloat128(flag zSign
, int32_t zExp
,
2867 uint64_t zSig0
, uint64_t zSig1
,
2868 uint64_t zSig2
, float_status
*status
)
2870 int8_t roundingMode
;
2871 flag roundNearestEven
, increment
, isTiny
;
2873 roundingMode
= status
->float_rounding_mode
;
2874 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2875 switch (roundingMode
) {
2876 case float_round_nearest_even
:
2877 case float_round_ties_away
:
2878 increment
= ((int64_t)zSig2
< 0);
2880 case float_round_to_zero
:
2883 case float_round_up
:
2884 increment
= !zSign
&& zSig2
;
2886 case float_round_down
:
2887 increment
= zSign
&& zSig2
;
2889 case float_round_to_odd
:
2890 increment
= !(zSig1
& 0x1) && zSig2
;
2895 if ( 0x7FFD <= (uint32_t) zExp
) {
2896 if ( ( 0x7FFD < zExp
)
2897 || ( ( zExp
== 0x7FFD )
2899 LIT64( 0x0001FFFFFFFFFFFF ),
2900 LIT64( 0xFFFFFFFFFFFFFFFF ),
2907 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2908 if ( ( roundingMode
== float_round_to_zero
)
2909 || ( zSign
&& ( roundingMode
== float_round_up
) )
2910 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
2911 || (roundingMode
== float_round_to_odd
)
2917 LIT64( 0x0000FFFFFFFFFFFF ),
2918 LIT64( 0xFFFFFFFFFFFFFFFF )
2921 return packFloat128( zSign
, 0x7FFF, 0, 0 );
2924 if (status
->flush_to_zero
) {
2925 float_raise(float_flag_output_denormal
, status
);
2926 return packFloat128(zSign
, 0, 0, 0);
2929 (status
->float_detect_tininess
2930 == float_tininess_before_rounding
)
2936 LIT64( 0x0001FFFFFFFFFFFF ),
2937 LIT64( 0xFFFFFFFFFFFFFFFF )
2939 shift128ExtraRightJamming(
2940 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
2942 if (isTiny
&& zSig2
) {
2943 float_raise(float_flag_underflow
, status
);
2945 switch (roundingMode
) {
2946 case float_round_nearest_even
:
2947 case float_round_ties_away
:
2948 increment
= ((int64_t)zSig2
< 0);
2950 case float_round_to_zero
:
2953 case float_round_up
:
2954 increment
= !zSign
&& zSig2
;
2956 case float_round_down
:
2957 increment
= zSign
&& zSig2
;
2959 case float_round_to_odd
:
2960 increment
= !(zSig1
& 0x1) && zSig2
;
2968 status
->float_exception_flags
|= float_flag_inexact
;
2971 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
2972 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
2975 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
2977 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
2981 /*----------------------------------------------------------------------------
2982 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2983 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2984 | returns the proper quadruple-precision floating-point value corresponding
2985 | to the abstract input. This routine is just like `roundAndPackFloat128'
2986 | except that the input significand has fewer bits and does not have to be
2987 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
2989 *----------------------------------------------------------------------------*/
2991 static float128
normalizeRoundAndPackFloat128(flag zSign
, int32_t zExp
,
2992 uint64_t zSig0
, uint64_t zSig1
,
2993 float_status
*status
)
3003 shiftCount
= countLeadingZeros64( zSig0
) - 15;
3004 if ( 0 <= shiftCount
) {
3006 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3009 shift128ExtraRightJamming(
3010 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
3013 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
3018 /*----------------------------------------------------------------------------
3019 | Returns the result of converting the 32-bit two's complement integer `a'
3020 | to the extended double-precision floating-point format. The conversion
3021 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3023 *----------------------------------------------------------------------------*/
3025 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
3032 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
3034 absA
= zSign
? - a
: a
;
3035 shiftCount
= countLeadingZeros32( absA
) + 32;
3037 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
3041 /*----------------------------------------------------------------------------
3042 | Returns the result of converting the 32-bit two's complement integer `a' to
3043 | the quadruple-precision floating-point format. The conversion is performed
3044 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3045 *----------------------------------------------------------------------------*/
3047 float128
int32_to_float128(int32_t a
, float_status
*status
)
3054 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
3056 absA
= zSign
? - a
: a
;
3057 shiftCount
= countLeadingZeros32( absA
) + 17;
3059 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
3063 /*----------------------------------------------------------------------------
3064 | Returns the result of converting the 64-bit two's complement integer `a'
3065 | to the extended double-precision floating-point format. The conversion
3066 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3068 *----------------------------------------------------------------------------*/
3070 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
3076 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
3078 absA
= zSign
? - a
: a
;
3079 shiftCount
= countLeadingZeros64( absA
);
3080 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
3084 /*----------------------------------------------------------------------------
3085 | Returns the result of converting the 64-bit two's complement integer `a' to
3086 | the quadruple-precision floating-point format. The conversion is performed
3087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3088 *----------------------------------------------------------------------------*/
3090 float128
int64_to_float128(int64_t a
, float_status
*status
)
3096 uint64_t zSig0
, zSig1
;
3098 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
3100 absA
= zSign
? - a
: a
;
3101 shiftCount
= countLeadingZeros64( absA
) + 49;
3102 zExp
= 0x406E - shiftCount
;
3103 if ( 64 <= shiftCount
) {
3112 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3113 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
3117 /*----------------------------------------------------------------------------
3118 | Returns the result of converting the 64-bit unsigned integer `a'
3119 | to the quadruple-precision floating-point format. The conversion is performed
3120 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3121 *----------------------------------------------------------------------------*/
3123 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
3126 return float128_zero
;
3128 return normalizeRoundAndPackFloat128(0, 0x406E, a
, 0, status
);
3134 /*----------------------------------------------------------------------------
3135 | Returns the result of converting the single-precision floating-point value
3136 | `a' to the double-precision floating-point format. The conversion is
3137 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3139 *----------------------------------------------------------------------------*/
3141 float64
float32_to_float64(float32 a
, float_status
*status
)
3146 a
= float32_squash_input_denormal(a
, status
);
3148 aSig
= extractFloat32Frac( a
);
3149 aExp
= extractFloat32Exp( a
);
3150 aSign
= extractFloat32Sign( a
);
3151 if ( aExp
== 0xFF ) {
3153 return commonNaNToFloat64(float32ToCommonNaN(a
, status
), status
);
3155 return packFloat64( aSign
, 0x7FF, 0 );
3158 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
3159 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3162 return packFloat64( aSign
, aExp
+ 0x380, ( (uint64_t) aSig
)<<29 );
3166 /*----------------------------------------------------------------------------
3167 | Returns the result of converting the single-precision floating-point value
3168 | `a' to the extended double-precision floating-point format. The conversion
3169 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3171 *----------------------------------------------------------------------------*/
3173 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
3179 a
= float32_squash_input_denormal(a
, status
);
3180 aSig
= extractFloat32Frac( a
);
3181 aExp
= extractFloat32Exp( a
);
3182 aSign
= extractFloat32Sign( a
);
3183 if ( aExp
== 0xFF ) {
3185 return commonNaNToFloatx80(float32ToCommonNaN(a
, status
), status
);
3187 return packFloatx80(aSign
,
3188 floatx80_infinity_high
,
3189 floatx80_infinity_low
);
3192 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
3193 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3196 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
3200 /*----------------------------------------------------------------------------
3201 | Returns the result of converting the single-precision floating-point value
3202 | `a' to the double-precision floating-point format. The conversion is
3203 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3205 *----------------------------------------------------------------------------*/
3207 float128
float32_to_float128(float32 a
, float_status
*status
)
3213 a
= float32_squash_input_denormal(a
, status
);
3214 aSig
= extractFloat32Frac( a
);
3215 aExp
= extractFloat32Exp( a
);
3216 aSign
= extractFloat32Sign( a
);
3217 if ( aExp
== 0xFF ) {
3219 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
3221 return packFloat128( aSign
, 0x7FFF, 0, 0 );
3224 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
3225 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3228 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
3232 /*----------------------------------------------------------------------------
3233 | Returns the remainder of the single-precision floating-point value `a'
3234 | with respect to the corresponding value `b'. The operation is performed
3235 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3236 *----------------------------------------------------------------------------*/
3238 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
3241 int aExp
, bExp
, expDiff
;
3242 uint32_t aSig
, bSig
;
3244 uint64_t aSig64
, bSig64
, q64
;
3245 uint32_t alternateASig
;
3247 a
= float32_squash_input_denormal(a
, status
);
3248 b
= float32_squash_input_denormal(b
, status
);
3250 aSig
= extractFloat32Frac( a
);
3251 aExp
= extractFloat32Exp( a
);
3252 aSign
= extractFloat32Sign( a
);
3253 bSig
= extractFloat32Frac( b
);
3254 bExp
= extractFloat32Exp( b
);
3255 if ( aExp
== 0xFF ) {
3256 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
3257 return propagateFloat32NaN(a
, b
, status
);
3259 float_raise(float_flag_invalid
, status
);
3260 return float32_default_nan(status
);
3262 if ( bExp
== 0xFF ) {
3264 return propagateFloat32NaN(a
, b
, status
);
3270 float_raise(float_flag_invalid
, status
);
3271 return float32_default_nan(status
);
3273 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
3276 if ( aSig
== 0 ) return a
;
3277 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3279 expDiff
= aExp
- bExp
;
3282 if ( expDiff
< 32 ) {
3285 if ( expDiff
< 0 ) {
3286 if ( expDiff
< -1 ) return a
;
3289 q
= ( bSig
<= aSig
);
3290 if ( q
) aSig
-= bSig
;
3291 if ( 0 < expDiff
) {
3292 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
3295 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3303 if ( bSig
<= aSig
) aSig
-= bSig
;
3304 aSig64
= ( (uint64_t) aSig
)<<40;
3305 bSig64
= ( (uint64_t) bSig
)<<40;
3307 while ( 0 < expDiff
) {
3308 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
3309 q64
= ( 2 < q64
) ? q64
- 2 : 0;
3310 aSig64
= - ( ( bSig
* q64
)<<38 );
3314 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
3315 q64
= ( 2 < q64
) ? q64
- 2 : 0;
3316 q
= q64
>>( 64 - expDiff
);
3318 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
3321 alternateASig
= aSig
;
3324 } while ( 0 <= (int32_t) aSig
);
3325 sigMean
= aSig
+ alternateASig
;
3326 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3327 aSig
= alternateASig
;
3329 zSign
= ( (int32_t) aSig
< 0 );
3330 if ( zSign
) aSig
= - aSig
;
3331 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
3336 /*----------------------------------------------------------------------------
3337 | Returns the binary exponential of the single-precision floating-point value
3338 | `a'. The operation is performed according to the IEC/IEEE Standard for
3339 | Binary Floating-Point Arithmetic.
3341 | Uses the following identities:
3343 | 1. -------------------------------------------------------------------------
3347 | 2. -------------------------------------------------------------------------
3350 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3352 *----------------------------------------------------------------------------*/
3354 static const float64 float32_exp2_coefficients
[15] =
3356 const_float64( 0x3ff0000000000000ll
), /* 1 */
3357 const_float64( 0x3fe0000000000000ll
), /* 2 */
3358 const_float64( 0x3fc5555555555555ll
), /* 3 */
3359 const_float64( 0x3fa5555555555555ll
), /* 4 */
3360 const_float64( 0x3f81111111111111ll
), /* 5 */
3361 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
3362 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
3363 const_float64( 0x3efa01a01a01a01all
), /* 8 */
3364 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
3365 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
3366 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
3367 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
3368 const_float64( 0x3de6124613a86d09ll
), /* 13 */
3369 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
3370 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
3373 float32
float32_exp2(float32 a
, float_status
*status
)
3380 a
= float32_squash_input_denormal(a
, status
);
3382 aSig
= extractFloat32Frac( a
);
3383 aExp
= extractFloat32Exp( a
);
3384 aSign
= extractFloat32Sign( a
);
3386 if ( aExp
== 0xFF) {
3388 return propagateFloat32NaN(a
, float32_zero
, status
);
3390 return (aSign
) ? float32_zero
: a
;
3393 if (aSig
== 0) return float32_one
;
3396 float_raise(float_flag_inexact
, status
);
3398 /* ******************************* */
3399 /* using float64 for approximation */
3400 /* ******************************* */
3401 x
= float32_to_float64(a
, status
);
3402 x
= float64_mul(x
, float64_ln2
, status
);
3406 for (i
= 0 ; i
< 15 ; i
++) {
3409 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
3410 r
= float64_add(r
, f
, status
);
3412 xn
= float64_mul(xn
, x
, status
);
3415 return float64_to_float32(r
, status
);
3418 /*----------------------------------------------------------------------------
3419 | Returns the binary log of the single-precision floating-point value `a'.
3420 | The operation is performed according to the IEC/IEEE Standard for Binary
3421 | Floating-Point Arithmetic.
3422 *----------------------------------------------------------------------------*/
3423 float32
float32_log2(float32 a
, float_status
*status
)
3427 uint32_t aSig
, zSig
, i
;
3429 a
= float32_squash_input_denormal(a
, status
);
3430 aSig
= extractFloat32Frac( a
);
3431 aExp
= extractFloat32Exp( a
);
3432 aSign
= extractFloat32Sign( a
);
3435 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
3436 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3439 float_raise(float_flag_invalid
, status
);
3440 return float32_default_nan(status
);
3442 if ( aExp
== 0xFF ) {
3444 return propagateFloat32NaN(a
, float32_zero
, status
);
3454 for (i
= 1 << 22; i
> 0; i
>>= 1) {
3455 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
3456 if ( aSig
& 0x01000000 ) {
3465 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
3468 /*----------------------------------------------------------------------------
3469 | Returns 1 if the single-precision floating-point value `a' is equal to
3470 | the corresponding value `b', and 0 otherwise. The invalid exception is
3471 | raised if either operand is a NaN. Otherwise, the comparison is performed
3472 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3473 *----------------------------------------------------------------------------*/
3475 int float32_eq(float32 a
, float32 b
, float_status
*status
)
3478 a
= float32_squash_input_denormal(a
, status
);
3479 b
= float32_squash_input_denormal(b
, status
);
3481 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3482 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3484 float_raise(float_flag_invalid
, status
);
3487 av
= float32_val(a
);
3488 bv
= float32_val(b
);
3489 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3492 /*----------------------------------------------------------------------------
3493 | Returns 1 if the single-precision floating-point value `a' is less than
3494 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3495 | exception is raised if either operand is a NaN. The comparison is performed
3496 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3497 *----------------------------------------------------------------------------*/
3499 int float32_le(float32 a
, float32 b
, float_status
*status
)
3503 a
= float32_squash_input_denormal(a
, status
);
3504 b
= float32_squash_input_denormal(b
, status
);
3506 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3507 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3509 float_raise(float_flag_invalid
, status
);
3512 aSign
= extractFloat32Sign( a
);
3513 bSign
= extractFloat32Sign( b
);
3514 av
= float32_val(a
);
3515 bv
= float32_val(b
);
3516 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3517 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3521 /*----------------------------------------------------------------------------
3522 | Returns 1 if the single-precision floating-point value `a' is less than
3523 | the corresponding value `b', and 0 otherwise. The invalid exception is
3524 | raised if either operand is a NaN. The comparison is performed according
3525 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3526 *----------------------------------------------------------------------------*/
3528 int float32_lt(float32 a
, float32 b
, float_status
*status
)
3532 a
= float32_squash_input_denormal(a
, status
);
3533 b
= float32_squash_input_denormal(b
, status
);
3535 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3536 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3538 float_raise(float_flag_invalid
, status
);
3541 aSign
= extractFloat32Sign( a
);
3542 bSign
= extractFloat32Sign( b
);
3543 av
= float32_val(a
);
3544 bv
= float32_val(b
);
3545 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
3546 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3550 /*----------------------------------------------------------------------------
3551 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3552 | be compared, and 0 otherwise. The invalid exception is raised if either
3553 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3554 | Standard for Binary Floating-Point Arithmetic.
3555 *----------------------------------------------------------------------------*/
3557 int float32_unordered(float32 a
, float32 b
, float_status
*status
)
3559 a
= float32_squash_input_denormal(a
, status
);
3560 b
= float32_squash_input_denormal(b
, status
);
3562 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3563 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3565 float_raise(float_flag_invalid
, status
);
3571 /*----------------------------------------------------------------------------
3572 | Returns 1 if the single-precision floating-point value `a' is equal to
3573 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3574 | exception. The comparison is performed according to the IEC/IEEE Standard
3575 | for Binary Floating-Point Arithmetic.
3576 *----------------------------------------------------------------------------*/
3578 int float32_eq_quiet(float32 a
, float32 b
, float_status
*status
)
3580 a
= float32_squash_input_denormal(a
, status
);
3581 b
= float32_squash_input_denormal(b
, status
);
3583 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3584 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3586 if (float32_is_signaling_nan(a
, status
)
3587 || float32_is_signaling_nan(b
, status
)) {
3588 float_raise(float_flag_invalid
, status
);
3592 return ( float32_val(a
) == float32_val(b
) ) ||
3593 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
3596 /*----------------------------------------------------------------------------
3597 | Returns 1 if the single-precision floating-point value `a' is less than or
3598 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3599 | cause an exception. Otherwise, the comparison is performed according to the
3600 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3601 *----------------------------------------------------------------------------*/
3603 int float32_le_quiet(float32 a
, float32 b
, float_status
*status
)
3607 a
= float32_squash_input_denormal(a
, status
);
3608 b
= float32_squash_input_denormal(b
, status
);
3610 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3611 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3613 if (float32_is_signaling_nan(a
, status
)
3614 || float32_is_signaling_nan(b
, status
)) {
3615 float_raise(float_flag_invalid
, status
);
3619 aSign
= extractFloat32Sign( a
);
3620 bSign
= extractFloat32Sign( b
);
3621 av
= float32_val(a
);
3622 bv
= float32_val(b
);
3623 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3624 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3628 /*----------------------------------------------------------------------------
3629 | Returns 1 if the single-precision floating-point value `a' is less than
3630 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3631 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3632 | Standard for Binary Floating-Point Arithmetic.
3633 *----------------------------------------------------------------------------*/
3635 int float32_lt_quiet(float32 a
, float32 b
, float_status
*status
)
3639 a
= float32_squash_input_denormal(a
, status
);
3640 b
= float32_squash_input_denormal(b
, status
);
3642 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3643 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3645 if (float32_is_signaling_nan(a
, status
)
3646 || float32_is_signaling_nan(b
, status
)) {
3647 float_raise(float_flag_invalid
, status
);
3651 aSign
= extractFloat32Sign( a
);
3652 bSign
= extractFloat32Sign( b
);
3653 av
= float32_val(a
);
3654 bv
= float32_val(b
);
3655 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
3656 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3660 /*----------------------------------------------------------------------------
3661 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3662 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3663 | comparison is performed according to the IEC/IEEE Standard for Binary
3664 | Floating-Point Arithmetic.
3665 *----------------------------------------------------------------------------*/
3667 int float32_unordered_quiet(float32 a
, float32 b
, float_status
*status
)
3669 a
= float32_squash_input_denormal(a
, status
);
3670 b
= float32_squash_input_denormal(b
, status
);
3672 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3673 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3675 if (float32_is_signaling_nan(a
, status
)
3676 || float32_is_signaling_nan(b
, status
)) {
3677 float_raise(float_flag_invalid
, status
);
3685 /*----------------------------------------------------------------------------
3686 | Returns the result of converting the double-precision floating-point value
3687 | `a' to the single-precision floating-point format. The conversion is
3688 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3690 *----------------------------------------------------------------------------*/
3692 float32
float64_to_float32(float64 a
, float_status
*status
)
3698 a
= float64_squash_input_denormal(a
, status
);
3700 aSig
= extractFloat64Frac( a
);
3701 aExp
= extractFloat64Exp( a
);
3702 aSign
= extractFloat64Sign( a
);
3703 if ( aExp
== 0x7FF ) {
3705 return commonNaNToFloat32(float64ToCommonNaN(a
, status
), status
);
3707 return packFloat32( aSign
, 0xFF, 0 );
3709 shift64RightJamming( aSig
, 22, &aSig
);
3711 if ( aExp
|| zSig
) {
3715 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
3720 /*----------------------------------------------------------------------------
3721 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3722 | half-precision floating-point value, returning the result. After being
3723 | shifted into the proper positions, the three fields are simply added
3724 | together to form the result. This means that any integer portion of `zSig'
3725 | will be added into the exponent. Since a properly normalized significand
3726 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3727 | than the desired result exponent whenever `zSig' is a complete, normalized
3729 *----------------------------------------------------------------------------*/
3730 static float16
packFloat16(flag zSign
, int zExp
, uint16_t zSig
)
3732 return make_float16(
3733 (((uint32_t)zSign
) << 15) + (((uint32_t)zExp
) << 10) + zSig
);
3736 /*----------------------------------------------------------------------------
3737 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3738 | and significand `zSig', and returns the proper half-precision floating-
3739 | point value corresponding to the abstract input. Ordinarily, the abstract
3740 | value is simply rounded and packed into the half-precision format, with
3741 | the inexact exception raised if the abstract input cannot be represented
3742 | exactly. However, if the abstract value is too large, the overflow and
3743 | inexact exceptions are raised and an infinity or maximal finite value is
3744 | returned. If the abstract value is too small, the input value is rounded to
3745 | a subnormal number, and the underflow and inexact exceptions are raised if
3746 | the abstract input cannot be represented exactly as a subnormal half-
3747 | precision floating-point number.
3748 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3749 | ARM-style "alternative representation", which omits the NaN and Inf
3750 | encodings in order to raise the maximum representable exponent by one.
3751 | The input significand `zSig' has its binary point between bits 22
3752 | and 23, which is 13 bits to the left of the usual location. This shifted
3753 | significand must be normalized or smaller. If `zSig' is not normalized,
3754 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3755 | and it must not require rounding. In the usual case that `zSig' is
3756 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3757 | Note the slightly odd position of the binary point in zSig compared with the
3758 | other roundAndPackFloat functions. This should probably be fixed if we
3759 | need to implement more float16 routines than just conversion.
3760 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3761 | Binary Floating-Point Arithmetic.
3762 *----------------------------------------------------------------------------*/
3764 static float16
roundAndPackFloat16(flag zSign
, int zExp
,
3765 uint32_t zSig
, flag ieee
,
3766 float_status
*status
)
3768 int maxexp
= ieee
? 29 : 30;
3771 bool rounding_bumps_exp
;
3772 bool is_tiny
= false;
3774 /* Calculate the mask of bits of the mantissa which are not
3775 * representable in half-precision and will be lost.
3778 /* Will be denormal in halfprec */
3784 /* Normal number in halfprec */
3788 switch (status
->float_rounding_mode
) {
3789 case float_round_nearest_even
:
3790 increment
= (mask
+ 1) >> 1;
3791 if ((zSig
& mask
) == increment
) {
3792 increment
= zSig
& (increment
<< 1);
3795 case float_round_ties_away
:
3796 increment
= (mask
+ 1) >> 1;
3798 case float_round_up
:
3799 increment
= zSign
? 0 : mask
;
3801 case float_round_down
:
3802 increment
= zSign
? mask
: 0;
3804 default: /* round_to_zero */
3809 rounding_bumps_exp
= (zSig
+ increment
>= 0x01000000);
3811 if (zExp
> maxexp
|| (zExp
== maxexp
&& rounding_bumps_exp
)) {
3813 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3814 return packFloat16(zSign
, 0x1f, 0);
3816 float_raise(float_flag_invalid
, status
);
3817 return packFloat16(zSign
, 0x1f, 0x3ff);
3822 /* Note that flush-to-zero does not affect half-precision results */
3824 (status
->float_detect_tininess
== float_tininess_before_rounding
)
3826 || (!rounding_bumps_exp
);
3829 float_raise(float_flag_inexact
, status
);
3831 float_raise(float_flag_underflow
, status
);
3836 if (rounding_bumps_exp
) {
3842 return packFloat16(zSign
, 0, 0);
3848 return packFloat16(zSign
, zExp
, zSig
>> 13);
3851 /*----------------------------------------------------------------------------
3852 | If `a' is denormal and we are in flush-to-zero mode then set the
3853 | input-denormal exception and return zero. Otherwise just return the value.
3854 *----------------------------------------------------------------------------*/
3855 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3857 if (status
->flush_inputs_to_zero
) {
3858 if (extractFloat16Exp(a
) == 0 && extractFloat16Frac(a
) != 0) {
3859 float_raise(float_flag_input_denormal
, status
);
3860 return make_float16(float16_val(a
) & 0x8000);
3866 static void normalizeFloat16Subnormal(uint32_t aSig
, int *zExpPtr
,
3869 int8_t shiftCount
= countLeadingZeros32(aSig
) - 21;
3870 *zSigPtr
= aSig
<< shiftCount
;
3871 *zExpPtr
= 1 - shiftCount
;
3874 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3875 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3877 float32
float16_to_float32(float16 a
, flag ieee
, float_status
*status
)
3883 aSign
= extractFloat16Sign(a
);
3884 aExp
= extractFloat16Exp(a
);
3885 aSig
= extractFloat16Frac(a
);
3887 if (aExp
== 0x1f && ieee
) {
3889 return commonNaNToFloat32(float16ToCommonNaN(a
, status
), status
);
3891 return packFloat32(aSign
, 0xff, 0);
3895 return packFloat32(aSign
, 0, 0);
3898 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3901 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
3904 float16
float32_to_float16(float32 a
, flag ieee
, float_status
*status
)
3910 a
= float32_squash_input_denormal(a
, status
);
3912 aSig
= extractFloat32Frac( a
);
3913 aExp
= extractFloat32Exp( a
);
3914 aSign
= extractFloat32Sign( a
);
3915 if ( aExp
== 0xFF ) {
3917 /* Input is a NaN */
3919 float_raise(float_flag_invalid
, status
);
3920 return packFloat16(aSign
, 0, 0);
3922 return commonNaNToFloat16(
3923 float32ToCommonNaN(a
, status
), status
);
3927 float_raise(float_flag_invalid
, status
);
3928 return packFloat16(aSign
, 0x1f, 0x3ff);
3930 return packFloat16(aSign
, 0x1f, 0);
3932 if (aExp
== 0 && aSig
== 0) {
3933 return packFloat16(aSign
, 0, 0);
3935 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3936 * even if the input is denormal; however this is harmless because
3937 * the largest possible single-precision denormal is still smaller
3938 * than the smallest representable half-precision denormal, and so we
3939 * will end up ignoring aSig and returning via the "always return zero"
3945 return roundAndPackFloat16(aSign
, aExp
, aSig
, ieee
, status
);
3948 float64
float16_to_float64(float16 a
, flag ieee
, float_status
*status
)
3954 aSign
= extractFloat16Sign(a
);
3955 aExp
= extractFloat16Exp(a
);
3956 aSig
= extractFloat16Frac(a
);
3958 if (aExp
== 0x1f && ieee
) {
3960 return commonNaNToFloat64(
3961 float16ToCommonNaN(a
, status
), status
);
3963 return packFloat64(aSign
, 0x7ff, 0);
3967 return packFloat64(aSign
, 0, 0);
3970 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3973 return packFloat64(aSign
, aExp
+ 0x3f0, ((uint64_t)aSig
) << 42);
3976 float16
float64_to_float16(float64 a
, flag ieee
, float_status
*status
)
3983 a
= float64_squash_input_denormal(a
, status
);
3985 aSig
= extractFloat64Frac(a
);
3986 aExp
= extractFloat64Exp(a
);
3987 aSign
= extractFloat64Sign(a
);
3988 if (aExp
== 0x7FF) {
3990 /* Input is a NaN */
3992 float_raise(float_flag_invalid
, status
);
3993 return packFloat16(aSign
, 0, 0);
3995 return commonNaNToFloat16(
3996 float64ToCommonNaN(a
, status
), status
);
4000 float_raise(float_flag_invalid
, status
);
4001 return packFloat16(aSign
, 0x1f, 0x3ff);
4003 return packFloat16(aSign
, 0x1f, 0);
4005 shift64RightJamming(aSig
, 29, &aSig
);
4007 if (aExp
== 0 && zSig
== 0) {
4008 return packFloat16(aSign
, 0, 0);
4010 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4011 * even if the input is denormal; however this is harmless because
4012 * the largest possible single-precision denormal is still smaller
4013 * than the smallest representable half-precision denormal, and so we
4014 * will end up ignoring aSig and returning via the "always return zero"
4020 return roundAndPackFloat16(aSign
, aExp
, zSig
, ieee
, status
);
4023 /*----------------------------------------------------------------------------
4024 | Returns the result of converting the double-precision floating-point value
4025 | `a' to the extended double-precision floating-point format. The conversion
4026 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4028 *----------------------------------------------------------------------------*/
4030 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
4036 a
= float64_squash_input_denormal(a
, status
);
4037 aSig
= extractFloat64Frac( a
);
4038 aExp
= extractFloat64Exp( a
);
4039 aSign
= extractFloat64Sign( a
);
4040 if ( aExp
== 0x7FF ) {
4042 return commonNaNToFloatx80(float64ToCommonNaN(a
, status
), status
);
4044 return packFloatx80(aSign
,
4045 floatx80_infinity_high
,
4046 floatx80_infinity_low
);
4049 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4050 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4054 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
4058 /*----------------------------------------------------------------------------
4059 | Returns the result of converting the double-precision floating-point value
4060 | `a' to the quadruple-precision floating-point format. The conversion is
4061 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4063 *----------------------------------------------------------------------------*/
4065 float128
float64_to_float128(float64 a
, float_status
*status
)
4069 uint64_t aSig
, zSig0
, zSig1
;
4071 a
= float64_squash_input_denormal(a
, status
);
4072 aSig
= extractFloat64Frac( a
);
4073 aExp
= extractFloat64Exp( a
);
4074 aSign
= extractFloat64Sign( a
);
4075 if ( aExp
== 0x7FF ) {
4077 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
4079 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4082 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4083 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4086 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
4087 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
4092 /*----------------------------------------------------------------------------
4093 | Returns the remainder of the double-precision floating-point value `a'
4094 | with respect to the corresponding value `b'. The operation is performed
4095 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4096 *----------------------------------------------------------------------------*/
4098 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
4101 int aExp
, bExp
, expDiff
;
4102 uint64_t aSig
, bSig
;
4103 uint64_t q
, alternateASig
;
4106 a
= float64_squash_input_denormal(a
, status
);
4107 b
= float64_squash_input_denormal(b
, status
);
4108 aSig
= extractFloat64Frac( a
);
4109 aExp
= extractFloat64Exp( a
);
4110 aSign
= extractFloat64Sign( a
);
4111 bSig
= extractFloat64Frac( b
);
4112 bExp
= extractFloat64Exp( b
);
4113 if ( aExp
== 0x7FF ) {
4114 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4115 return propagateFloat64NaN(a
, b
, status
);
4117 float_raise(float_flag_invalid
, status
);
4118 return float64_default_nan(status
);
4120 if ( bExp
== 0x7FF ) {
4122 return propagateFloat64NaN(a
, b
, status
);
4128 float_raise(float_flag_invalid
, status
);
4129 return float64_default_nan(status
);
4131 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4134 if ( aSig
== 0 ) return a
;
4135 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4137 expDiff
= aExp
- bExp
;
4138 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
4139 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4140 if ( expDiff
< 0 ) {
4141 if ( expDiff
< -1 ) return a
;
4144 q
= ( bSig
<= aSig
);
4145 if ( q
) aSig
-= bSig
;
4147 while ( 0 < expDiff
) {
4148 q
= estimateDiv128To64( aSig
, 0, bSig
);
4149 q
= ( 2 < q
) ? q
- 2 : 0;
4150 aSig
= - ( ( bSig
>>2 ) * q
);
4154 if ( 0 < expDiff
) {
4155 q
= estimateDiv128To64( aSig
, 0, bSig
);
4156 q
= ( 2 < q
) ? q
- 2 : 0;
4159 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4166 alternateASig
= aSig
;
4169 } while ( 0 <= (int64_t) aSig
);
4170 sigMean
= aSig
+ alternateASig
;
4171 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4172 aSig
= alternateASig
;
4174 zSign
= ( (int64_t) aSig
< 0 );
4175 if ( zSign
) aSig
= - aSig
;
4176 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
4180 /*----------------------------------------------------------------------------
4181 | Returns the binary log of the double-precision floating-point value `a'.
4182 | The operation is performed according to the IEC/IEEE Standard for Binary
4183 | Floating-Point Arithmetic.
4184 *----------------------------------------------------------------------------*/
4185 float64
float64_log2(float64 a
, float_status
*status
)
4189 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
4190 a
= float64_squash_input_denormal(a
, status
);
4192 aSig
= extractFloat64Frac( a
);
4193 aExp
= extractFloat64Exp( a
);
4194 aSign
= extractFloat64Sign( a
);
4197 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
4198 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4201 float_raise(float_flag_invalid
, status
);
4202 return float64_default_nan(status
);
4204 if ( aExp
== 0x7FF ) {
4206 return propagateFloat64NaN(a
, float64_zero
, status
);
4212 aSig
|= LIT64( 0x0010000000000000 );
4214 zSig
= (uint64_t)aExp
<< 52;
4215 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
4216 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
4217 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
4218 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
4226 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
4229 /*----------------------------------------------------------------------------
4230 | Returns 1 if the double-precision floating-point value `a' is equal to the
4231 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4232 | if either operand is a NaN. Otherwise, the comparison is performed
4233 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4234 *----------------------------------------------------------------------------*/
4236 int float64_eq(float64 a
, float64 b
, float_status
*status
)
4239 a
= float64_squash_input_denormal(a
, status
);
4240 b
= float64_squash_input_denormal(b
, status
);
4242 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4243 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4245 float_raise(float_flag_invalid
, status
);
4248 av
= float64_val(a
);
4249 bv
= float64_val(b
);
4250 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4254 /*----------------------------------------------------------------------------
4255 | Returns 1 if the double-precision floating-point value `a' is less than or
4256 | equal to the corresponding value `b', and 0 otherwise. The invalid
4257 | exception is raised if either operand is a NaN. The comparison is performed
4258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4259 *----------------------------------------------------------------------------*/
4261 int float64_le(float64 a
, float64 b
, float_status
*status
)
4265 a
= float64_squash_input_denormal(a
, status
);
4266 b
= float64_squash_input_denormal(b
, status
);
4268 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4269 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4271 float_raise(float_flag_invalid
, status
);
4274 aSign
= extractFloat64Sign( a
);
4275 bSign
= extractFloat64Sign( b
);
4276 av
= float64_val(a
);
4277 bv
= float64_val(b
);
4278 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4279 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4283 /*----------------------------------------------------------------------------
4284 | Returns 1 if the double-precision floating-point value `a' is less than
4285 | the corresponding value `b', and 0 otherwise. The invalid exception is
4286 | raised if either operand is a NaN. The comparison is performed according
4287 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4288 *----------------------------------------------------------------------------*/
4290 int float64_lt(float64 a
, float64 b
, float_status
*status
)
4295 a
= float64_squash_input_denormal(a
, status
);
4296 b
= float64_squash_input_denormal(b
, status
);
4297 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4298 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4300 float_raise(float_flag_invalid
, status
);
4303 aSign
= extractFloat64Sign( a
);
4304 bSign
= extractFloat64Sign( b
);
4305 av
= float64_val(a
);
4306 bv
= float64_val(b
);
4307 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4308 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4312 /*----------------------------------------------------------------------------
4313 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4314 | be compared, and 0 otherwise. The invalid exception is raised if either
4315 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4316 | Standard for Binary Floating-Point Arithmetic.
4317 *----------------------------------------------------------------------------*/
4319 int float64_unordered(float64 a
, float64 b
, float_status
*status
)
4321 a
= float64_squash_input_denormal(a
, status
);
4322 b
= float64_squash_input_denormal(b
, status
);
4324 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4325 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4327 float_raise(float_flag_invalid
, status
);
4333 /*----------------------------------------------------------------------------
4334 | Returns 1 if the double-precision floating-point value `a' is equal to the
4335 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4336 | exception.The comparison is performed according to the IEC/IEEE Standard
4337 | for Binary Floating-Point Arithmetic.
4338 *----------------------------------------------------------------------------*/
4340 int float64_eq_quiet(float64 a
, float64 b
, float_status
*status
)
4343 a
= float64_squash_input_denormal(a
, status
);
4344 b
= float64_squash_input_denormal(b
, status
);
4346 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4347 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4349 if (float64_is_signaling_nan(a
, status
)
4350 || float64_is_signaling_nan(b
, status
)) {
4351 float_raise(float_flag_invalid
, status
);
4355 av
= float64_val(a
);
4356 bv
= float64_val(b
);
4357 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4361 /*----------------------------------------------------------------------------
4362 | Returns 1 if the double-precision floating-point value `a' is less than or
4363 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4364 | cause an exception. Otherwise, the comparison is performed according to the
4365 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4366 *----------------------------------------------------------------------------*/
4368 int float64_le_quiet(float64 a
, float64 b
, float_status
*status
)
4372 a
= float64_squash_input_denormal(a
, status
);
4373 b
= float64_squash_input_denormal(b
, status
);
4375 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4376 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4378 if (float64_is_signaling_nan(a
, status
)
4379 || float64_is_signaling_nan(b
, status
)) {
4380 float_raise(float_flag_invalid
, status
);
4384 aSign
= extractFloat64Sign( a
);
4385 bSign
= extractFloat64Sign( b
);
4386 av
= float64_val(a
);
4387 bv
= float64_val(b
);
4388 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4389 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4393 /*----------------------------------------------------------------------------
4394 | Returns 1 if the double-precision floating-point value `a' is less than
4395 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4396 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4397 | Standard for Binary Floating-Point Arithmetic.
4398 *----------------------------------------------------------------------------*/
4400 int float64_lt_quiet(float64 a
, float64 b
, float_status
*status
)
4404 a
= float64_squash_input_denormal(a
, status
);
4405 b
= float64_squash_input_denormal(b
, status
);
4407 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4408 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4410 if (float64_is_signaling_nan(a
, status
)
4411 || float64_is_signaling_nan(b
, status
)) {
4412 float_raise(float_flag_invalid
, status
);
4416 aSign
= extractFloat64Sign( a
);
4417 bSign
= extractFloat64Sign( b
);
4418 av
= float64_val(a
);
4419 bv
= float64_val(b
);
4420 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4421 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4425 /*----------------------------------------------------------------------------
4426 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4427 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4428 | comparison is performed according to the IEC/IEEE Standard for Binary
4429 | Floating-Point Arithmetic.
4430 *----------------------------------------------------------------------------*/
4432 int float64_unordered_quiet(float64 a
, float64 b
, float_status
*status
)
4434 a
= float64_squash_input_denormal(a
, status
);
4435 b
= float64_squash_input_denormal(b
, status
);
4437 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4438 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4440 if (float64_is_signaling_nan(a
, status
)
4441 || float64_is_signaling_nan(b
, status
)) {
4442 float_raise(float_flag_invalid
, status
);
4449 /*----------------------------------------------------------------------------
4450 | Returns the result of converting the extended double-precision floating-
4451 | point value `a' to the 32-bit two's complement integer format. The
4452 | conversion is performed according to the IEC/IEEE Standard for Binary
4453 | Floating-Point Arithmetic---which means in particular that the conversion
4454 | is rounded according to the current rounding mode. If `a' is a NaN, the
4455 | largest positive integer is returned. Otherwise, if the conversion
4456 | overflows, the largest integer with the same sign as `a' is returned.
4457 *----------------------------------------------------------------------------*/
4459 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
4462 int32_t aExp
, shiftCount
;
4465 if (floatx80_invalid_encoding(a
)) {
4466 float_raise(float_flag_invalid
, status
);
4469 aSig
= extractFloatx80Frac( a
);
4470 aExp
= extractFloatx80Exp( a
);
4471 aSign
= extractFloatx80Sign( a
);
4472 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4473 shiftCount
= 0x4037 - aExp
;
4474 if ( shiftCount
<= 0 ) shiftCount
= 1;
4475 shift64RightJamming( aSig
, shiftCount
, &aSig
);
4476 return roundAndPackInt32(aSign
, aSig
, status
);
4480 /*----------------------------------------------------------------------------
4481 | Returns the result of converting the extended double-precision floating-
4482 | point value `a' to the 32-bit two's complement integer format. The
4483 | conversion is performed according to the IEC/IEEE Standard for Binary
4484 | Floating-Point Arithmetic, except that the conversion is always rounded
4485 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4486 | Otherwise, if the conversion overflows, the largest integer with the same
4487 | sign as `a' is returned.
4488 *----------------------------------------------------------------------------*/
4490 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
4493 int32_t aExp
, shiftCount
;
4494 uint64_t aSig
, savedASig
;
4497 if (floatx80_invalid_encoding(a
)) {
4498 float_raise(float_flag_invalid
, status
);
4501 aSig
= extractFloatx80Frac( a
);
4502 aExp
= extractFloatx80Exp( a
);
4503 aSign
= extractFloatx80Sign( a
);
4504 if ( 0x401E < aExp
) {
4505 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4508 else if ( aExp
< 0x3FFF ) {
4510 status
->float_exception_flags
|= float_flag_inexact
;
4514 shiftCount
= 0x403E - aExp
;
4516 aSig
>>= shiftCount
;
4518 if ( aSign
) z
= - z
;
4519 if ( ( z
< 0 ) ^ aSign
) {
4521 float_raise(float_flag_invalid
, status
);
4522 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4524 if ( ( aSig
<<shiftCount
) != savedASig
) {
4525 status
->float_exception_flags
|= float_flag_inexact
;
4531 /*----------------------------------------------------------------------------
4532 | Returns the result of converting the extended double-precision floating-
4533 | point value `a' to the 64-bit two's complement integer format. The
4534 | conversion is performed according to the IEC/IEEE Standard for Binary
4535 | Floating-Point Arithmetic---which means in particular that the conversion
4536 | is rounded according to the current rounding mode. If `a' is a NaN,
4537 | the largest positive integer is returned. Otherwise, if the conversion
4538 | overflows, the largest integer with the same sign as `a' is returned.
4539 *----------------------------------------------------------------------------*/
4541 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
4544 int32_t aExp
, shiftCount
;
4545 uint64_t aSig
, aSigExtra
;
4547 if (floatx80_invalid_encoding(a
)) {
4548 float_raise(float_flag_invalid
, status
);
4551 aSig
= extractFloatx80Frac( a
);
4552 aExp
= extractFloatx80Exp( a
);
4553 aSign
= extractFloatx80Sign( a
);
4554 shiftCount
= 0x403E - aExp
;
4555 if ( shiftCount
<= 0 ) {
4557 float_raise(float_flag_invalid
, status
);
4558 if (!aSign
|| floatx80_is_any_nan(a
)) {
4559 return LIT64( 0x7FFFFFFFFFFFFFFF );
4561 return (int64_t) LIT64( 0x8000000000000000 );
4566 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
4568 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
4572 /*----------------------------------------------------------------------------
4573 | Returns the result of converting the extended double-precision floating-
4574 | point value `a' to the 64-bit two's complement integer format. The
4575 | conversion is performed according to the IEC/IEEE Standard for Binary
4576 | Floating-Point Arithmetic, except that the conversion is always rounded
4577 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4578 | Otherwise, if the conversion overflows, the largest integer with the same
4579 | sign as `a' is returned.
4580 *----------------------------------------------------------------------------*/
4582 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
4585 int32_t aExp
, shiftCount
;
4589 if (floatx80_invalid_encoding(a
)) {
4590 float_raise(float_flag_invalid
, status
);
4593 aSig
= extractFloatx80Frac( a
);
4594 aExp
= extractFloatx80Exp( a
);
4595 aSign
= extractFloatx80Sign( a
);
4596 shiftCount
= aExp
- 0x403E;
4597 if ( 0 <= shiftCount
) {
4598 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
4599 if ( ( a
.high
!= 0xC03E ) || aSig
) {
4600 float_raise(float_flag_invalid
, status
);
4601 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
4602 return LIT64( 0x7FFFFFFFFFFFFFFF );
4605 return (int64_t) LIT64( 0x8000000000000000 );
4607 else if ( aExp
< 0x3FFF ) {
4609 status
->float_exception_flags
|= float_flag_inexact
;
4613 z
= aSig
>>( - shiftCount
);
4614 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
4615 status
->float_exception_flags
|= float_flag_inexact
;
4617 if ( aSign
) z
= - z
;
4622 /*----------------------------------------------------------------------------
4623 | Returns the result of converting the extended double-precision floating-
4624 | point value `a' to the single-precision floating-point format. The
4625 | conversion is performed according to the IEC/IEEE Standard for Binary
4626 | Floating-Point Arithmetic.
4627 *----------------------------------------------------------------------------*/
4629 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
4635 if (floatx80_invalid_encoding(a
)) {
4636 float_raise(float_flag_invalid
, status
);
4637 return float32_default_nan(status
);
4639 aSig
= extractFloatx80Frac( a
);
4640 aExp
= extractFloatx80Exp( a
);
4641 aSign
= extractFloatx80Sign( a
);
4642 if ( aExp
== 0x7FFF ) {
4643 if ( (uint64_t) ( aSig
<<1 ) ) {
4644 return commonNaNToFloat32(floatx80ToCommonNaN(a
, status
), status
);
4646 return packFloat32( aSign
, 0xFF, 0 );
4648 shift64RightJamming( aSig
, 33, &aSig
);
4649 if ( aExp
|| aSig
) aExp
-= 0x3F81;
4650 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
4654 /*----------------------------------------------------------------------------
4655 | Returns the result of converting the extended double-precision floating-
4656 | point value `a' to the double-precision floating-point format. The
4657 | conversion is performed according to the IEC/IEEE Standard for Binary
4658 | Floating-Point Arithmetic.
4659 *----------------------------------------------------------------------------*/
4661 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
4665 uint64_t aSig
, zSig
;
4667 if (floatx80_invalid_encoding(a
)) {
4668 float_raise(float_flag_invalid
, status
);
4669 return float64_default_nan(status
);
4671 aSig
= extractFloatx80Frac( a
);
4672 aExp
= extractFloatx80Exp( a
);
4673 aSign
= extractFloatx80Sign( a
);
4674 if ( aExp
== 0x7FFF ) {
4675 if ( (uint64_t) ( aSig
<<1 ) ) {
4676 return commonNaNToFloat64(floatx80ToCommonNaN(a
, status
), status
);
4678 return packFloat64( aSign
, 0x7FF, 0 );
4680 shift64RightJamming( aSig
, 1, &zSig
);
4681 if ( aExp
|| aSig
) aExp
-= 0x3C01;
4682 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
4686 /*----------------------------------------------------------------------------
4687 | Returns the result of converting the extended double-precision floating-
4688 | point value `a' to the quadruple-precision floating-point format. The
4689 | conversion is performed according to the IEC/IEEE Standard for Binary
4690 | Floating-Point Arithmetic.
4691 *----------------------------------------------------------------------------*/
4693 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
4697 uint64_t aSig
, zSig0
, zSig1
;
4699 if (floatx80_invalid_encoding(a
)) {
4700 float_raise(float_flag_invalid
, status
);
4701 return float128_default_nan(status
);
4703 aSig
= extractFloatx80Frac( a
);
4704 aExp
= extractFloatx80Exp( a
);
4705 aSign
= extractFloatx80Sign( a
);
4706 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
4707 return commonNaNToFloat128(floatx80ToCommonNaN(a
, status
), status
);
4709 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
4710 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
4714 /*----------------------------------------------------------------------------
4715 | Rounds the extended double-precision floating-point value `a'
4716 | to the precision provided by floatx80_rounding_precision and returns the
4717 | result as an extended double-precision floating-point value.
4718 | The operation is performed according to the IEC/IEEE Standard for Binary
4719 | Floating-Point Arithmetic.
4720 *----------------------------------------------------------------------------*/
4722 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
4724 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
4725 extractFloatx80Sign(a
),
4726 extractFloatx80Exp(a
),
4727 extractFloatx80Frac(a
), 0, status
);
4730 /*----------------------------------------------------------------------------
4731 | Rounds the extended double-precision floating-point value `a' to an integer,
4732 | and returns the result as an extended quadruple-precision floating-point
4733 | value. The operation is performed according to the IEC/IEEE Standard for
4734 | Binary Floating-Point Arithmetic.
4735 *----------------------------------------------------------------------------*/
4737 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
4741 uint64_t lastBitMask
, roundBitsMask
;
4744 if (floatx80_invalid_encoding(a
)) {
4745 float_raise(float_flag_invalid
, status
);
4746 return floatx80_default_nan(status
);
4748 aExp
= extractFloatx80Exp( a
);
4749 if ( 0x403E <= aExp
) {
4750 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
4751 return propagateFloatx80NaN(a
, a
, status
);
4755 if ( aExp
< 0x3FFF ) {
4757 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
4760 status
->float_exception_flags
|= float_flag_inexact
;
4761 aSign
= extractFloatx80Sign( a
);
4762 switch (status
->float_rounding_mode
) {
4763 case float_round_nearest_even
:
4764 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
4767 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
4770 case float_round_ties_away
:
4771 if (aExp
== 0x3FFE) {
4772 return packFloatx80(aSign
, 0x3FFF, LIT64(0x8000000000000000));
4775 case float_round_down
:
4778 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4779 : packFloatx80( 0, 0, 0 );
4780 case float_round_up
:
4782 aSign
? packFloatx80( 1, 0, 0 )
4783 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4785 return packFloatx80( aSign
, 0, 0 );
4788 lastBitMask
<<= 0x403E - aExp
;
4789 roundBitsMask
= lastBitMask
- 1;
4791 switch (status
->float_rounding_mode
) {
4792 case float_round_nearest_even
:
4793 z
.low
+= lastBitMask
>>1;
4794 if ((z
.low
& roundBitsMask
) == 0) {
4795 z
.low
&= ~lastBitMask
;
4798 case float_round_ties_away
:
4799 z
.low
+= lastBitMask
>> 1;
4801 case float_round_to_zero
:
4803 case float_round_up
:
4804 if (!extractFloatx80Sign(z
)) {
4805 z
.low
+= roundBitsMask
;
4808 case float_round_down
:
4809 if (extractFloatx80Sign(z
)) {
4810 z
.low
+= roundBitsMask
;
4816 z
.low
&= ~ roundBitsMask
;
4819 z
.low
= LIT64( 0x8000000000000000 );
4821 if (z
.low
!= a
.low
) {
4822 status
->float_exception_flags
|= float_flag_inexact
;
4828 /*----------------------------------------------------------------------------
4829 | Returns the result of adding the absolute values of the extended double-
4830 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4831 | negated before being returned. `zSign' is ignored if the result is a NaN.
4832 | The addition is performed according to the IEC/IEEE Standard for Binary
4833 | Floating-Point Arithmetic.
4834 *----------------------------------------------------------------------------*/
4836 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
4837 float_status
*status
)
4839 int32_t aExp
, bExp
, zExp
;
4840 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4843 aSig
= extractFloatx80Frac( a
);
4844 aExp
= extractFloatx80Exp( a
);
4845 bSig
= extractFloatx80Frac( b
);
4846 bExp
= extractFloatx80Exp( b
);
4847 expDiff
= aExp
- bExp
;
4848 if ( 0 < expDiff
) {
4849 if ( aExp
== 0x7FFF ) {
4850 if ((uint64_t)(aSig
<< 1)) {
4851 return propagateFloatx80NaN(a
, b
, status
);
4855 if ( bExp
== 0 ) --expDiff
;
4856 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4859 else if ( expDiff
< 0 ) {
4860 if ( bExp
== 0x7FFF ) {
4861 if ((uint64_t)(bSig
<< 1)) {
4862 return propagateFloatx80NaN(a
, b
, status
);
4864 return packFloatx80(zSign
,
4865 floatx80_infinity_high
,
4866 floatx80_infinity_low
);
4868 if ( aExp
== 0 ) ++expDiff
;
4869 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4873 if ( aExp
== 0x7FFF ) {
4874 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4875 return propagateFloatx80NaN(a
, b
, status
);
4880 zSig0
= aSig
+ bSig
;
4882 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
4888 zSig0
= aSig
+ bSig
;
4889 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
4891 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4892 zSig0
|= LIT64( 0x8000000000000000 );
4895 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
4896 zSign
, zExp
, zSig0
, zSig1
, status
);
4899 /*----------------------------------------------------------------------------
4900 | Returns the result of subtracting the absolute values of the extended
4901 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4902 | difference is negated before being returned. `zSign' is ignored if the
4903 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4904 | Standard for Binary Floating-Point Arithmetic.
4905 *----------------------------------------------------------------------------*/
4907 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
4908 float_status
*status
)
4910 int32_t aExp
, bExp
, zExp
;
4911 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4914 aSig
= extractFloatx80Frac( a
);
4915 aExp
= extractFloatx80Exp( a
);
4916 bSig
= extractFloatx80Frac( b
);
4917 bExp
= extractFloatx80Exp( b
);
4918 expDiff
= aExp
- bExp
;
4919 if ( 0 < expDiff
) goto aExpBigger
;
4920 if ( expDiff
< 0 ) goto bExpBigger
;
4921 if ( aExp
== 0x7FFF ) {
4922 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4923 return propagateFloatx80NaN(a
, b
, status
);
4925 float_raise(float_flag_invalid
, status
);
4926 return floatx80_default_nan(status
);
4933 if ( bSig
< aSig
) goto aBigger
;
4934 if ( aSig
< bSig
) goto bBigger
;
4935 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
4937 if ( bExp
== 0x7FFF ) {
4938 if ((uint64_t)(bSig
<< 1)) {
4939 return propagateFloatx80NaN(a
, b
, status
);
4941 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
4942 floatx80_infinity_low
);
4944 if ( aExp
== 0 ) ++expDiff
;
4945 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4947 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
4950 goto normalizeRoundAndPack
;
4952 if ( aExp
== 0x7FFF ) {
4953 if ((uint64_t)(aSig
<< 1)) {
4954 return propagateFloatx80NaN(a
, b
, status
);
4958 if ( bExp
== 0 ) --expDiff
;
4959 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4961 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
4963 normalizeRoundAndPack
:
4964 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
4965 zSign
, zExp
, zSig0
, zSig1
, status
);
4968 /*----------------------------------------------------------------------------
4969 | Returns the result of adding the extended double-precision floating-point
4970 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4971 | Standard for Binary Floating-Point Arithmetic.
4972 *----------------------------------------------------------------------------*/
4974 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
4978 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
4979 float_raise(float_flag_invalid
, status
);
4980 return floatx80_default_nan(status
);
4982 aSign
= extractFloatx80Sign( a
);
4983 bSign
= extractFloatx80Sign( b
);
4984 if ( aSign
== bSign
) {
4985 return addFloatx80Sigs(a
, b
, aSign
, status
);
4988 return subFloatx80Sigs(a
, b
, aSign
, status
);
4993 /*----------------------------------------------------------------------------
4994 | Returns the result of subtracting the extended double-precision floating-
4995 | point values `a' and `b'. The operation is performed according to the
4996 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4997 *----------------------------------------------------------------------------*/
4999 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5003 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5004 float_raise(float_flag_invalid
, status
);
5005 return floatx80_default_nan(status
);
5007 aSign
= extractFloatx80Sign( a
);
5008 bSign
= extractFloatx80Sign( b
);
5009 if ( aSign
== bSign
) {
5010 return subFloatx80Sigs(a
, b
, aSign
, status
);
5013 return addFloatx80Sigs(a
, b
, aSign
, status
);
5018 /*----------------------------------------------------------------------------
5019 | Returns the result of multiplying the extended double-precision floating-
5020 | point values `a' and `b'. The operation is performed according to the
5021 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5022 *----------------------------------------------------------------------------*/
5024 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5026 flag aSign
, bSign
, zSign
;
5027 int32_t aExp
, bExp
, zExp
;
5028 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5030 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5031 float_raise(float_flag_invalid
, status
);
5032 return floatx80_default_nan(status
);
5034 aSig
= extractFloatx80Frac( a
);
5035 aExp
= extractFloatx80Exp( a
);
5036 aSign
= extractFloatx80Sign( a
);
5037 bSig
= extractFloatx80Frac( b
);
5038 bExp
= extractFloatx80Exp( b
);
5039 bSign
= extractFloatx80Sign( b
);
5040 zSign
= aSign
^ bSign
;
5041 if ( aExp
== 0x7FFF ) {
5042 if ( (uint64_t) ( aSig
<<1 )
5043 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5044 return propagateFloatx80NaN(a
, b
, status
);
5046 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5047 return packFloatx80(zSign
, floatx80_infinity_high
,
5048 floatx80_infinity_low
);
5050 if ( bExp
== 0x7FFF ) {
5051 if ((uint64_t)(bSig
<< 1)) {
5052 return propagateFloatx80NaN(a
, b
, status
);
5054 if ( ( aExp
| aSig
) == 0 ) {
5056 float_raise(float_flag_invalid
, status
);
5057 return floatx80_default_nan(status
);
5059 return packFloatx80(zSign
, floatx80_infinity_high
,
5060 floatx80_infinity_low
);
5063 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5064 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5067 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5068 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5070 zExp
= aExp
+ bExp
- 0x3FFE;
5071 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5072 if ( 0 < (int64_t) zSig0
) {
5073 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5076 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5077 zSign
, zExp
, zSig0
, zSig1
, status
);
5080 /*----------------------------------------------------------------------------
5081 | Returns the result of dividing the extended double-precision floating-point
5082 | value `a' by the corresponding value `b'. The operation is performed
5083 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5084 *----------------------------------------------------------------------------*/
5086 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
5088 flag aSign
, bSign
, zSign
;
5089 int32_t aExp
, bExp
, zExp
;
5090 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5091 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
5093 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5094 float_raise(float_flag_invalid
, status
);
5095 return floatx80_default_nan(status
);
5097 aSig
= extractFloatx80Frac( a
);
5098 aExp
= extractFloatx80Exp( a
);
5099 aSign
= extractFloatx80Sign( a
);
5100 bSig
= extractFloatx80Frac( b
);
5101 bExp
= extractFloatx80Exp( b
);
5102 bSign
= extractFloatx80Sign( b
);
5103 zSign
= aSign
^ bSign
;
5104 if ( aExp
== 0x7FFF ) {
5105 if ((uint64_t)(aSig
<< 1)) {
5106 return propagateFloatx80NaN(a
, b
, status
);
5108 if ( bExp
== 0x7FFF ) {
5109 if ((uint64_t)(bSig
<< 1)) {
5110 return propagateFloatx80NaN(a
, b
, status
);
5114 return packFloatx80(zSign
, floatx80_infinity_high
,
5115 floatx80_infinity_low
);
5117 if ( bExp
== 0x7FFF ) {
5118 if ((uint64_t)(bSig
<< 1)) {
5119 return propagateFloatx80NaN(a
, b
, status
);
5121 return packFloatx80( zSign
, 0, 0 );
5125 if ( ( aExp
| aSig
) == 0 ) {
5127 float_raise(float_flag_invalid
, status
);
5128 return floatx80_default_nan(status
);
5130 float_raise(float_flag_divbyzero
, status
);
5131 return packFloatx80(zSign
, floatx80_infinity_high
,
5132 floatx80_infinity_low
);
5134 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5137 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5138 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5140 zExp
= aExp
- bExp
+ 0x3FFE;
5142 if ( bSig
<= aSig
) {
5143 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
5146 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
5147 mul64To128( bSig
, zSig0
, &term0
, &term1
);
5148 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
5149 while ( (int64_t) rem0
< 0 ) {
5151 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
5153 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
5154 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
5155 mul64To128( bSig
, zSig1
, &term1
, &term2
);
5156 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5157 while ( (int64_t) rem1
< 0 ) {
5159 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
5161 zSig1
|= ( ( rem1
| rem2
) != 0 );
5163 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5164 zSign
, zExp
, zSig0
, zSig1
, status
);
5167 /*----------------------------------------------------------------------------
5168 | Returns the remainder of the extended double-precision floating-point value
5169 | `a' with respect to the corresponding value `b'. The operation is performed
5170 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5173 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
5176 int32_t aExp
, bExp
, expDiff
;
5177 uint64_t aSig0
, aSig1
, bSig
;
5178 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
5180 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5181 float_raise(float_flag_invalid
, status
);
5182 return floatx80_default_nan(status
);
5184 aSig0
= extractFloatx80Frac( a
);
5185 aExp
= extractFloatx80Exp( a
);
5186 aSign
= extractFloatx80Sign( a
);
5187 bSig
= extractFloatx80Frac( b
);
5188 bExp
= extractFloatx80Exp( b
);
5189 if ( aExp
== 0x7FFF ) {
5190 if ( (uint64_t) ( aSig0
<<1 )
5191 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5192 return propagateFloatx80NaN(a
, b
, status
);
5196 if ( bExp
== 0x7FFF ) {
5197 if ((uint64_t)(bSig
<< 1)) {
5198 return propagateFloatx80NaN(a
, b
, status
);
5205 float_raise(float_flag_invalid
, status
);
5206 return floatx80_default_nan(status
);
5208 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5211 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
5212 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5214 bSig
|= LIT64( 0x8000000000000000 );
5216 expDiff
= aExp
- bExp
;
5218 if ( expDiff
< 0 ) {
5219 if ( expDiff
< -1 ) return a
;
5220 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
5223 q
= ( bSig
<= aSig0
);
5224 if ( q
) aSig0
-= bSig
;
5226 while ( 0 < expDiff
) {
5227 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5228 q
= ( 2 < q
) ? q
- 2 : 0;
5229 mul64To128( bSig
, q
, &term0
, &term1
);
5230 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5231 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
5235 if ( 0 < expDiff
) {
5236 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5237 q
= ( 2 < q
) ? q
- 2 : 0;
5239 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
5240 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5241 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
5242 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
5244 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5251 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
5252 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5253 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5256 aSig0
= alternateASig0
;
5257 aSig1
= alternateASig1
;
5261 normalizeRoundAndPackFloatx80(
5262 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
5266 /*----------------------------------------------------------------------------
5267 | Returns the square root of the extended double-precision floating-point
5268 | value `a'. The operation is performed according to the IEC/IEEE Standard
5269 | for Binary Floating-Point Arithmetic.
5270 *----------------------------------------------------------------------------*/
5272 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
5276 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
5277 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5279 if (floatx80_invalid_encoding(a
)) {
5280 float_raise(float_flag_invalid
, status
);
5281 return floatx80_default_nan(status
);
5283 aSig0
= extractFloatx80Frac( a
);
5284 aExp
= extractFloatx80Exp( a
);
5285 aSign
= extractFloatx80Sign( a
);
5286 if ( aExp
== 0x7FFF ) {
5287 if ((uint64_t)(aSig0
<< 1)) {
5288 return propagateFloatx80NaN(a
, a
, status
);
5290 if ( ! aSign
) return a
;
5294 if ( ( aExp
| aSig0
) == 0 ) return a
;
5296 float_raise(float_flag_invalid
, status
);
5297 return floatx80_default_nan(status
);
5300 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
5301 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5303 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
5304 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
5305 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
5306 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5307 doubleZSig0
= zSig0
<<1;
5308 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5309 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5310 while ( (int64_t) rem0
< 0 ) {
5313 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5315 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5316 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5317 if ( zSig1
== 0 ) zSig1
= 1;
5318 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5319 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5320 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5321 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5322 while ( (int64_t) rem1
< 0 ) {
5324 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5326 term2
|= doubleZSig0
;
5327 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5329 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5331 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
5332 zSig0
|= doubleZSig0
;
5333 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5334 0, zExp
, zSig0
, zSig1
, status
);
5337 /*----------------------------------------------------------------------------
5338 | Returns 1 if the extended double-precision floating-point value `a' is equal
5339 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5340 | raised if either operand is a NaN. Otherwise, the comparison is performed
5341 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5342 *----------------------------------------------------------------------------*/
5344 int floatx80_eq(floatx80 a
, floatx80 b
, float_status
*status
)
5347 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5348 || (extractFloatx80Exp(a
) == 0x7FFF
5349 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5350 || (extractFloatx80Exp(b
) == 0x7FFF
5351 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5353 float_raise(float_flag_invalid
, status
);
5358 && ( ( a
.high
== b
.high
)
5360 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5365 /*----------------------------------------------------------------------------
5366 | Returns 1 if the extended double-precision floating-point value `a' is
5367 | less than or equal to the corresponding value `b', and 0 otherwise. The
5368 | invalid exception is raised if either operand is a NaN. The comparison is
5369 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5371 *----------------------------------------------------------------------------*/
5373 int floatx80_le(floatx80 a
, floatx80 b
, float_status
*status
)
5377 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5378 || (extractFloatx80Exp(a
) == 0x7FFF
5379 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5380 || (extractFloatx80Exp(b
) == 0x7FFF
5381 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5383 float_raise(float_flag_invalid
, status
);
5386 aSign
= extractFloatx80Sign( a
);
5387 bSign
= extractFloatx80Sign( b
);
5388 if ( aSign
!= bSign
) {
5391 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5395 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5396 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5400 /*----------------------------------------------------------------------------
5401 | Returns 1 if the extended double-precision floating-point value `a' is
5402 | less than the corresponding value `b', and 0 otherwise. The invalid
5403 | exception is raised if either operand is a NaN. The comparison is performed
5404 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5405 *----------------------------------------------------------------------------*/
5407 int floatx80_lt(floatx80 a
, floatx80 b
, float_status
*status
)
5411 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5412 || (extractFloatx80Exp(a
) == 0x7FFF
5413 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5414 || (extractFloatx80Exp(b
) == 0x7FFF
5415 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5417 float_raise(float_flag_invalid
, status
);
5420 aSign
= extractFloatx80Sign( a
);
5421 bSign
= extractFloatx80Sign( b
);
5422 if ( aSign
!= bSign
) {
5425 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5429 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5430 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5434 /*----------------------------------------------------------------------------
5435 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5436 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5437 | either operand is a NaN. The comparison is performed according to the
5438 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5439 *----------------------------------------------------------------------------*/
5440 int floatx80_unordered(floatx80 a
, floatx80 b
, float_status
*status
)
5442 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5443 || (extractFloatx80Exp(a
) == 0x7FFF
5444 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5445 || (extractFloatx80Exp(b
) == 0x7FFF
5446 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5448 float_raise(float_flag_invalid
, status
);
5454 /*----------------------------------------------------------------------------
5455 | Returns 1 if the extended double-precision floating-point value `a' is
5456 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5457 | cause an exception. The comparison is performed according to the IEC/IEEE
5458 | Standard for Binary Floating-Point Arithmetic.
5459 *----------------------------------------------------------------------------*/
5461 int floatx80_eq_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5464 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5465 float_raise(float_flag_invalid
, status
);
5468 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5469 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5470 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5471 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5473 if (floatx80_is_signaling_nan(a
, status
)
5474 || floatx80_is_signaling_nan(b
, status
)) {
5475 float_raise(float_flag_invalid
, status
);
5481 && ( ( a
.high
== b
.high
)
5483 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5488 /*----------------------------------------------------------------------------
5489 | Returns 1 if the extended double-precision floating-point value `a' is less
5490 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5491 | do not cause an exception. Otherwise, the comparison is performed according
5492 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5493 *----------------------------------------------------------------------------*/
5495 int floatx80_le_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5499 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5500 float_raise(float_flag_invalid
, status
);
5503 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5504 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5505 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5506 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5508 if (floatx80_is_signaling_nan(a
, status
)
5509 || floatx80_is_signaling_nan(b
, status
)) {
5510 float_raise(float_flag_invalid
, status
);
5514 aSign
= extractFloatx80Sign( a
);
5515 bSign
= extractFloatx80Sign( b
);
5516 if ( aSign
!= bSign
) {
5519 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5523 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5524 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5528 /*----------------------------------------------------------------------------
5529 | Returns 1 if the extended double-precision floating-point value `a' is less
5530 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5531 | an exception. Otherwise, the comparison is performed according to the
5532 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5533 *----------------------------------------------------------------------------*/
5535 int floatx80_lt_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5539 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5540 float_raise(float_flag_invalid
, status
);
5543 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5544 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5545 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5546 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5548 if (floatx80_is_signaling_nan(a
, status
)
5549 || floatx80_is_signaling_nan(b
, status
)) {
5550 float_raise(float_flag_invalid
, status
);
5554 aSign
= extractFloatx80Sign( a
);
5555 bSign
= extractFloatx80Sign( b
);
5556 if ( aSign
!= bSign
) {
5559 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5563 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5564 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5568 /*----------------------------------------------------------------------------
5569 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5570 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5571 | The comparison is performed according to the IEC/IEEE Standard for Binary
5572 | Floating-Point Arithmetic.
5573 *----------------------------------------------------------------------------*/
5574 int floatx80_unordered_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5576 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5577 float_raise(float_flag_invalid
, status
);
5580 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5581 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5582 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5583 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5585 if (floatx80_is_signaling_nan(a
, status
)
5586 || floatx80_is_signaling_nan(b
, status
)) {
5587 float_raise(float_flag_invalid
, status
);
5594 /*----------------------------------------------------------------------------
5595 | Returns the result of converting the quadruple-precision floating-point
5596 | value `a' to the 32-bit two's complement integer format. The conversion
5597 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5598 | Arithmetic---which means in particular that the conversion is rounded
5599 | according to the current rounding mode. If `a' is a NaN, the largest
5600 | positive integer is returned. Otherwise, if the conversion overflows, the
5601 | largest integer with the same sign as `a' is returned.
5602 *----------------------------------------------------------------------------*/
5604 int32_t float128_to_int32(float128 a
, float_status
*status
)
5607 int32_t aExp
, shiftCount
;
5608 uint64_t aSig0
, aSig1
;
5610 aSig1
= extractFloat128Frac1( a
);
5611 aSig0
= extractFloat128Frac0( a
);
5612 aExp
= extractFloat128Exp( a
);
5613 aSign
= extractFloat128Sign( a
);
5614 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
5615 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5616 aSig0
|= ( aSig1
!= 0 );
5617 shiftCount
= 0x4028 - aExp
;
5618 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
5619 return roundAndPackInt32(aSign
, aSig0
, status
);
5623 /*----------------------------------------------------------------------------
5624 | Returns the result of converting the quadruple-precision floating-point
5625 | value `a' to the 32-bit two's complement integer format. The conversion
5626 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5627 | Arithmetic, except that the conversion is always rounded toward zero. If
5628 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5629 | conversion overflows, the largest integer with the same sign as `a' is
5631 *----------------------------------------------------------------------------*/
5633 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
5636 int32_t aExp
, shiftCount
;
5637 uint64_t aSig0
, aSig1
, savedASig
;
5640 aSig1
= extractFloat128Frac1( a
);
5641 aSig0
= extractFloat128Frac0( a
);
5642 aExp
= extractFloat128Exp( a
);
5643 aSign
= extractFloat128Sign( a
);
5644 aSig0
|= ( aSig1
!= 0 );
5645 if ( 0x401E < aExp
) {
5646 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
5649 else if ( aExp
< 0x3FFF ) {
5650 if (aExp
|| aSig0
) {
5651 status
->float_exception_flags
|= float_flag_inexact
;
5655 aSig0
|= LIT64( 0x0001000000000000 );
5656 shiftCount
= 0x402F - aExp
;
5658 aSig0
>>= shiftCount
;
5660 if ( aSign
) z
= - z
;
5661 if ( ( z
< 0 ) ^ aSign
) {
5663 float_raise(float_flag_invalid
, status
);
5664 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5666 if ( ( aSig0
<<shiftCount
) != savedASig
) {
5667 status
->float_exception_flags
|= float_flag_inexact
;
5673 /*----------------------------------------------------------------------------
5674 | Returns the result of converting the quadruple-precision floating-point
5675 | value `a' to the 64-bit two's complement integer format. The conversion
5676 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5677 | Arithmetic---which means in particular that the conversion is rounded
5678 | according to the current rounding mode. If `a' is a NaN, the largest
5679 | positive integer is returned. Otherwise, if the conversion overflows, the
5680 | largest integer with the same sign as `a' is returned.
5681 *----------------------------------------------------------------------------*/
5683 int64_t float128_to_int64(float128 a
, float_status
*status
)
5686 int32_t aExp
, shiftCount
;
5687 uint64_t aSig0
, aSig1
;
5689 aSig1
= extractFloat128Frac1( a
);
5690 aSig0
= extractFloat128Frac0( a
);
5691 aExp
= extractFloat128Exp( a
);
5692 aSign
= extractFloat128Sign( a
);
5693 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5694 shiftCount
= 0x402F - aExp
;
5695 if ( shiftCount
<= 0 ) {
5696 if ( 0x403E < aExp
) {
5697 float_raise(float_flag_invalid
, status
);
5699 || ( ( aExp
== 0x7FFF )
5700 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
5703 return LIT64( 0x7FFFFFFFFFFFFFFF );
5705 return (int64_t) LIT64( 0x8000000000000000 );
5707 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
5710 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5712 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
5716 /*----------------------------------------------------------------------------
5717 | Returns the result of converting the quadruple-precision floating-point
5718 | value `a' to the 64-bit two's complement integer format. The conversion
5719 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5720 | Arithmetic, except that the conversion is always rounded toward zero.
5721 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5722 | the conversion overflows, the largest integer with the same sign as `a' is
5724 *----------------------------------------------------------------------------*/
5726 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
5729 int32_t aExp
, shiftCount
;
5730 uint64_t aSig0
, aSig1
;
5733 aSig1
= extractFloat128Frac1( a
);
5734 aSig0
= extractFloat128Frac0( a
);
5735 aExp
= extractFloat128Exp( a
);
5736 aSign
= extractFloat128Sign( a
);
5737 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5738 shiftCount
= aExp
- 0x402F;
5739 if ( 0 < shiftCount
) {
5740 if ( 0x403E <= aExp
) {
5741 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
5742 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
5743 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
5745 status
->float_exception_flags
|= float_flag_inexact
;
5749 float_raise(float_flag_invalid
, status
);
5750 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
5751 return LIT64( 0x7FFFFFFFFFFFFFFF );
5754 return (int64_t) LIT64( 0x8000000000000000 );
5756 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
5757 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
5758 status
->float_exception_flags
|= float_flag_inexact
;
5762 if ( aExp
< 0x3FFF ) {
5763 if ( aExp
| aSig0
| aSig1
) {
5764 status
->float_exception_flags
|= float_flag_inexact
;
5768 z
= aSig0
>>( - shiftCount
);
5770 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
5771 status
->float_exception_flags
|= float_flag_inexact
;
5774 if ( aSign
) z
= - z
;
5779 /*----------------------------------------------------------------------------
5780 | Returns the result of converting the quadruple-precision floating-point value
5781 | `a' to the 64-bit unsigned integer format. The conversion is
5782 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5783 | Arithmetic---which means in particular that the conversion is rounded
5784 | according to the current rounding mode. If `a' is a NaN, the largest
5785 | positive integer is returned. If the conversion overflows, the
5786 | largest unsigned integer is returned. If 'a' is negative, the value is
5787 | rounded and zero is returned; negative values that do not round to zero
5788 | will raise the inexact exception.
5789 *----------------------------------------------------------------------------*/
5791 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
5796 uint64_t aSig0
, aSig1
;
5798 aSig0
= extractFloat128Frac0(a
);
5799 aSig1
= extractFloat128Frac1(a
);
5800 aExp
= extractFloat128Exp(a
);
5801 aSign
= extractFloat128Sign(a
);
5802 if (aSign
&& (aExp
> 0x3FFE)) {
5803 float_raise(float_flag_invalid
, status
);
5804 if (float128_is_any_nan(a
)) {
5805 return LIT64(0xFFFFFFFFFFFFFFFF);
5811 aSig0
|= LIT64(0x0001000000000000);
5813 shiftCount
= 0x402F - aExp
;
5814 if (shiftCount
<= 0) {
5815 if (0x403E < aExp
) {
5816 float_raise(float_flag_invalid
, status
);
5817 return LIT64(0xFFFFFFFFFFFFFFFF);
5819 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
5821 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5823 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
5826 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
5829 signed char current_rounding_mode
= status
->float_rounding_mode
;
5831 set_float_rounding_mode(float_round_to_zero
, status
);
5832 v
= float128_to_uint64(a
, status
);
5833 set_float_rounding_mode(current_rounding_mode
, status
);
5838 /*----------------------------------------------------------------------------
5839 | Returns the result of converting the quadruple-precision floating-point
5840 | value `a' to the 32-bit unsigned integer format. The conversion
5841 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5842 | Arithmetic except that the conversion is always rounded toward zero.
5843 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5844 | if the conversion overflows, the largest unsigned integer is returned.
5845 | If 'a' is negative, the value is rounded and zero is returned; negative
5846 | values that do not round to zero will raise the inexact exception.
5847 *----------------------------------------------------------------------------*/
5849 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
5853 int old_exc_flags
= get_float_exception_flags(status
);
5855 v
= float128_to_uint64_round_to_zero(a
, status
);
5856 if (v
> 0xffffffff) {
5861 set_float_exception_flags(old_exc_flags
, status
);
5862 float_raise(float_flag_invalid
, status
);
5866 /*----------------------------------------------------------------------------
5867 | Returns the result of converting the quadruple-precision floating-point
5868 | value `a' to the single-precision floating-point format. The conversion
5869 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5871 *----------------------------------------------------------------------------*/
5873 float32
float128_to_float32(float128 a
, float_status
*status
)
5877 uint64_t aSig0
, aSig1
;
5880 aSig1
= extractFloat128Frac1( a
);
5881 aSig0
= extractFloat128Frac0( a
);
5882 aExp
= extractFloat128Exp( a
);
5883 aSign
= extractFloat128Sign( a
);
5884 if ( aExp
== 0x7FFF ) {
5885 if ( aSig0
| aSig1
) {
5886 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
5888 return packFloat32( aSign
, 0xFF, 0 );
5890 aSig0
|= ( aSig1
!= 0 );
5891 shift64RightJamming( aSig0
, 18, &aSig0
);
5893 if ( aExp
|| zSig
) {
5897 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
5901 /*----------------------------------------------------------------------------
5902 | Returns the result of converting the quadruple-precision floating-point
5903 | value `a' to the double-precision floating-point format. The conversion
5904 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5906 *----------------------------------------------------------------------------*/
5908 float64
float128_to_float64(float128 a
, float_status
*status
)
5912 uint64_t aSig0
, aSig1
;
5914 aSig1
= extractFloat128Frac1( a
);
5915 aSig0
= extractFloat128Frac0( a
);
5916 aExp
= extractFloat128Exp( a
);
5917 aSign
= extractFloat128Sign( a
);
5918 if ( aExp
== 0x7FFF ) {
5919 if ( aSig0
| aSig1
) {
5920 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
5922 return packFloat64( aSign
, 0x7FF, 0 );
5924 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5925 aSig0
|= ( aSig1
!= 0 );
5926 if ( aExp
|| aSig0
) {
5927 aSig0
|= LIT64( 0x4000000000000000 );
5930 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
5934 /*----------------------------------------------------------------------------
5935 | Returns the result of converting the quadruple-precision floating-point
5936 | value `a' to the extended double-precision floating-point format. The
5937 | conversion is performed according to the IEC/IEEE Standard for Binary
5938 | Floating-Point Arithmetic.
5939 *----------------------------------------------------------------------------*/
5941 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
5945 uint64_t aSig0
, aSig1
;
5947 aSig1
= extractFloat128Frac1( a
);
5948 aSig0
= extractFloat128Frac0( a
);
5949 aExp
= extractFloat128Exp( a
);
5950 aSign
= extractFloat128Sign( a
);
5951 if ( aExp
== 0x7FFF ) {
5952 if ( aSig0
| aSig1
) {
5953 return commonNaNToFloatx80(float128ToCommonNaN(a
, status
), status
);
5955 return packFloatx80(aSign
, floatx80_infinity_high
,
5956 floatx80_infinity_low
);
5959 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
5960 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5963 aSig0
|= LIT64( 0x0001000000000000 );
5965 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
5966 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
5970 /*----------------------------------------------------------------------------
5971 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5972 | returns the result as a quadruple-precision floating-point value. The
5973 | operation is performed according to the IEC/IEEE Standard for Binary
5974 | Floating-Point Arithmetic.
5975 *----------------------------------------------------------------------------*/
5977 float128
float128_round_to_int(float128 a
, float_status
*status
)
5981 uint64_t lastBitMask
, roundBitsMask
;
5984 aExp
= extractFloat128Exp( a
);
5985 if ( 0x402F <= aExp
) {
5986 if ( 0x406F <= aExp
) {
5987 if ( ( aExp
== 0x7FFF )
5988 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
5990 return propagateFloat128NaN(a
, a
, status
);
5995 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
5996 roundBitsMask
= lastBitMask
- 1;
5998 switch (status
->float_rounding_mode
) {
5999 case float_round_nearest_even
:
6000 if ( lastBitMask
) {
6001 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6002 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6005 if ( (int64_t) z
.low
< 0 ) {
6007 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6011 case float_round_ties_away
:
6013 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6015 if ((int64_t) z
.low
< 0) {
6020 case float_round_to_zero
:
6022 case float_round_up
:
6023 if (!extractFloat128Sign(z
)) {
6024 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6027 case float_round_down
:
6028 if (extractFloat128Sign(z
)) {
6029 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6035 z
.low
&= ~ roundBitsMask
;
6038 if ( aExp
< 0x3FFF ) {
6039 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6040 status
->float_exception_flags
|= float_flag_inexact
;
6041 aSign
= extractFloat128Sign( a
);
6042 switch (status
->float_rounding_mode
) {
6043 case float_round_nearest_even
:
6044 if ( ( aExp
== 0x3FFE )
6045 && ( extractFloat128Frac0( a
)
6046 | extractFloat128Frac1( a
) )
6048 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6051 case float_round_ties_away
:
6052 if (aExp
== 0x3FFE) {
6053 return packFloat128(aSign
, 0x3FFF, 0, 0);
6056 case float_round_down
:
6058 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6059 : packFloat128( 0, 0, 0, 0 );
6060 case float_round_up
:
6062 aSign
? packFloat128( 1, 0, 0, 0 )
6063 : packFloat128( 0, 0x3FFF, 0, 0 );
6065 return packFloat128( aSign
, 0, 0, 0 );
6068 lastBitMask
<<= 0x402F - aExp
;
6069 roundBitsMask
= lastBitMask
- 1;
6072 switch (status
->float_rounding_mode
) {
6073 case float_round_nearest_even
:
6074 z
.high
+= lastBitMask
>>1;
6075 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6076 z
.high
&= ~ lastBitMask
;
6079 case float_round_ties_away
:
6080 z
.high
+= lastBitMask
>>1;
6082 case float_round_to_zero
:
6084 case float_round_up
:
6085 if (!extractFloat128Sign(z
)) {
6086 z
.high
|= ( a
.low
!= 0 );
6087 z
.high
+= roundBitsMask
;
6090 case float_round_down
:
6091 if (extractFloat128Sign(z
)) {
6092 z
.high
|= (a
.low
!= 0);
6093 z
.high
+= roundBitsMask
;
6099 z
.high
&= ~ roundBitsMask
;
6101 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6102 status
->float_exception_flags
|= float_flag_inexact
;
6108 /*----------------------------------------------------------------------------
6109 | Returns the result of adding the absolute values of the quadruple-precision
6110 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6111 | before being returned. `zSign' is ignored if the result is a NaN.
6112 | The addition is performed according to the IEC/IEEE Standard for Binary
6113 | Floating-Point Arithmetic.
6114 *----------------------------------------------------------------------------*/
6116 static float128
addFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6117 float_status
*status
)
6119 int32_t aExp
, bExp
, zExp
;
6120 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6123 aSig1
= extractFloat128Frac1( a
);
6124 aSig0
= extractFloat128Frac0( a
);
6125 aExp
= extractFloat128Exp( a
);
6126 bSig1
= extractFloat128Frac1( b
);
6127 bSig0
= extractFloat128Frac0( b
);
6128 bExp
= extractFloat128Exp( b
);
6129 expDiff
= aExp
- bExp
;
6130 if ( 0 < expDiff
) {
6131 if ( aExp
== 0x7FFF ) {
6132 if (aSig0
| aSig1
) {
6133 return propagateFloat128NaN(a
, b
, status
);
6141 bSig0
|= LIT64( 0x0001000000000000 );
6143 shift128ExtraRightJamming(
6144 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6147 else if ( expDiff
< 0 ) {
6148 if ( bExp
== 0x7FFF ) {
6149 if (bSig0
| bSig1
) {
6150 return propagateFloat128NaN(a
, b
, status
);
6152 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6158 aSig0
|= LIT64( 0x0001000000000000 );
6160 shift128ExtraRightJamming(
6161 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6165 if ( aExp
== 0x7FFF ) {
6166 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6167 return propagateFloat128NaN(a
, b
, status
);
6171 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6173 if (status
->flush_to_zero
) {
6174 if (zSig0
| zSig1
) {
6175 float_raise(float_flag_output_denormal
, status
);
6177 return packFloat128(zSign
, 0, 0, 0);
6179 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6182 zSig0
|= LIT64( 0x0002000000000000 );
6186 aSig0
|= LIT64( 0x0001000000000000 );
6187 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6189 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
6192 shift128ExtraRightJamming(
6193 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6195 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6199 /*----------------------------------------------------------------------------
6200 | Returns the result of subtracting the absolute values of the quadruple-
6201 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6202 | difference is negated before being returned. `zSign' is ignored if the
6203 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6204 | Standard for Binary Floating-Point Arithmetic.
6205 *----------------------------------------------------------------------------*/
6207 static float128
subFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6208 float_status
*status
)
6210 int32_t aExp
, bExp
, zExp
;
6211 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6214 aSig1
= extractFloat128Frac1( a
);
6215 aSig0
= extractFloat128Frac0( a
);
6216 aExp
= extractFloat128Exp( a
);
6217 bSig1
= extractFloat128Frac1( b
);
6218 bSig0
= extractFloat128Frac0( b
);
6219 bExp
= extractFloat128Exp( b
);
6220 expDiff
= aExp
- bExp
;
6221 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6222 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6223 if ( 0 < expDiff
) goto aExpBigger
;
6224 if ( expDiff
< 0 ) goto bExpBigger
;
6225 if ( aExp
== 0x7FFF ) {
6226 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6227 return propagateFloat128NaN(a
, b
, status
);
6229 float_raise(float_flag_invalid
, status
);
6230 return float128_default_nan(status
);
6236 if ( bSig0
< aSig0
) goto aBigger
;
6237 if ( aSig0
< bSig0
) goto bBigger
;
6238 if ( bSig1
< aSig1
) goto aBigger
;
6239 if ( aSig1
< bSig1
) goto bBigger
;
6240 return packFloat128(status
->float_rounding_mode
== float_round_down
,
6243 if ( bExp
== 0x7FFF ) {
6244 if (bSig0
| bSig1
) {
6245 return propagateFloat128NaN(a
, b
, status
);
6247 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
6253 aSig0
|= LIT64( 0x4000000000000000 );
6255 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6256 bSig0
|= LIT64( 0x4000000000000000 );
6258 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6261 goto normalizeRoundAndPack
;
6263 if ( aExp
== 0x7FFF ) {
6264 if (aSig0
| aSig1
) {
6265 return propagateFloat128NaN(a
, b
, status
);
6273 bSig0
|= LIT64( 0x4000000000000000 );
6275 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
6276 aSig0
|= LIT64( 0x4000000000000000 );
6278 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6280 normalizeRoundAndPack
:
6282 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
6287 /*----------------------------------------------------------------------------
6288 | Returns the result of adding the quadruple-precision floating-point values
6289 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6290 | for Binary Floating-Point Arithmetic.
6291 *----------------------------------------------------------------------------*/
6293 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
6297 aSign
= extractFloat128Sign( a
);
6298 bSign
= extractFloat128Sign( b
);
6299 if ( aSign
== bSign
) {
6300 return addFloat128Sigs(a
, b
, aSign
, status
);
6303 return subFloat128Sigs(a
, b
, aSign
, status
);
6308 /*----------------------------------------------------------------------------
6309 | Returns the result of subtracting the quadruple-precision floating-point
6310 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6311 | Standard for Binary Floating-Point Arithmetic.
6312 *----------------------------------------------------------------------------*/
6314 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
6318 aSign
= extractFloat128Sign( a
);
6319 bSign
= extractFloat128Sign( b
);
6320 if ( aSign
== bSign
) {
6321 return subFloat128Sigs(a
, b
, aSign
, status
);
6324 return addFloat128Sigs(a
, b
, aSign
, status
);
6329 /*----------------------------------------------------------------------------
6330 | Returns the result of multiplying the quadruple-precision floating-point
6331 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6332 | Standard for Binary Floating-Point Arithmetic.
6333 *----------------------------------------------------------------------------*/
6335 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
6337 flag aSign
, bSign
, zSign
;
6338 int32_t aExp
, bExp
, zExp
;
6339 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
6341 aSig1
= extractFloat128Frac1( a
);
6342 aSig0
= extractFloat128Frac0( a
);
6343 aExp
= extractFloat128Exp( a
);
6344 aSign
= extractFloat128Sign( a
);
6345 bSig1
= extractFloat128Frac1( b
);
6346 bSig0
= extractFloat128Frac0( b
);
6347 bExp
= extractFloat128Exp( b
);
6348 bSign
= extractFloat128Sign( b
);
6349 zSign
= aSign
^ bSign
;
6350 if ( aExp
== 0x7FFF ) {
6351 if ( ( aSig0
| aSig1
)
6352 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6353 return propagateFloat128NaN(a
, b
, status
);
6355 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
6356 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6358 if ( bExp
== 0x7FFF ) {
6359 if (bSig0
| bSig1
) {
6360 return propagateFloat128NaN(a
, b
, status
);
6362 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6364 float_raise(float_flag_invalid
, status
);
6365 return float128_default_nan(status
);
6367 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6370 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6371 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6374 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6375 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6377 zExp
= aExp
+ bExp
- 0x4000;
6378 aSig0
|= LIT64( 0x0001000000000000 );
6379 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
6380 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
6381 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6382 zSig2
|= ( zSig3
!= 0 );
6383 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
6384 shift128ExtraRightJamming(
6385 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6388 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6392 /*----------------------------------------------------------------------------
6393 | Returns the result of dividing the quadruple-precision floating-point value
6394 | `a' by the corresponding value `b'. The operation is performed according to
6395 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6396 *----------------------------------------------------------------------------*/
6398 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
6400 flag aSign
, bSign
, zSign
;
6401 int32_t aExp
, bExp
, zExp
;
6402 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6403 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6405 aSig1
= extractFloat128Frac1( a
);
6406 aSig0
= extractFloat128Frac0( a
);
6407 aExp
= extractFloat128Exp( a
);
6408 aSign
= extractFloat128Sign( a
);
6409 bSig1
= extractFloat128Frac1( b
);
6410 bSig0
= extractFloat128Frac0( b
);
6411 bExp
= extractFloat128Exp( b
);
6412 bSign
= extractFloat128Sign( b
);
6413 zSign
= aSign
^ bSign
;
6414 if ( aExp
== 0x7FFF ) {
6415 if (aSig0
| aSig1
) {
6416 return propagateFloat128NaN(a
, b
, status
);
6418 if ( bExp
== 0x7FFF ) {
6419 if (bSig0
| bSig1
) {
6420 return propagateFloat128NaN(a
, b
, status
);
6424 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6426 if ( bExp
== 0x7FFF ) {
6427 if (bSig0
| bSig1
) {
6428 return propagateFloat128NaN(a
, b
, status
);
6430 return packFloat128( zSign
, 0, 0, 0 );
6433 if ( ( bSig0
| bSig1
) == 0 ) {
6434 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6436 float_raise(float_flag_invalid
, status
);
6437 return float128_default_nan(status
);
6439 float_raise(float_flag_divbyzero
, status
);
6440 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6442 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6445 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6446 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6448 zExp
= aExp
- bExp
+ 0x3FFD;
6450 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
6452 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6453 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
6454 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
6457 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6458 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
6459 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
6460 while ( (int64_t) rem0
< 0 ) {
6462 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
6464 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
6465 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
6466 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
6467 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
6468 while ( (int64_t) rem1
< 0 ) {
6470 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
6472 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6474 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
6475 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6479 /*----------------------------------------------------------------------------
6480 | Returns the remainder of the quadruple-precision floating-point value `a'
6481 | with respect to the corresponding value `b'. The operation is performed
6482 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6483 *----------------------------------------------------------------------------*/
6485 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
6488 int32_t aExp
, bExp
, expDiff
;
6489 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6490 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6493 aSig1
= extractFloat128Frac1( a
);
6494 aSig0
= extractFloat128Frac0( a
);
6495 aExp
= extractFloat128Exp( a
);
6496 aSign
= extractFloat128Sign( a
);
6497 bSig1
= extractFloat128Frac1( b
);
6498 bSig0
= extractFloat128Frac0( b
);
6499 bExp
= extractFloat128Exp( b
);
6500 if ( aExp
== 0x7FFF ) {
6501 if ( ( aSig0
| aSig1
)
6502 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6503 return propagateFloat128NaN(a
, b
, status
);
6507 if ( bExp
== 0x7FFF ) {
6508 if (bSig0
| bSig1
) {
6509 return propagateFloat128NaN(a
, b
, status
);
6514 if ( ( bSig0
| bSig1
) == 0 ) {
6516 float_raise(float_flag_invalid
, status
);
6517 return float128_default_nan(status
);
6519 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6522 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6523 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6525 expDiff
= aExp
- bExp
;
6526 if ( expDiff
< -1 ) return a
;
6528 aSig0
| LIT64( 0x0001000000000000 ),
6530 15 - ( expDiff
< 0 ),
6535 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6536 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6537 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6539 while ( 0 < expDiff
) {
6540 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6541 q
= ( 4 < q
) ? q
- 4 : 0;
6542 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6543 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6544 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6545 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6548 if ( -64 < expDiff
) {
6549 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6550 q
= ( 4 < q
) ? q
- 4 : 0;
6552 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6554 if ( expDiff
< 0 ) {
6555 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6558 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6560 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6561 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6564 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6565 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6568 alternateASig0
= aSig0
;
6569 alternateASig1
= aSig1
;
6571 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6572 } while ( 0 <= (int64_t) aSig0
);
6574 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6575 if ( ( sigMean0
< 0 )
6576 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6577 aSig0
= alternateASig0
;
6578 aSig1
= alternateASig1
;
6580 zSign
= ( (int64_t) aSig0
< 0 );
6581 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6582 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
6586 /*----------------------------------------------------------------------------
6587 | Returns the square root of the quadruple-precision floating-point value `a'.
6588 | The operation is performed according to the IEC/IEEE Standard for Binary
6589 | Floating-Point Arithmetic.
6590 *----------------------------------------------------------------------------*/
6592 float128
float128_sqrt(float128 a
, float_status
*status
)
6596 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6597 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6599 aSig1
= extractFloat128Frac1( a
);
6600 aSig0
= extractFloat128Frac0( a
);
6601 aExp
= extractFloat128Exp( a
);
6602 aSign
= extractFloat128Sign( a
);
6603 if ( aExp
== 0x7FFF ) {
6604 if (aSig0
| aSig1
) {
6605 return propagateFloat128NaN(a
, a
, status
);
6607 if ( ! aSign
) return a
;
6611 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6613 float_raise(float_flag_invalid
, status
);
6614 return float128_default_nan(status
);
6617 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6618 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6620 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6621 aSig0
|= LIT64( 0x0001000000000000 );
6622 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6623 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6624 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6625 doubleZSig0
= zSig0
<<1;
6626 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6627 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6628 while ( (int64_t) rem0
< 0 ) {
6631 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6633 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6634 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
6635 if ( zSig1
== 0 ) zSig1
= 1;
6636 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6637 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6638 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6639 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6640 while ( (int64_t) rem1
< 0 ) {
6642 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6644 term2
|= doubleZSig0
;
6645 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6647 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6649 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
6650 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
6654 /*----------------------------------------------------------------------------
6655 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6656 | the corresponding value `b', and 0 otherwise. The invalid exception is
6657 | raised if either operand is a NaN. Otherwise, the comparison is performed
6658 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6659 *----------------------------------------------------------------------------*/
6661 int float128_eq(float128 a
, float128 b
, float_status
*status
)
6664 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6665 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6666 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6667 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6669 float_raise(float_flag_invalid
, status
);
6674 && ( ( a
.high
== b
.high
)
6676 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6681 /*----------------------------------------------------------------------------
6682 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6683 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6684 | exception is raised if either operand is a NaN. The comparison is performed
6685 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6686 *----------------------------------------------------------------------------*/
6688 int float128_le(float128 a
, float128 b
, float_status
*status
)
6692 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6693 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6694 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6695 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6697 float_raise(float_flag_invalid
, status
);
6700 aSign
= extractFloat128Sign( a
);
6701 bSign
= extractFloat128Sign( b
);
6702 if ( aSign
!= bSign
) {
6705 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6709 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6710 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6714 /*----------------------------------------------------------------------------
6715 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6716 | the corresponding value `b', and 0 otherwise. The invalid exception is
6717 | raised if either operand is a NaN. The comparison is performed according
6718 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6719 *----------------------------------------------------------------------------*/
6721 int float128_lt(float128 a
, float128 b
, float_status
*status
)
6725 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6726 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6727 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6728 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6730 float_raise(float_flag_invalid
, status
);
6733 aSign
= extractFloat128Sign( a
);
6734 bSign
= extractFloat128Sign( b
);
6735 if ( aSign
!= bSign
) {
6738 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6742 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6743 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6747 /*----------------------------------------------------------------------------
6748 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6749 | be compared, and 0 otherwise. The invalid exception is raised if either
6750 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6751 | Standard for Binary Floating-Point Arithmetic.
6752 *----------------------------------------------------------------------------*/
6754 int float128_unordered(float128 a
, float128 b
, float_status
*status
)
6756 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6757 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6758 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6759 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6761 float_raise(float_flag_invalid
, status
);
6767 /*----------------------------------------------------------------------------
6768 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6769 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6770 | exception. The comparison is performed according to the IEC/IEEE Standard
6771 | for Binary Floating-Point Arithmetic.
6772 *----------------------------------------------------------------------------*/
6774 int float128_eq_quiet(float128 a
, float128 b
, float_status
*status
)
6777 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6778 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6779 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6780 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6782 if (float128_is_signaling_nan(a
, status
)
6783 || float128_is_signaling_nan(b
, status
)) {
6784 float_raise(float_flag_invalid
, status
);
6790 && ( ( a
.high
== b
.high
)
6792 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6797 /*----------------------------------------------------------------------------
6798 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6799 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6800 | cause an exception. Otherwise, the comparison is performed according to the
6801 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6802 *----------------------------------------------------------------------------*/
6804 int float128_le_quiet(float128 a
, float128 b
, float_status
*status
)
6808 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6809 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6810 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6811 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6813 if (float128_is_signaling_nan(a
, status
)
6814 || float128_is_signaling_nan(b
, status
)) {
6815 float_raise(float_flag_invalid
, status
);
6819 aSign
= extractFloat128Sign( a
);
6820 bSign
= extractFloat128Sign( b
);
6821 if ( aSign
!= bSign
) {
6824 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6828 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6829 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6833 /*----------------------------------------------------------------------------
6834 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6835 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6836 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6837 | Standard for Binary Floating-Point Arithmetic.
6838 *----------------------------------------------------------------------------*/
6840 int float128_lt_quiet(float128 a
, float128 b
, float_status
*status
)
6844 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6845 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6846 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6847 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6849 if (float128_is_signaling_nan(a
, status
)
6850 || float128_is_signaling_nan(b
, status
)) {
6851 float_raise(float_flag_invalid
, status
);
6855 aSign
= extractFloat128Sign( a
);
6856 bSign
= extractFloat128Sign( b
);
6857 if ( aSign
!= bSign
) {
6860 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6864 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6865 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6869 /*----------------------------------------------------------------------------
6870 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6871 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6872 | comparison is performed according to the IEC/IEEE Standard for Binary
6873 | Floating-Point Arithmetic.
6874 *----------------------------------------------------------------------------*/
6876 int float128_unordered_quiet(float128 a
, float128 b
, float_status
*status
)
6878 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6879 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6880 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6881 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6883 if (float128_is_signaling_nan(a
, status
)
6884 || float128_is_signaling_nan(b
, status
)) {
6885 float_raise(float_flag_invalid
, status
);
6892 static inline int floatx80_compare_internal(floatx80 a
, floatx80 b
,
6893 int is_quiet
, float_status
*status
)
6897 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6898 float_raise(float_flag_invalid
, status
);
6899 return float_relation_unordered
;
6901 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
6902 ( extractFloatx80Frac( a
)<<1 ) ) ||
6903 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
6904 ( extractFloatx80Frac( b
)<<1 ) )) {
6906 floatx80_is_signaling_nan(a
, status
) ||
6907 floatx80_is_signaling_nan(b
, status
)) {
6908 float_raise(float_flag_invalid
, status
);
6910 return float_relation_unordered
;
6912 aSign
= extractFloatx80Sign( a
);
6913 bSign
= extractFloatx80Sign( b
);
6914 if ( aSign
!= bSign
) {
6916 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
6917 ( ( a
.low
| b
.low
) == 0 ) ) {
6919 return float_relation_equal
;
6921 return 1 - (2 * aSign
);
6924 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6925 return float_relation_equal
;
6927 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6932 int floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
6934 return floatx80_compare_internal(a
, b
, 0, status
);
6937 int floatx80_compare_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6939 return floatx80_compare_internal(a
, b
, 1, status
);
6942 static inline int float128_compare_internal(float128 a
, float128 b
,
6943 int is_quiet
, float_status
*status
)
6947 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
6948 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
6949 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
6950 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
6952 float128_is_signaling_nan(a
, status
) ||
6953 float128_is_signaling_nan(b
, status
)) {
6954 float_raise(float_flag_invalid
, status
);
6956 return float_relation_unordered
;
6958 aSign
= extractFloat128Sign( a
);
6959 bSign
= extractFloat128Sign( b
);
6960 if ( aSign
!= bSign
) {
6961 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
6963 return float_relation_equal
;
6965 return 1 - (2 * aSign
);
6968 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6969 return float_relation_equal
;
6971 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6976 int float128_compare(float128 a
, float128 b
, float_status
*status
)
6978 return float128_compare_internal(a
, b
, 0, status
);
6981 int float128_compare_quiet(float128 a
, float128 b
, float_status
*status
)
6983 return float128_compare_internal(a
, b
, 1, status
);
6986 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
6992 if (floatx80_invalid_encoding(a
)) {
6993 float_raise(float_flag_invalid
, status
);
6994 return floatx80_default_nan(status
);
6996 aSig
= extractFloatx80Frac( a
);
6997 aExp
= extractFloatx80Exp( a
);
6998 aSign
= extractFloatx80Sign( a
);
7000 if ( aExp
== 0x7FFF ) {
7002 return propagateFloatx80NaN(a
, a
, status
);
7016 } else if (n
< -0x10000) {
7021 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7022 aSign
, aExp
, aSig
, 0, status
);
7025 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7029 uint64_t aSig0
, aSig1
;
7031 aSig1
= extractFloat128Frac1( a
);
7032 aSig0
= extractFloat128Frac0( a
);
7033 aExp
= extractFloat128Exp( a
);
7034 aSign
= extractFloat128Sign( a
);
7035 if ( aExp
== 0x7FFF ) {
7036 if ( aSig0
| aSig1
) {
7037 return propagateFloat128NaN(a
, a
, status
);
7042 aSig0
|= LIT64( 0x0001000000000000 );
7043 } else if (aSig0
== 0 && aSig1
== 0) {
7051 } else if (n
< -0x10000) {
7056 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1