4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine: (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output. These details are target-
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
112 static inline uint32_t extractFloat16Frac(float16 a
)
114 return float16_val(a
) & 0x3ff;
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
121 static inline int extractFloat16Exp(float16 a
)
123 return (float16_val(a
) >> 10) & 0x1f;
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
130 static inline flag
extractFloat16Sign(float16 a
)
132 return float16_val(a
)>>15;
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
139 static inline uint32_t extractFloat32Frac(float32 a
)
141 return float32_val(a
) & 0x007FFFFF;
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
148 static inline int extractFloat32Exp(float32 a
)
150 return (float32_val(a
) >> 23) & 0xFF;
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
157 static inline flag
extractFloat32Sign(float32 a
)
159 return float32_val(a
) >> 31;
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
166 static inline uint64_t extractFloat64Frac(float64 a
)
168 return float64_val(a
) & LIT64(0x000FFFFFFFFFFFFF);
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 static inline int extractFloat64Exp(float64 a
)
177 return (float64_val(a
) >> 52) & 0x7FF;
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
184 static inline flag
extractFloat64Sign(float64 a
)
186 return float64_val(a
) >> 63;
190 * Classify a floating point number. Everything above float_class_qnan
191 * is a NaN so cls >= float_class_qnan is any NaN.
194 typedef enum __attribute__ ((__packed__
)) {
195 float_class_unclassified
,
199 float_class_qnan
, /* all NaNs from here */
202 float_class_msnan
, /* maybe silenced */
206 * Structure holding all of the decomposed parts of a float. The
207 * exponent is unbiased and the fraction is normalized. All
208 * calculations are done with a 64 bit fraction and then rounded as
209 * appropriate for the final format.
211 * Thanks to the packed FloatClass a decent compiler should be able to
212 * fit the whole structure into registers and avoid using the stack
213 * for parameter passing.
223 #define DECOMPOSED_BINARY_POINT (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
227 /* Structure holding all of the relevant parameters for a format.
228 * exp_size: the size of the exponent field
229 * exp_bias: the offset applied to the exponent field
230 * exp_max: the maximum normalised exponent
231 * frac_size: the size of the fraction field
232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233 * The following are computed based the size of fraction
234 * frac_lsb: least significant bit of fraction
235 * fram_lsbm1: the bit bellow the least significant bit (for rounding)
236 * round_mask/roundeven_mask: masks used for rounding
247 uint64_t roundeven_mask
;
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F) \
253 .exp_bias = ((1 << E) - 1) >> 1, \
254 .exp_max = (1 << E) - 1, \
256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
262 static const FloatFmt float16_params
= {
266 static const FloatFmt float32_params
= {
270 static const FloatFmt float64_params
= {
274 /* Unpack a float to parts, but do not canonicalize. */
275 static inline FloatParts
unpack_raw(FloatFmt fmt
, uint64_t raw
)
277 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
279 return (FloatParts
) {
280 .cls
= float_class_unclassified
,
281 .sign
= extract64(raw
, sign_pos
, 1),
282 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
283 .frac
= extract64(raw
, 0, fmt
.frac_size
),
287 static inline FloatParts
float16_unpack_raw(float16 f
)
289 return unpack_raw(float16_params
, f
);
292 static inline FloatParts
float32_unpack_raw(float32 f
)
294 return unpack_raw(float32_params
, f
);
297 static inline FloatParts
float64_unpack_raw(float64 f
)
299 return unpack_raw(float64_params
, f
);
302 /* Pack a float from parts, but do not canonicalize. */
303 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts p
)
305 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
306 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
307 return deposit64(ret
, sign_pos
, 1, p
.sign
);
310 static inline float16
float16_pack_raw(FloatParts p
)
312 return make_float16(pack_raw(float16_params
, p
));
315 static inline float32
float32_pack_raw(FloatParts p
)
317 return make_float32(pack_raw(float32_params
, p
));
320 static inline float64
float64_pack_raw(FloatParts p
)
322 return make_float64(pack_raw(float64_params
, p
));
325 /* Canonicalize EXP and FRAC, setting CLS. */
326 static FloatParts
canonicalize(FloatParts part
, const FloatFmt
*parm
,
327 float_status
*status
)
329 if (part
.exp
== parm
->exp_max
) {
330 if (part
.frac
== 0) {
331 part
.cls
= float_class_inf
;
333 #ifdef NO_SIGNALING_NANS
334 part
.cls
= float_class_qnan
;
336 int64_t msb
= part
.frac
<< (parm
->frac_shift
+ 2);
337 if ((msb
< 0) == status
->snan_bit_is_one
) {
338 part
.cls
= float_class_snan
;
340 part
.cls
= float_class_qnan
;
344 } else if (part
.exp
== 0) {
345 if (likely(part
.frac
== 0)) {
346 part
.cls
= float_class_zero
;
347 } else if (status
->flush_inputs_to_zero
) {
348 float_raise(float_flag_input_denormal
, status
);
349 part
.cls
= float_class_zero
;
352 int shift
= clz64(part
.frac
) - 1;
353 part
.cls
= float_class_normal
;
354 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
358 part
.cls
= float_class_normal
;
359 part
.exp
-= parm
->exp_bias
;
360 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
365 /* Round and uncanonicalize a floating-point number by parts. There
366 * are FRAC_SHIFT bits that may require rounding at the bottom of the
367 * fraction; these bits will be removed. The exponent will be biased
368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
371 static FloatParts
round_canonical(FloatParts p
, float_status
*s
,
372 const FloatFmt
*parm
)
374 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
375 const uint64_t round_mask
= parm
->round_mask
;
376 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
377 const int exp_max
= parm
->exp_max
;
378 const int frac_shift
= parm
->frac_shift
;
387 case float_class_normal
:
388 switch (s
->float_rounding_mode
) {
389 case float_round_nearest_even
:
390 overflow_norm
= false;
391 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
393 case float_round_ties_away
:
394 overflow_norm
= false;
397 case float_round_to_zero
:
398 overflow_norm
= true;
402 inc
= p
.sign
? 0 : round_mask
;
403 overflow_norm
= p
.sign
;
405 case float_round_down
:
406 inc
= p
.sign
? round_mask
: 0;
407 overflow_norm
= !p
.sign
;
410 g_assert_not_reached();
413 exp
+= parm
->exp_bias
;
414 if (likely(exp
> 0)) {
415 if (frac
& round_mask
) {
416 flags
|= float_flag_inexact
;
418 if (frac
& DECOMPOSED_OVERFLOW_BIT
) {
425 if (unlikely(exp
>= exp_max
)) {
426 flags
|= float_flag_overflow
| float_flag_inexact
;
431 p
.cls
= float_class_inf
;
435 } else if (s
->flush_to_zero
) {
436 flags
|= float_flag_output_denormal
;
437 p
.cls
= float_class_zero
;
440 bool is_tiny
= (s
->float_detect_tininess
441 == float_tininess_before_rounding
)
443 || !((frac
+ inc
) & DECOMPOSED_OVERFLOW_BIT
);
445 shift64RightJamming(frac
, 1 - exp
, &frac
);
446 if (frac
& round_mask
) {
447 /* Need to recompute round-to-even. */
448 if (s
->float_rounding_mode
== float_round_nearest_even
) {
449 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
452 flags
|= float_flag_inexact
;
456 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
459 if (is_tiny
&& (flags
& float_flag_inexact
)) {
460 flags
|= float_flag_underflow
;
462 if (exp
== 0 && frac
== 0) {
463 p
.cls
= float_class_zero
;
468 case float_class_zero
:
474 case float_class_inf
:
480 case float_class_qnan
:
481 case float_class_snan
:
486 g_assert_not_reached();
489 float_raise(flags
, s
);
495 static FloatParts
float16_unpack_canonical(float16 f
, float_status
*s
)
497 return canonicalize(float16_unpack_raw(f
), &float16_params
, s
);
500 static float16
float16_round_pack_canonical(FloatParts p
, float_status
*s
)
503 case float_class_dnan
:
504 return float16_default_nan(s
);
505 case float_class_msnan
:
506 return float16_maybe_silence_nan(float16_pack_raw(p
), s
);
508 p
= round_canonical(p
, s
, &float16_params
);
509 return float16_pack_raw(p
);
513 static FloatParts
float32_unpack_canonical(float32 f
, float_status
*s
)
515 return canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
518 static float32
float32_round_pack_canonical(FloatParts p
, float_status
*s
)
521 case float_class_dnan
:
522 return float32_default_nan(s
);
523 case float_class_msnan
:
524 return float32_maybe_silence_nan(float32_pack_raw(p
), s
);
526 p
= round_canonical(p
, s
, &float32_params
);
527 return float32_pack_raw(p
);
531 static FloatParts
float64_unpack_canonical(float64 f
, float_status
*s
)
533 return canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
536 static float64
float64_round_pack_canonical(FloatParts p
, float_status
*s
)
539 case float_class_dnan
:
540 return float64_default_nan(s
);
541 case float_class_msnan
:
542 return float64_maybe_silence_nan(float64_pack_raw(p
), s
);
544 p
= round_canonical(p
, s
, &float64_params
);
545 return float64_pack_raw(p
);
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c
)
552 return unlikely(c
>= float_class_qnan
);
554 static bool is_snan(FloatClass c
)
556 return c
== float_class_snan
;
558 static bool is_qnan(FloatClass c
)
560 return c
== float_class_qnan
;
563 static FloatParts
return_nan(FloatParts a
, float_status
*s
)
566 case float_class_snan
:
567 s
->float_exception_flags
|= float_flag_invalid
;
568 a
.cls
= float_class_msnan
;
570 case float_class_qnan
:
571 if (s
->default_nan_mode
) {
572 a
.cls
= float_class_dnan
;
577 g_assert_not_reached();
582 static FloatParts
pick_nan(FloatParts a
, FloatParts b
, float_status
*s
)
584 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
585 s
->float_exception_flags
|= float_flag_invalid
;
588 if (s
->default_nan_mode
) {
589 a
.cls
= float_class_dnan
;
591 if (pickNaN(is_qnan(a
.cls
), is_snan(a
.cls
),
592 is_qnan(b
.cls
), is_snan(b
.cls
),
594 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
))) {
597 a
.cls
= float_class_msnan
;
602 static FloatParts
pick_nan_muladd(FloatParts a
, FloatParts b
, FloatParts c
,
603 bool inf_zero
, float_status
*s
)
607 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
608 s
->float_exception_flags
|= float_flag_invalid
;
611 which
= pickNaNMulAdd(is_qnan(a
.cls
), is_snan(a
.cls
),
612 is_qnan(b
.cls
), is_snan(b
.cls
),
613 is_qnan(c
.cls
), is_snan(c
.cls
),
616 if (s
->default_nan_mode
) {
617 /* Note that this check is after pickNaNMulAdd so that function
618 * has an opportunity to set the Invalid flag.
620 a
.cls
= float_class_dnan
;
634 a
.cls
= float_class_dnan
;
637 g_assert_not_reached();
639 a
.cls
= float_class_msnan
;
645 * Returns the result of adding or subtracting the values of the
646 * floating-point values `a' and `b'. The operation is performed
647 * according to the IEC/IEEE Standard for Binary Floating-Point
651 static FloatParts
addsub_floats(FloatParts a
, FloatParts b
, bool subtract
,
654 bool a_sign
= a
.sign
;
655 bool b_sign
= b
.sign
^ subtract
;
657 if (a_sign
!= b_sign
) {
660 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
661 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
662 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
663 a
.frac
= a
.frac
- b
.frac
;
665 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
666 a
.frac
= b
.frac
- a
.frac
;
672 a
.cls
= float_class_zero
;
673 a
.sign
= s
->float_rounding_mode
== float_round_down
;
675 int shift
= clz64(a
.frac
) - 1;
676 a
.frac
= a
.frac
<< shift
;
677 a
.exp
= a
.exp
- shift
;
682 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
683 return pick_nan(a
, b
, s
);
685 if (a
.cls
== float_class_inf
) {
686 if (b
.cls
== float_class_inf
) {
687 float_raise(float_flag_invalid
, s
);
688 a
.cls
= float_class_dnan
;
692 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
693 a
.sign
= s
->float_rounding_mode
== float_round_down
;
696 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
700 if (b
.cls
== float_class_zero
) {
705 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
707 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
708 } else if (a
.exp
< b
.exp
) {
709 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
713 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
719 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
720 return pick_nan(a
, b
, s
);
722 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
725 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
730 g_assert_not_reached();
734 * Returns the result of adding or subtracting the floating-point
735 * values `a' and `b'. The operation is performed according to the
736 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
739 float16
__attribute__((flatten
)) float16_add(float16 a
, float16 b
,
740 float_status
*status
)
742 FloatParts pa
= float16_unpack_canonical(a
, status
);
743 FloatParts pb
= float16_unpack_canonical(b
, status
);
744 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
746 return float16_round_pack_canonical(pr
, status
);
749 float32
__attribute__((flatten
)) float32_add(float32 a
, float32 b
,
750 float_status
*status
)
752 FloatParts pa
= float32_unpack_canonical(a
, status
);
753 FloatParts pb
= float32_unpack_canonical(b
, status
);
754 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
756 return float32_round_pack_canonical(pr
, status
);
759 float64
__attribute__((flatten
)) float64_add(float64 a
, float64 b
,
760 float_status
*status
)
762 FloatParts pa
= float64_unpack_canonical(a
, status
);
763 FloatParts pb
= float64_unpack_canonical(b
, status
);
764 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
766 return float64_round_pack_canonical(pr
, status
);
769 float16
__attribute__((flatten
)) float16_sub(float16 a
, float16 b
,
770 float_status
*status
)
772 FloatParts pa
= float16_unpack_canonical(a
, status
);
773 FloatParts pb
= float16_unpack_canonical(b
, status
);
774 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
776 return float16_round_pack_canonical(pr
, status
);
779 float32
__attribute__((flatten
)) float32_sub(float32 a
, float32 b
,
780 float_status
*status
)
782 FloatParts pa
= float32_unpack_canonical(a
, status
);
783 FloatParts pb
= float32_unpack_canonical(b
, status
);
784 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
786 return float32_round_pack_canonical(pr
, status
);
789 float64
__attribute__((flatten
)) float64_sub(float64 a
, float64 b
,
790 float_status
*status
)
792 FloatParts pa
= float64_unpack_canonical(a
, status
);
793 FloatParts pb
= float64_unpack_canonical(b
, status
);
794 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
796 return float64_round_pack_canonical(pr
, status
);
800 * Returns the result of multiplying the floating-point values `a' and
801 * `b'. The operation is performed according to the IEC/IEEE Standard
802 * for Binary Floating-Point Arithmetic.
805 static FloatParts
mul_floats(FloatParts a
, FloatParts b
, float_status
*s
)
807 bool sign
= a
.sign
^ b
.sign
;
809 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
811 int exp
= a
.exp
+ b
.exp
;
813 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
814 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
815 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
816 shift64RightJamming(lo
, 1, &lo
);
826 /* handle all the NaN cases */
827 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
828 return pick_nan(a
, b
, s
);
830 /* Inf * Zero == NaN */
831 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
832 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
833 s
->float_exception_flags
|= float_flag_invalid
;
834 a
.cls
= float_class_dnan
;
838 /* Multiply by 0 or Inf */
839 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
843 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
847 g_assert_not_reached();
850 float16
__attribute__((flatten
)) float16_mul(float16 a
, float16 b
,
851 float_status
*status
)
853 FloatParts pa
= float16_unpack_canonical(a
, status
);
854 FloatParts pb
= float16_unpack_canonical(b
, status
);
855 FloatParts pr
= mul_floats(pa
, pb
, status
);
857 return float16_round_pack_canonical(pr
, status
);
860 float32
__attribute__((flatten
)) float32_mul(float32 a
, float32 b
,
861 float_status
*status
)
863 FloatParts pa
= float32_unpack_canonical(a
, status
);
864 FloatParts pb
= float32_unpack_canonical(b
, status
);
865 FloatParts pr
= mul_floats(pa
, pb
, status
);
867 return float32_round_pack_canonical(pr
, status
);
870 float64
__attribute__((flatten
)) float64_mul(float64 a
, float64 b
,
871 float_status
*status
)
873 FloatParts pa
= float64_unpack_canonical(a
, status
);
874 FloatParts pb
= float64_unpack_canonical(b
, status
);
875 FloatParts pr
= mul_floats(pa
, pb
, status
);
877 return float64_round_pack_canonical(pr
, status
);
881 * Returns the result of multiplying the floating-point values `a' and
882 * `b' then adding 'c', with no intermediate rounding step after the
883 * multiplication. The operation is performed according to the
884 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
885 * The flags argument allows the caller to select negation of the
886 * addend, the intermediate product, or the final result. (The
887 * difference between this and having the caller do a separate
888 * negation is that negating externally will flip the sign bit on
892 static FloatParts
muladd_floats(FloatParts a
, FloatParts b
, FloatParts c
,
893 int flags
, float_status
*s
)
895 bool inf_zero
= ((1 << a
.cls
) | (1 << b
.cls
)) ==
896 ((1 << float_class_inf
) | (1 << float_class_zero
));
898 bool sign_flip
= flags
& float_muladd_negate_result
;
903 /* It is implementation-defined whether the cases of (0,inf,qnan)
904 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
905 * they return if they do), so we have to hand this information
906 * off to the target-specific pick-a-NaN routine.
908 if (is_nan(a
.cls
) || is_nan(b
.cls
) || is_nan(c
.cls
)) {
909 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
913 s
->float_exception_flags
|= float_flag_invalid
;
914 a
.cls
= float_class_dnan
;
918 if (flags
& float_muladd_negate_c
) {
922 p_sign
= a
.sign
^ b
.sign
;
924 if (flags
& float_muladd_negate_product
) {
928 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_inf
) {
929 p_class
= float_class_inf
;
930 } else if (a
.cls
== float_class_zero
|| b
.cls
== float_class_zero
) {
931 p_class
= float_class_zero
;
933 p_class
= float_class_normal
;
936 if (c
.cls
== float_class_inf
) {
937 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
938 s
->float_exception_flags
|= float_flag_invalid
;
939 a
.cls
= float_class_dnan
;
941 a
.cls
= float_class_inf
;
942 a
.sign
= c
.sign
^ sign_flip
;
947 if (p_class
== float_class_inf
) {
948 a
.cls
= float_class_inf
;
949 a
.sign
= p_sign
^ sign_flip
;
953 if (p_class
== float_class_zero
) {
954 if (c
.cls
== float_class_zero
) {
955 if (p_sign
!= c
.sign
) {
956 p_sign
= s
->float_rounding_mode
== float_round_down
;
959 } else if (flags
& float_muladd_halve_result
) {
966 /* a & b should be normals now... */
967 assert(a
.cls
== float_class_normal
&&
968 b
.cls
== float_class_normal
);
970 p_exp
= a
.exp
+ b
.exp
;
972 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
975 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
976 /* binary point now at bit 124 */
978 /* check for overflow */
979 if (hi
& (1ULL << (DECOMPOSED_BINARY_POINT
* 2 + 1 - 64))) {
980 shift128RightJamming(hi
, lo
, 1, &hi
, &lo
);
985 if (c
.cls
== float_class_zero
) {
986 /* move binary point back to 62 */
987 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
989 int exp_diff
= p_exp
- c
.exp
;
990 if (p_sign
== c
.sign
) {
993 shift128RightJamming(hi
, lo
,
994 DECOMPOSED_BINARY_POINT
- exp_diff
,
1000 /* shift c to the same binary point as the product (124) */
1003 shift128RightJamming(c_hi
, c_lo
,
1006 add128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1007 /* move binary point back to 62 */
1008 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1011 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1012 shift64RightJamming(lo
, 1, &lo
);
1018 uint64_t c_hi
, c_lo
;
1019 /* make C binary point match product at bit 124 */
1023 if (exp_diff
<= 0) {
1024 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1027 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1028 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1030 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1035 shift128RightJamming(c_hi
, c_lo
,
1038 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1041 if (hi
== 0 && lo
== 0) {
1042 a
.cls
= float_class_zero
;
1043 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1044 a
.sign
^= sign_flip
;
1051 shift
= clz64(lo
) + 64;
1053 /* Normalizing to a binary point of 124 is the
1054 correct adjust for the exponent. However since we're
1055 shifting, we might as well put the binary point back
1056 at 62 where we really want it. Therefore shift as
1057 if we're leaving 1 bit at the top of the word, but
1058 adjust the exponent as if we're leaving 3 bits. */
1061 lo
= lo
<< (shift
- 64);
1063 hi
= (hi
<< shift
) | (lo
>> (64 - shift
));
1064 lo
= hi
| ((lo
<< shift
) != 0);
1071 if (flags
& float_muladd_halve_result
) {
1075 /* finally prepare our result */
1076 a
.cls
= float_class_normal
;
1077 a
.sign
= p_sign
^ sign_flip
;
1084 float16
__attribute__((flatten
)) float16_muladd(float16 a
, float16 b
, float16 c
,
1085 int flags
, float_status
*status
)
1087 FloatParts pa
= float16_unpack_canonical(a
, status
);
1088 FloatParts pb
= float16_unpack_canonical(b
, status
);
1089 FloatParts pc
= float16_unpack_canonical(c
, status
);
1090 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1092 return float16_round_pack_canonical(pr
, status
);
1095 float32
__attribute__((flatten
)) float32_muladd(float32 a
, float32 b
, float32 c
,
1096 int flags
, float_status
*status
)
1098 FloatParts pa
= float32_unpack_canonical(a
, status
);
1099 FloatParts pb
= float32_unpack_canonical(b
, status
);
1100 FloatParts pc
= float32_unpack_canonical(c
, status
);
1101 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1103 return float32_round_pack_canonical(pr
, status
);
1106 float64
__attribute__((flatten
)) float64_muladd(float64 a
, float64 b
, float64 c
,
1107 int flags
, float_status
*status
)
1109 FloatParts pa
= float64_unpack_canonical(a
, status
);
1110 FloatParts pb
= float64_unpack_canonical(b
, status
);
1111 FloatParts pc
= float64_unpack_canonical(c
, status
);
1112 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1114 return float64_round_pack_canonical(pr
, status
);
1118 * Returns the result of dividing the floating-point value `a' by the
1119 * corresponding value `b'. The operation is performed according to
1120 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123 static FloatParts
div_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1125 bool sign
= a
.sign
^ b
.sign
;
1127 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1128 uint64_t temp_lo
, temp_hi
;
1129 int exp
= a
.exp
- b
.exp
;
1130 if (a
.frac
< b
.frac
) {
1132 shortShift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1,
1133 &temp_hi
, &temp_lo
);
1135 shortShift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
,
1136 &temp_hi
, &temp_lo
);
1138 /* LSB of quot is set if inexact which roundandpack will use
1139 * to set flags. Yet again we re-use a for the result */
1140 a
.frac
= div128To64(temp_lo
, temp_hi
, b
.frac
);
1145 /* handle all the NaN cases */
1146 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1147 return pick_nan(a
, b
, s
);
1149 /* 0/0 or Inf/Inf */
1152 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1153 s
->float_exception_flags
|= float_flag_invalid
;
1154 a
.cls
= float_class_dnan
;
1157 /* Inf / x or 0 / x */
1158 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1163 if (b
.cls
== float_class_zero
) {
1164 s
->float_exception_flags
|= float_flag_divbyzero
;
1165 a
.cls
= float_class_inf
;
1170 if (b
.cls
== float_class_inf
) {
1171 a
.cls
= float_class_zero
;
1175 g_assert_not_reached();
1178 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1180 FloatParts pa
= float16_unpack_canonical(a
, status
);
1181 FloatParts pb
= float16_unpack_canonical(b
, status
);
1182 FloatParts pr
= div_floats(pa
, pb
, status
);
1184 return float16_round_pack_canonical(pr
, status
);
1187 float32
float32_div(float32 a
, float32 b
, float_status
*status
)
1189 FloatParts pa
= float32_unpack_canonical(a
, status
);
1190 FloatParts pb
= float32_unpack_canonical(b
, status
);
1191 FloatParts pr
= div_floats(pa
, pb
, status
);
1193 return float32_round_pack_canonical(pr
, status
);
1196 float64
float64_div(float64 a
, float64 b
, float_status
*status
)
1198 FloatParts pa
= float64_unpack_canonical(a
, status
);
1199 FloatParts pb
= float64_unpack_canonical(b
, status
);
1200 FloatParts pr
= div_floats(pa
, pb
, status
);
1202 return float64_round_pack_canonical(pr
, status
);
1206 * Rounds the floating-point value `a' to an integer, and returns the
1207 * result as a floating-point value. The operation is performed
1208 * according to the IEC/IEEE Standard for Binary Floating-Point
1212 static FloatParts
round_to_int(FloatParts a
, int rounding_mode
, float_status
*s
)
1214 if (is_nan(a
.cls
)) {
1215 return return_nan(a
, s
);
1219 case float_class_zero
:
1220 case float_class_inf
:
1221 case float_class_qnan
:
1222 /* already "integral" */
1224 case float_class_normal
:
1225 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
1226 /* already integral */
1231 /* all fractional */
1232 s
->float_exception_flags
|= float_flag_inexact
;
1233 switch (rounding_mode
) {
1234 case float_round_nearest_even
:
1235 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
1237 case float_round_ties_away
:
1238 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
1240 case float_round_to_zero
:
1243 case float_round_up
:
1246 case float_round_down
:
1250 g_assert_not_reached();
1254 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
1257 a
.cls
= float_class_zero
;
1260 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
1261 uint64_t frac_lsbm1
= frac_lsb
>> 1;
1262 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
1263 uint64_t rnd_mask
= rnd_even_mask
>> 1;
1266 switch (rounding_mode
) {
1267 case float_round_nearest_even
:
1268 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
1270 case float_round_ties_away
:
1273 case float_round_to_zero
:
1276 case float_round_up
:
1277 inc
= a
.sign
? 0 : rnd_mask
;
1279 case float_round_down
:
1280 inc
= a
.sign
? rnd_mask
: 0;
1283 g_assert_not_reached();
1286 if (a
.frac
& rnd_mask
) {
1287 s
->float_exception_flags
|= float_flag_inexact
;
1289 a
.frac
&= ~rnd_mask
;
1290 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
1298 g_assert_not_reached();
1303 float16
float16_round_to_int(float16 a
, float_status
*s
)
1305 FloatParts pa
= float16_unpack_canonical(a
, s
);
1306 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, s
);
1307 return float16_round_pack_canonical(pr
, s
);
1310 float32
float32_round_to_int(float32 a
, float_status
*s
)
1312 FloatParts pa
= float32_unpack_canonical(a
, s
);
1313 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, s
);
1314 return float32_round_pack_canonical(pr
, s
);
1317 float64
float64_round_to_int(float64 a
, float_status
*s
)
1319 FloatParts pa
= float64_unpack_canonical(a
, s
);
1320 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, s
);
1321 return float64_round_pack_canonical(pr
, s
);
1324 float64
float64_trunc_to_int(float64 a
, float_status
*s
)
1326 FloatParts pa
= float64_unpack_canonical(a
, s
);
1327 FloatParts pr
= round_to_int(pa
, float_round_to_zero
, s
);
1328 return float64_round_pack_canonical(pr
, s
);
1332 * Returns the result of converting the floating-point value `a' to
1333 * the two's complement integer format. The conversion is performed
1334 * according to the IEC/IEEE Standard for Binary Floating-Point
1335 * Arithmetic---which means in particular that the conversion is
1336 * rounded according to the current rounding mode. If `a' is a NaN,
1337 * the largest positive integer is returned. Otherwise, if the
1338 * conversion overflows, the largest integer with the same sign as `a'
1342 static int64_t round_to_int_and_pack(FloatParts in
, int rmode
,
1343 int64_t min
, int64_t max
,
1347 int orig_flags
= get_float_exception_flags(s
);
1348 FloatParts p
= round_to_int(in
, rmode
, s
);
1351 case float_class_snan
:
1352 case float_class_qnan
:
1353 case float_class_dnan
:
1354 case float_class_msnan
:
1355 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1357 case float_class_inf
:
1358 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1359 return p
.sign
? min
: max
;
1360 case float_class_zero
:
1362 case float_class_normal
:
1363 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
1364 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
1365 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
1366 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
1371 if (r
<= -(uint64_t) min
) {
1374 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1381 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1386 g_assert_not_reached();
1390 #define FLOAT_TO_INT(fsz, isz) \
1391 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1394 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1395 return round_to_int_and_pack(p, s->float_rounding_mode, \
1396 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1400 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1401 (float ## fsz a, float_status *s) \
1403 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1404 return round_to_int_and_pack(p, float_round_to_zero, \
1405 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1409 FLOAT_TO_INT(16, 16)
1410 FLOAT_TO_INT(16, 32)
1411 FLOAT_TO_INT(16, 64)
1413 FLOAT_TO_INT(32, 16)
1414 FLOAT_TO_INT(32, 32)
1415 FLOAT_TO_INT(32, 64)
1417 FLOAT_TO_INT(64, 16)
1418 FLOAT_TO_INT(64, 32)
1419 FLOAT_TO_INT(64, 64)
1424 * Returns the result of converting the floating-point value `a' to
1425 * the unsigned integer format. The conversion is performed according
1426 * to the IEC/IEEE Standard for Binary Floating-Point
1427 * Arithmetic---which means in particular that the conversion is
1428 * rounded according to the current rounding mode. If `a' is a NaN,
1429 * the largest unsigned integer is returned. Otherwise, if the
1430 * conversion overflows, the largest unsigned integer is returned. If
1431 * the 'a' is negative, the result is rounded and zero is returned;
1432 * values that do not round to zero will raise the inexact exception
1436 static uint64_t round_to_uint_and_pack(FloatParts in
, int rmode
, uint64_t max
,
1439 int orig_flags
= get_float_exception_flags(s
);
1440 FloatParts p
= round_to_int(in
, rmode
, s
);
1443 case float_class_snan
:
1444 case float_class_qnan
:
1445 case float_class_dnan
:
1446 case float_class_msnan
:
1447 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1449 case float_class_inf
:
1450 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1451 return p
.sign
? 0 : max
;
1452 case float_class_zero
:
1454 case float_class_normal
:
1458 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1462 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
1463 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
1464 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
1465 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
1467 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1471 /* For uint64 this will never trip, but if p.exp is too large
1472 * to shift a decomposed fraction we shall have exited via the
1476 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1483 g_assert_not_reached();
1487 #define FLOAT_TO_UINT(fsz, isz) \
1488 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1491 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1492 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1493 UINT ## isz ## _MAX, s); \
1496 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1497 (float ## fsz a, float_status *s) \
1499 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1500 return round_to_uint_and_pack(p, float_round_to_zero, \
1501 UINT ## isz ## _MAX, s); \
1504 FLOAT_TO_UINT(16, 16)
1505 FLOAT_TO_UINT(16, 32)
1506 FLOAT_TO_UINT(16, 64)
1508 FLOAT_TO_UINT(32, 16)
1509 FLOAT_TO_UINT(32, 32)
1510 FLOAT_TO_UINT(32, 64)
1512 FLOAT_TO_UINT(64, 16)
1513 FLOAT_TO_UINT(64, 32)
1514 FLOAT_TO_UINT(64, 64)
1516 #undef FLOAT_TO_UINT
1519 * Integer to float conversions
1521 * Returns the result of converting the two's complement integer `a'
1522 * to the floating-point format. The conversion is performed according
1523 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1526 static FloatParts
int_to_float(int64_t a
, float_status
*status
)
1530 r
.cls
= float_class_zero
;
1532 } else if (a
== (1ULL << 63)) {
1533 r
.cls
= float_class_normal
;
1535 r
.frac
= DECOMPOSED_IMPLICIT_BIT
;
1546 int shift
= clz64(f
) - 1;
1547 r
.cls
= float_class_normal
;
1548 r
.exp
= (DECOMPOSED_BINARY_POINT
- shift
);
1549 r
.frac
= f
<< shift
;
1555 float16
int64_to_float16(int64_t a
, float_status
*status
)
1557 FloatParts pa
= int_to_float(a
, status
);
1558 return float16_round_pack_canonical(pa
, status
);
1561 float16
int32_to_float16(int32_t a
, float_status
*status
)
1563 return int64_to_float16(a
, status
);
1566 float16
int16_to_float16(int16_t a
, float_status
*status
)
1568 return int64_to_float16(a
, status
);
1571 float32
int64_to_float32(int64_t a
, float_status
*status
)
1573 FloatParts pa
= int_to_float(a
, status
);
1574 return float32_round_pack_canonical(pa
, status
);
1577 float32
int32_to_float32(int32_t a
, float_status
*status
)
1579 return int64_to_float32(a
, status
);
1582 float32
int16_to_float32(int16_t a
, float_status
*status
)
1584 return int64_to_float32(a
, status
);
1587 float64
int64_to_float64(int64_t a
, float_status
*status
)
1589 FloatParts pa
= int_to_float(a
, status
);
1590 return float64_round_pack_canonical(pa
, status
);
1593 float64
int32_to_float64(int32_t a
, float_status
*status
)
1595 return int64_to_float64(a
, status
);
1598 float64
int16_to_float64(int16_t a
, float_status
*status
)
1600 return int64_to_float64(a
, status
);
1605 * Unsigned Integer to float conversions
1607 * Returns the result of converting the unsigned integer `a' to the
1608 * floating-point format. The conversion is performed according to the
1609 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1612 static FloatParts
uint_to_float(uint64_t a
, float_status
*status
)
1614 FloatParts r
= { .sign
= false};
1617 r
.cls
= float_class_zero
;
1619 int spare_bits
= clz64(a
) - 1;
1620 r
.cls
= float_class_normal
;
1621 r
.exp
= DECOMPOSED_BINARY_POINT
- spare_bits
;
1622 if (spare_bits
< 0) {
1623 shift64RightJamming(a
, -spare_bits
, &a
);
1626 r
.frac
= a
<< spare_bits
;
1633 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
1635 FloatParts pa
= uint_to_float(a
, status
);
1636 return float16_round_pack_canonical(pa
, status
);
1639 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
1641 return uint64_to_float16(a
, status
);
1644 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
1646 return uint64_to_float16(a
, status
);
1649 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
1651 FloatParts pa
= uint_to_float(a
, status
);
1652 return float32_round_pack_canonical(pa
, status
);
1655 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
1657 return uint64_to_float32(a
, status
);
1660 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
1662 return uint64_to_float32(a
, status
);
1665 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
1667 FloatParts pa
= uint_to_float(a
, status
);
1668 return float64_round_pack_canonical(pa
, status
);
1671 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
1673 return uint64_to_float64(a
, status
);
1676 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
1678 return uint64_to_float64(a
, status
);
1682 /* min() and max() functions. These can't be implemented as
1683 * 'compare and pick one input' because that would mishandle
1684 * NaNs and +0 vs -0.
1686 * minnum() and maxnum() functions. These are similar to the min()
1687 * and max() functions but if one of the arguments is a QNaN and
1688 * the other is numerical then the numerical argument is returned.
1689 * SNaNs will get quietened before being returned.
1690 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1691 * and maxNum() operations. min() and max() are the typical min/max
1692 * semantics provided by many CPUs which predate that specification.
1694 * minnummag() and maxnummag() functions correspond to minNumMag()
1695 * and minNumMag() from the IEEE-754 2008.
1697 static FloatParts
minmax_floats(FloatParts a
, FloatParts b
, bool ismin
,
1698 bool ieee
, bool ismag
, float_status
*s
)
1700 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
1702 /* Takes two floating-point values `a' and `b', one of
1703 * which is a NaN, and returns the appropriate NaN
1704 * result. If either `a' or `b' is a signaling NaN,
1705 * the invalid exception is raised.
1707 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
1708 return pick_nan(a
, b
, s
);
1709 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
1711 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
1715 return pick_nan(a
, b
, s
);
1720 case float_class_normal
:
1723 case float_class_inf
:
1726 case float_class_zero
:
1730 g_assert_not_reached();
1734 case float_class_normal
:
1737 case float_class_inf
:
1740 case float_class_zero
:
1744 g_assert_not_reached();
1748 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
1749 bool a_less
= a_exp
< b_exp
;
1750 if (a_exp
== b_exp
) {
1751 a_less
= a
.frac
< b
.frac
;
1753 return a_less
^ ismin
? b
: a
;
1756 if (a
.sign
== b
.sign
) {
1757 bool a_less
= a_exp
< b_exp
;
1758 if (a_exp
== b_exp
) {
1759 a_less
= a
.frac
< b
.frac
;
1761 return a
.sign
^ a_less
^ ismin
? b
: a
;
1763 return a
.sign
^ ismin
? b
: a
;
1768 #define MINMAX(sz, name, ismin, isiee, ismag) \
1769 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1772 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1773 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1774 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1776 return float ## sz ## _round_pack_canonical(pr, s); \
1779 MINMAX(16, min
, true, false, false)
1780 MINMAX(16, minnum
, true, true, false)
1781 MINMAX(16, minnummag
, true, true, true)
1782 MINMAX(16, max
, false, false, false)
1783 MINMAX(16, maxnum
, false, true, false)
1784 MINMAX(16, maxnummag
, false, true, true)
1786 MINMAX(32, min
, true, false, false)
1787 MINMAX(32, minnum
, true, true, false)
1788 MINMAX(32, minnummag
, true, true, true)
1789 MINMAX(32, max
, false, false, false)
1790 MINMAX(32, maxnum
, false, true, false)
1791 MINMAX(32, maxnummag
, false, true, true)
1793 MINMAX(64, min
, true, false, false)
1794 MINMAX(64, minnum
, true, true, false)
1795 MINMAX(64, minnummag
, true, true, true)
1796 MINMAX(64, max
, false, false, false)
1797 MINMAX(64, maxnum
, false, true, false)
1798 MINMAX(64, maxnummag
, false, true, true)
1802 /* Floating point compare */
1803 static int compare_floats(FloatParts a
, FloatParts b
, bool is_quiet
,
1806 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1808 a
.cls
== float_class_snan
||
1809 b
.cls
== float_class_snan
) {
1810 s
->float_exception_flags
|= float_flag_invalid
;
1812 return float_relation_unordered
;
1815 if (a
.cls
== float_class_zero
) {
1816 if (b
.cls
== float_class_zero
) {
1817 return float_relation_equal
;
1819 return b
.sign
? float_relation_greater
: float_relation_less
;
1820 } else if (b
.cls
== float_class_zero
) {
1821 return a
.sign
? float_relation_less
: float_relation_greater
;
1824 /* The only really important thing about infinity is its sign. If
1825 * both are infinities the sign marks the smallest of the two.
1827 if (a
.cls
== float_class_inf
) {
1828 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
1829 return float_relation_equal
;
1831 return a
.sign
? float_relation_less
: float_relation_greater
;
1832 } else if (b
.cls
== float_class_inf
) {
1833 return b
.sign
? float_relation_greater
: float_relation_less
;
1836 if (a
.sign
!= b
.sign
) {
1837 return a
.sign
? float_relation_less
: float_relation_greater
;
1840 if (a
.exp
== b
.exp
) {
1841 if (a
.frac
== b
.frac
) {
1842 return float_relation_equal
;
1845 return a
.frac
> b
.frac
?
1846 float_relation_less
: float_relation_greater
;
1848 return a
.frac
> b
.frac
?
1849 float_relation_greater
: float_relation_less
;
1853 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
1855 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
1860 #define COMPARE(sz) \
1861 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1864 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1865 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1866 return compare_floats(pa, pb, false, s); \
1868 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1871 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1872 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1873 return compare_floats(pa, pb, true, s); \
1882 /* Multiply A by 2 raised to the power N. */
1883 static FloatParts
scalbn_decomposed(FloatParts a
, int n
, float_status
*s
)
1885 if (unlikely(is_nan(a
.cls
))) {
1886 return return_nan(a
, s
);
1888 if (a
.cls
== float_class_normal
) {
1889 /* The largest float type (even though not supported by FloatParts)
1890 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
1891 * still allows rounding to infinity, without allowing overflow
1892 * within the int32_t that backs FloatParts.exp.
1894 n
= MIN(MAX(n
, -0x10000), 0x10000);
1900 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
1902 FloatParts pa
= float16_unpack_canonical(a
, status
);
1903 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
1904 return float16_round_pack_canonical(pr
, status
);
1907 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
1909 FloatParts pa
= float32_unpack_canonical(a
, status
);
1910 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
1911 return float32_round_pack_canonical(pr
, status
);
1914 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
1916 FloatParts pa
= float64_unpack_canonical(a
, status
);
1917 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
1918 return float64_round_pack_canonical(pr
, status
);
1924 * The old softfloat code did an approximation step before zeroing in
1925 * on the final result. However for simpleness we just compute the
1926 * square root by iterating down from the implicit bit to enough extra
1927 * bits to ensure we get a correctly rounded result.
1929 * This does mean however the calculation is slower than before,
1930 * especially for 64 bit floats.
1933 static FloatParts
sqrt_float(FloatParts a
, float_status
*s
, const FloatFmt
*p
)
1935 uint64_t a_frac
, r_frac
, s_frac
;
1938 if (is_nan(a
.cls
)) {
1939 return return_nan(a
, s
);
1941 if (a
.cls
== float_class_zero
) {
1942 return a
; /* sqrt(+-0) = +-0 */
1945 s
->float_exception_flags
|= float_flag_invalid
;
1946 a
.cls
= float_class_dnan
;
1949 if (a
.cls
== float_class_inf
) {
1950 return a
; /* sqrt(+inf) = +inf */
1953 assert(a
.cls
== float_class_normal
);
1955 /* We need two overflow bits at the top. Adding room for that is a
1956 * right shift. If the exponent is odd, we can discard the low bit
1957 * by multiplying the fraction by 2; that's a left shift. Combine
1958 * those and we shift right if the exponent is even.
1966 /* Bit-by-bit computation of sqrt. */
1970 /* Iterate from implicit bit down to the 3 extra bits to compute a
1971 * properly rounded result. Remember we've inserted one more bit
1972 * at the top, so these positions are one less.
1974 bit
= DECOMPOSED_BINARY_POINT
- 1;
1975 last_bit
= MAX(p
->frac_shift
- 4, 0);
1977 uint64_t q
= 1ULL << bit
;
1978 uint64_t t_frac
= s_frac
+ q
;
1979 if (t_frac
<= a_frac
) {
1980 s_frac
= t_frac
+ q
;
1985 } while (--bit
>= last_bit
);
1987 /* Undo the right shift done above. If there is any remaining
1988 * fraction, the result is inexact. Set the sticky bit.
1990 a
.frac
= (r_frac
<< 1) + (a_frac
!= 0);
1995 float16
__attribute__((flatten
)) float16_sqrt(float16 a
, float_status
*status
)
1997 FloatParts pa
= float16_unpack_canonical(a
, status
);
1998 FloatParts pr
= sqrt_float(pa
, status
, &float16_params
);
1999 return float16_round_pack_canonical(pr
, status
);
2002 float32
__attribute__((flatten
)) float32_sqrt(float32 a
, float_status
*status
)
2004 FloatParts pa
= float32_unpack_canonical(a
, status
);
2005 FloatParts pr
= sqrt_float(pa
, status
, &float32_params
);
2006 return float32_round_pack_canonical(pr
, status
);
2009 float64
__attribute__((flatten
)) float64_sqrt(float64 a
, float_status
*status
)
2011 FloatParts pa
= float64_unpack_canonical(a
, status
);
2012 FloatParts pr
= sqrt_float(pa
, status
, &float64_params
);
2013 return float64_round_pack_canonical(pr
, status
);
2017 /*----------------------------------------------------------------------------
2018 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2019 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2020 | input. If `zSign' is 1, the input is negated before being converted to an
2021 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2022 | is simply rounded to an integer, with the inexact exception raised if the
2023 | input cannot be represented exactly as an integer. However, if the fixed-
2024 | point input is too large, the invalid exception is raised and the largest
2025 | positive or negative integer is returned.
2026 *----------------------------------------------------------------------------*/
2028 static int32_t roundAndPackInt32(flag zSign
, uint64_t absZ
, float_status
*status
)
2030 int8_t roundingMode
;
2031 flag roundNearestEven
;
2032 int8_t roundIncrement
, roundBits
;
2035 roundingMode
= status
->float_rounding_mode
;
2036 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2037 switch (roundingMode
) {
2038 case float_round_nearest_even
:
2039 case float_round_ties_away
:
2040 roundIncrement
= 0x40;
2042 case float_round_to_zero
:
2045 case float_round_up
:
2046 roundIncrement
= zSign
? 0 : 0x7f;
2048 case float_round_down
:
2049 roundIncrement
= zSign
? 0x7f : 0;
2054 roundBits
= absZ
& 0x7F;
2055 absZ
= ( absZ
+ roundIncrement
)>>7;
2056 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
2058 if ( zSign
) z
= - z
;
2059 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
2060 float_raise(float_flag_invalid
, status
);
2061 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
2064 status
->float_exception_flags
|= float_flag_inexact
;
2070 /*----------------------------------------------------------------------------
2071 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2072 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2073 | and returns the properly rounded 64-bit integer corresponding to the input.
2074 | If `zSign' is 1, the input is negated before being converted to an integer.
2075 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2076 | the inexact exception raised if the input cannot be represented exactly as
2077 | an integer. However, if the fixed-point input is too large, the invalid
2078 | exception is raised and the largest positive or negative integer is
2080 *----------------------------------------------------------------------------*/
2082 static int64_t roundAndPackInt64(flag zSign
, uint64_t absZ0
, uint64_t absZ1
,
2083 float_status
*status
)
2085 int8_t roundingMode
;
2086 flag roundNearestEven
, increment
;
2089 roundingMode
= status
->float_rounding_mode
;
2090 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2091 switch (roundingMode
) {
2092 case float_round_nearest_even
:
2093 case float_round_ties_away
:
2094 increment
= ((int64_t) absZ1
< 0);
2096 case float_round_to_zero
:
2099 case float_round_up
:
2100 increment
= !zSign
&& absZ1
;
2102 case float_round_down
:
2103 increment
= zSign
&& absZ1
;
2110 if ( absZ0
== 0 ) goto overflow
;
2111 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
2114 if ( zSign
) z
= - z
;
2115 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
2117 float_raise(float_flag_invalid
, status
);
2119 zSign
? (int64_t) LIT64( 0x8000000000000000 )
2120 : LIT64( 0x7FFFFFFFFFFFFFFF );
2123 status
->float_exception_flags
|= float_flag_inexact
;
2129 /*----------------------------------------------------------------------------
2130 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2131 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2132 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2133 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2134 | with the inexact exception raised if the input cannot be represented exactly
2135 | as an integer. However, if the fixed-point input is too large, the invalid
2136 | exception is raised and the largest unsigned integer is returned.
2137 *----------------------------------------------------------------------------*/
2139 static int64_t roundAndPackUint64(flag zSign
, uint64_t absZ0
,
2140 uint64_t absZ1
, float_status
*status
)
2142 int8_t roundingMode
;
2143 flag roundNearestEven
, increment
;
2145 roundingMode
= status
->float_rounding_mode
;
2146 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
2147 switch (roundingMode
) {
2148 case float_round_nearest_even
:
2149 case float_round_ties_away
:
2150 increment
= ((int64_t)absZ1
< 0);
2152 case float_round_to_zero
:
2155 case float_round_up
:
2156 increment
= !zSign
&& absZ1
;
2158 case float_round_down
:
2159 increment
= zSign
&& absZ1
;
2167 float_raise(float_flag_invalid
, status
);
2168 return LIT64(0xFFFFFFFFFFFFFFFF);
2170 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
2173 if (zSign
&& absZ0
) {
2174 float_raise(float_flag_invalid
, status
);
2179 status
->float_exception_flags
|= float_flag_inexact
;
2184 /*----------------------------------------------------------------------------
2185 | If `a' is denormal and we are in flush-to-zero mode then set the
2186 | input-denormal exception and return zero. Otherwise just return the value.
2187 *----------------------------------------------------------------------------*/
2188 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
2190 if (status
->flush_inputs_to_zero
) {
2191 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
2192 float_raise(float_flag_input_denormal
, status
);
2193 return make_float32(float32_val(a
) & 0x80000000);
2199 /*----------------------------------------------------------------------------
2200 | Normalizes the subnormal single-precision floating-point value represented
2201 | by the denormalized significand `aSig'. The normalized exponent and
2202 | significand are stored at the locations pointed to by `zExpPtr' and
2203 | `zSigPtr', respectively.
2204 *----------------------------------------------------------------------------*/
2207 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
2211 shiftCount
= countLeadingZeros32( aSig
) - 8;
2212 *zSigPtr
= aSig
<<shiftCount
;
2213 *zExpPtr
= 1 - shiftCount
;
2217 /*----------------------------------------------------------------------------
2218 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2219 | and significand `zSig', and returns the proper single-precision floating-
2220 | point value corresponding to the abstract input. Ordinarily, the abstract
2221 | value is simply rounded and packed into the single-precision format, with
2222 | the inexact exception raised if the abstract input cannot be represented
2223 | exactly. However, if the abstract value is too large, the overflow and
2224 | inexact exceptions are raised and an infinity or maximal finite value is
2225 | returned. If the abstract value is too small, the input value is rounded to
2226 | a subnormal number, and the underflow and inexact exceptions are raised if
2227 | the abstract input cannot be represented exactly as a subnormal single-
2228 | precision floating-point number.
2229 | The input significand `zSig' has its binary point between bits 30
2230 | and 29, which is 7 bits to the left of the usual location. This shifted
2231 | significand must be normalized or smaller. If `zSig' is not normalized,
2232 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2233 | and it must not require rounding. In the usual case that `zSig' is
2234 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2235 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2236 | Binary Floating-Point Arithmetic.
2237 *----------------------------------------------------------------------------*/
2239 static float32
roundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
2240 float_status
*status
)
2242 int8_t roundingMode
;
2243 flag roundNearestEven
;
2244 int8_t roundIncrement
, roundBits
;
2247 roundingMode
= status
->float_rounding_mode
;
2248 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2249 switch (roundingMode
) {
2250 case float_round_nearest_even
:
2251 case float_round_ties_away
:
2252 roundIncrement
= 0x40;
2254 case float_round_to_zero
:
2257 case float_round_up
:
2258 roundIncrement
= zSign
? 0 : 0x7f;
2260 case float_round_down
:
2261 roundIncrement
= zSign
? 0x7f : 0;
2267 roundBits
= zSig
& 0x7F;
2268 if ( 0xFD <= (uint16_t) zExp
) {
2269 if ( ( 0xFD < zExp
)
2270 || ( ( zExp
== 0xFD )
2271 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
2273 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2274 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
2277 if (status
->flush_to_zero
) {
2278 float_raise(float_flag_output_denormal
, status
);
2279 return packFloat32(zSign
, 0, 0);
2282 (status
->float_detect_tininess
2283 == float_tininess_before_rounding
)
2285 || ( zSig
+ roundIncrement
< 0x80000000 );
2286 shift32RightJamming( zSig
, - zExp
, &zSig
);
2288 roundBits
= zSig
& 0x7F;
2289 if (isTiny
&& roundBits
) {
2290 float_raise(float_flag_underflow
, status
);
2295 status
->float_exception_flags
|= float_flag_inexact
;
2297 zSig
= ( zSig
+ roundIncrement
)>>7;
2298 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
2299 if ( zSig
== 0 ) zExp
= 0;
2300 return packFloat32( zSign
, zExp
, zSig
);
2304 /*----------------------------------------------------------------------------
2305 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2306 | and significand `zSig', and returns the proper single-precision floating-
2307 | point value corresponding to the abstract input. This routine is just like
2308 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2309 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2310 | floating-point exponent.
2311 *----------------------------------------------------------------------------*/
2314 normalizeRoundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
2315 float_status
*status
)
2319 shiftCount
= countLeadingZeros32( zSig
) - 1;
2320 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
2325 /*----------------------------------------------------------------------------
2326 | If `a' is denormal and we are in flush-to-zero mode then set the
2327 | input-denormal exception and return zero. Otherwise just return the value.
2328 *----------------------------------------------------------------------------*/
2329 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
2331 if (status
->flush_inputs_to_zero
) {
2332 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
2333 float_raise(float_flag_input_denormal
, status
);
2334 return make_float64(float64_val(a
) & (1ULL << 63));
2340 /*----------------------------------------------------------------------------
2341 | Normalizes the subnormal double-precision floating-point value represented
2342 | by the denormalized significand `aSig'. The normalized exponent and
2343 | significand are stored at the locations pointed to by `zExpPtr' and
2344 | `zSigPtr', respectively.
2345 *----------------------------------------------------------------------------*/
2348 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
2352 shiftCount
= countLeadingZeros64( aSig
) - 11;
2353 *zSigPtr
= aSig
<<shiftCount
;
2354 *zExpPtr
= 1 - shiftCount
;
2358 /*----------------------------------------------------------------------------
2359 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2360 | double-precision floating-point value, returning the result. After being
2361 | shifted into the proper positions, the three fields are simply added
2362 | together to form the result. This means that any integer portion of `zSig'
2363 | will be added into the exponent. Since a properly normalized significand
2364 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2365 | than the desired result exponent whenever `zSig' is a complete, normalized
2367 *----------------------------------------------------------------------------*/
2369 static inline float64
packFloat64(flag zSign
, int zExp
, uint64_t zSig
)
2372 return make_float64(
2373 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
2377 /*----------------------------------------------------------------------------
2378 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2379 | and significand `zSig', and returns the proper double-precision floating-
2380 | point value corresponding to the abstract input. Ordinarily, the abstract
2381 | value is simply rounded and packed into the double-precision format, with
2382 | the inexact exception raised if the abstract input cannot be represented
2383 | exactly. However, if the abstract value is too large, the overflow and
2384 | inexact exceptions are raised and an infinity or maximal finite value is
2385 | returned. If the abstract value is too small, the input value is rounded to
2386 | a subnormal number, and the underflow and inexact exceptions are raised if
2387 | the abstract input cannot be represented exactly as a subnormal double-
2388 | precision floating-point number.
2389 | The input significand `zSig' has its binary point between bits 62
2390 | and 61, which is 10 bits to the left of the usual location. This shifted
2391 | significand must be normalized or smaller. If `zSig' is not normalized,
2392 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2393 | and it must not require rounding. In the usual case that `zSig' is
2394 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2395 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2396 | Binary Floating-Point Arithmetic.
2397 *----------------------------------------------------------------------------*/
2399 static float64
roundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
2400 float_status
*status
)
2402 int8_t roundingMode
;
2403 flag roundNearestEven
;
2404 int roundIncrement
, roundBits
;
2407 roundingMode
= status
->float_rounding_mode
;
2408 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2409 switch (roundingMode
) {
2410 case float_round_nearest_even
:
2411 case float_round_ties_away
:
2412 roundIncrement
= 0x200;
2414 case float_round_to_zero
:
2417 case float_round_up
:
2418 roundIncrement
= zSign
? 0 : 0x3ff;
2420 case float_round_down
:
2421 roundIncrement
= zSign
? 0x3ff : 0;
2423 case float_round_to_odd
:
2424 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
2429 roundBits
= zSig
& 0x3FF;
2430 if ( 0x7FD <= (uint16_t) zExp
) {
2431 if ( ( 0x7FD < zExp
)
2432 || ( ( zExp
== 0x7FD )
2433 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
2435 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
2436 roundIncrement
!= 0;
2437 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2438 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
2441 if (status
->flush_to_zero
) {
2442 float_raise(float_flag_output_denormal
, status
);
2443 return packFloat64(zSign
, 0, 0);
2446 (status
->float_detect_tininess
2447 == float_tininess_before_rounding
)
2449 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
2450 shift64RightJamming( zSig
, - zExp
, &zSig
);
2452 roundBits
= zSig
& 0x3FF;
2453 if (isTiny
&& roundBits
) {
2454 float_raise(float_flag_underflow
, status
);
2456 if (roundingMode
== float_round_to_odd
) {
2458 * For round-to-odd case, the roundIncrement depends on
2459 * zSig which just changed.
2461 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
2466 status
->float_exception_flags
|= float_flag_inexact
;
2468 zSig
= ( zSig
+ roundIncrement
)>>10;
2469 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
2470 if ( zSig
== 0 ) zExp
= 0;
2471 return packFloat64( zSign
, zExp
, zSig
);
2475 /*----------------------------------------------------------------------------
2476 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2477 | and significand `zSig', and returns the proper double-precision floating-
2478 | point value corresponding to the abstract input. This routine is just like
2479 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2480 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2481 | floating-point exponent.
2482 *----------------------------------------------------------------------------*/
2485 normalizeRoundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
2486 float_status
*status
)
2490 shiftCount
= countLeadingZeros64( zSig
) - 1;
2491 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
2496 /*----------------------------------------------------------------------------
2497 | Normalizes the subnormal extended double-precision floating-point value
2498 | represented by the denormalized significand `aSig'. The normalized exponent
2499 | and significand are stored at the locations pointed to by `zExpPtr' and
2500 | `zSigPtr', respectively.
2501 *----------------------------------------------------------------------------*/
2503 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
2508 shiftCount
= countLeadingZeros64( aSig
);
2509 *zSigPtr
= aSig
<<shiftCount
;
2510 *zExpPtr
= 1 - shiftCount
;
2513 /*----------------------------------------------------------------------------
2514 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2515 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2516 | and returns the proper extended double-precision floating-point value
2517 | corresponding to the abstract input. Ordinarily, the abstract value is
2518 | rounded and packed into the extended double-precision format, with the
2519 | inexact exception raised if the abstract input cannot be represented
2520 | exactly. However, if the abstract value is too large, the overflow and
2521 | inexact exceptions are raised and an infinity or maximal finite value is
2522 | returned. If the abstract value is too small, the input value is rounded to
2523 | a subnormal number, and the underflow and inexact exceptions are raised if
2524 | the abstract input cannot be represented exactly as a subnormal extended
2525 | double-precision floating-point number.
2526 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2527 | number of bits as single or double precision, respectively. Otherwise, the
2528 | result is rounded to the full precision of the extended double-precision
2530 | The input significand must be normalized or smaller. If the input
2531 | significand is not normalized, `zExp' must be 0; in that case, the result
2532 | returned is a subnormal number, and it must not require rounding. The
2533 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2534 | Floating-Point Arithmetic.
2535 *----------------------------------------------------------------------------*/
2537 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, flag zSign
,
2538 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
2539 float_status
*status
)
2541 int8_t roundingMode
;
2542 flag roundNearestEven
, increment
, isTiny
;
2543 int64_t roundIncrement
, roundMask
, roundBits
;
2545 roundingMode
= status
->float_rounding_mode
;
2546 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2547 if ( roundingPrecision
== 80 ) goto precision80
;
2548 if ( roundingPrecision
== 64 ) {
2549 roundIncrement
= LIT64( 0x0000000000000400 );
2550 roundMask
= LIT64( 0x00000000000007FF );
2552 else if ( roundingPrecision
== 32 ) {
2553 roundIncrement
= LIT64( 0x0000008000000000 );
2554 roundMask
= LIT64( 0x000000FFFFFFFFFF );
2559 zSig0
|= ( zSig1
!= 0 );
2560 switch (roundingMode
) {
2561 case float_round_nearest_even
:
2562 case float_round_ties_away
:
2564 case float_round_to_zero
:
2567 case float_round_up
:
2568 roundIncrement
= zSign
? 0 : roundMask
;
2570 case float_round_down
:
2571 roundIncrement
= zSign
? roundMask
: 0;
2576 roundBits
= zSig0
& roundMask
;
2577 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
2578 if ( ( 0x7FFE < zExp
)
2579 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
2584 if (status
->flush_to_zero
) {
2585 float_raise(float_flag_output_denormal
, status
);
2586 return packFloatx80(zSign
, 0, 0);
2589 (status
->float_detect_tininess
2590 == float_tininess_before_rounding
)
2592 || ( zSig0
<= zSig0
+ roundIncrement
);
2593 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
2595 roundBits
= zSig0
& roundMask
;
2596 if (isTiny
&& roundBits
) {
2597 float_raise(float_flag_underflow
, status
);
2600 status
->float_exception_flags
|= float_flag_inexact
;
2602 zSig0
+= roundIncrement
;
2603 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
2604 roundIncrement
= roundMask
+ 1;
2605 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
2606 roundMask
|= roundIncrement
;
2608 zSig0
&= ~ roundMask
;
2609 return packFloatx80( zSign
, zExp
, zSig0
);
2613 status
->float_exception_flags
|= float_flag_inexact
;
2615 zSig0
+= roundIncrement
;
2616 if ( zSig0
< roundIncrement
) {
2618 zSig0
= LIT64( 0x8000000000000000 );
2620 roundIncrement
= roundMask
+ 1;
2621 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
2622 roundMask
|= roundIncrement
;
2624 zSig0
&= ~ roundMask
;
2625 if ( zSig0
== 0 ) zExp
= 0;
2626 return packFloatx80( zSign
, zExp
, zSig0
);
2628 switch (roundingMode
) {
2629 case float_round_nearest_even
:
2630 case float_round_ties_away
:
2631 increment
= ((int64_t)zSig1
< 0);
2633 case float_round_to_zero
:
2636 case float_round_up
:
2637 increment
= !zSign
&& zSig1
;
2639 case float_round_down
:
2640 increment
= zSign
&& zSig1
;
2645 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
2646 if ( ( 0x7FFE < zExp
)
2647 || ( ( zExp
== 0x7FFE )
2648 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
2654 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2655 if ( ( roundingMode
== float_round_to_zero
)
2656 || ( zSign
&& ( roundingMode
== float_round_up
) )
2657 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
2659 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
2661 return packFloatx80(zSign
,
2662 floatx80_infinity_high
,
2663 floatx80_infinity_low
);
2667 (status
->float_detect_tininess
2668 == float_tininess_before_rounding
)
2671 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
2672 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
2674 if (isTiny
&& zSig1
) {
2675 float_raise(float_flag_underflow
, status
);
2678 status
->float_exception_flags
|= float_flag_inexact
;
2680 switch (roundingMode
) {
2681 case float_round_nearest_even
:
2682 case float_round_ties_away
:
2683 increment
= ((int64_t)zSig1
< 0);
2685 case float_round_to_zero
:
2688 case float_round_up
:
2689 increment
= !zSign
&& zSig1
;
2691 case float_round_down
:
2692 increment
= zSign
&& zSig1
;
2700 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
2701 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
2703 return packFloatx80( zSign
, zExp
, zSig0
);
2707 status
->float_exception_flags
|= float_flag_inexact
;
2713 zSig0
= LIT64( 0x8000000000000000 );
2716 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
2720 if ( zSig0
== 0 ) zExp
= 0;
2722 return packFloatx80( zSign
, zExp
, zSig0
);
2726 /*----------------------------------------------------------------------------
2727 | Takes an abstract floating-point value having sign `zSign', exponent
2728 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2729 | and returns the proper extended double-precision floating-point value
2730 | corresponding to the abstract input. This routine is just like
2731 | `roundAndPackFloatx80' except that the input significand does not have to be
2733 *----------------------------------------------------------------------------*/
2735 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
2736 flag zSign
, int32_t zExp
,
2737 uint64_t zSig0
, uint64_t zSig1
,
2738 float_status
*status
)
2747 shiftCount
= countLeadingZeros64( zSig0
);
2748 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
2750 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
2751 zSig0
, zSig1
, status
);
2755 /*----------------------------------------------------------------------------
2756 | Returns the least-significant 64 fraction bits of the quadruple-precision
2757 | floating-point value `a'.
2758 *----------------------------------------------------------------------------*/
2760 static inline uint64_t extractFloat128Frac1( float128 a
)
2767 /*----------------------------------------------------------------------------
2768 | Returns the most-significant 48 fraction bits of the quadruple-precision
2769 | floating-point value `a'.
2770 *----------------------------------------------------------------------------*/
2772 static inline uint64_t extractFloat128Frac0( float128 a
)
2775 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
2779 /*----------------------------------------------------------------------------
2780 | Returns the exponent bits of the quadruple-precision floating-point value
2782 *----------------------------------------------------------------------------*/
2784 static inline int32_t extractFloat128Exp( float128 a
)
2787 return ( a
.high
>>48 ) & 0x7FFF;
2791 /*----------------------------------------------------------------------------
2792 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2793 *----------------------------------------------------------------------------*/
2795 static inline flag
extractFloat128Sign( float128 a
)
2802 /*----------------------------------------------------------------------------
2803 | Normalizes the subnormal quadruple-precision floating-point value
2804 | represented by the denormalized significand formed by the concatenation of
2805 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2806 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2807 | significand are stored at the location pointed to by `zSig0Ptr', and the
2808 | least significant 64 bits of the normalized significand are stored at the
2809 | location pointed to by `zSig1Ptr'.
2810 *----------------------------------------------------------------------------*/
2813 normalizeFloat128Subnormal(
2824 shiftCount
= countLeadingZeros64( aSig1
) - 15;
2825 if ( shiftCount
< 0 ) {
2826 *zSig0Ptr
= aSig1
>>( - shiftCount
);
2827 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
2830 *zSig0Ptr
= aSig1
<<shiftCount
;
2833 *zExpPtr
= - shiftCount
- 63;
2836 shiftCount
= countLeadingZeros64( aSig0
) - 15;
2837 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
2838 *zExpPtr
= 1 - shiftCount
;
2843 /*----------------------------------------------------------------------------
2844 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2845 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2846 | floating-point value, returning the result. After being shifted into the
2847 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2848 | added together to form the most significant 32 bits of the result. This
2849 | means that any integer portion of `zSig0' will be added into the exponent.
2850 | Since a properly normalized significand will have an integer portion equal
2851 | to 1, the `zExp' input should be 1 less than the desired result exponent
2852 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2854 *----------------------------------------------------------------------------*/
2856 static inline float128
2857 packFloat128( flag zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
2862 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
2867 /*----------------------------------------------------------------------------
2868 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2869 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2870 | and `zSig2', and returns the proper quadruple-precision floating-point value
2871 | corresponding to the abstract input. Ordinarily, the abstract value is
2872 | simply rounded and packed into the quadruple-precision format, with the
2873 | inexact exception raised if the abstract input cannot be represented
2874 | exactly. However, if the abstract value is too large, the overflow and
2875 | inexact exceptions are raised and an infinity or maximal finite value is
2876 | returned. If the abstract value is too small, the input value is rounded to
2877 | a subnormal number, and the underflow and inexact exceptions are raised if
2878 | the abstract input cannot be represented exactly as a subnormal quadruple-
2879 | precision floating-point number.
2880 | The input significand must be normalized or smaller. If the input
2881 | significand is not normalized, `zExp' must be 0; in that case, the result
2882 | returned is a subnormal number, and it must not require rounding. In the
2883 | usual case that the input significand is normalized, `zExp' must be 1 less
2884 | than the ``true'' floating-point exponent. The handling of underflow and
2885 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2886 *----------------------------------------------------------------------------*/
2888 static float128
roundAndPackFloat128(flag zSign
, int32_t zExp
,
2889 uint64_t zSig0
, uint64_t zSig1
,
2890 uint64_t zSig2
, float_status
*status
)
2892 int8_t roundingMode
;
2893 flag roundNearestEven
, increment
, isTiny
;
2895 roundingMode
= status
->float_rounding_mode
;
2896 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2897 switch (roundingMode
) {
2898 case float_round_nearest_even
:
2899 case float_round_ties_away
:
2900 increment
= ((int64_t)zSig2
< 0);
2902 case float_round_to_zero
:
2905 case float_round_up
:
2906 increment
= !zSign
&& zSig2
;
2908 case float_round_down
:
2909 increment
= zSign
&& zSig2
;
2911 case float_round_to_odd
:
2912 increment
= !(zSig1
& 0x1) && zSig2
;
2917 if ( 0x7FFD <= (uint32_t) zExp
) {
2918 if ( ( 0x7FFD < zExp
)
2919 || ( ( zExp
== 0x7FFD )
2921 LIT64( 0x0001FFFFFFFFFFFF ),
2922 LIT64( 0xFFFFFFFFFFFFFFFF ),
2929 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2930 if ( ( roundingMode
== float_round_to_zero
)
2931 || ( zSign
&& ( roundingMode
== float_round_up
) )
2932 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
2933 || (roundingMode
== float_round_to_odd
)
2939 LIT64( 0x0000FFFFFFFFFFFF ),
2940 LIT64( 0xFFFFFFFFFFFFFFFF )
2943 return packFloat128( zSign
, 0x7FFF, 0, 0 );
2946 if (status
->flush_to_zero
) {
2947 float_raise(float_flag_output_denormal
, status
);
2948 return packFloat128(zSign
, 0, 0, 0);
2951 (status
->float_detect_tininess
2952 == float_tininess_before_rounding
)
2958 LIT64( 0x0001FFFFFFFFFFFF ),
2959 LIT64( 0xFFFFFFFFFFFFFFFF )
2961 shift128ExtraRightJamming(
2962 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
2964 if (isTiny
&& zSig2
) {
2965 float_raise(float_flag_underflow
, status
);
2967 switch (roundingMode
) {
2968 case float_round_nearest_even
:
2969 case float_round_ties_away
:
2970 increment
= ((int64_t)zSig2
< 0);
2972 case float_round_to_zero
:
2975 case float_round_up
:
2976 increment
= !zSign
&& zSig2
;
2978 case float_round_down
:
2979 increment
= zSign
&& zSig2
;
2981 case float_round_to_odd
:
2982 increment
= !(zSig1
& 0x1) && zSig2
;
2990 status
->float_exception_flags
|= float_flag_inexact
;
2993 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
2994 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
2997 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
2999 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
3003 /*----------------------------------------------------------------------------
3004 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3005 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3006 | returns the proper quadruple-precision floating-point value corresponding
3007 | to the abstract input. This routine is just like `roundAndPackFloat128'
3008 | except that the input significand has fewer bits and does not have to be
3009 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3011 *----------------------------------------------------------------------------*/
3013 static float128
normalizeRoundAndPackFloat128(flag zSign
, int32_t zExp
,
3014 uint64_t zSig0
, uint64_t zSig1
,
3015 float_status
*status
)
3025 shiftCount
= countLeadingZeros64( zSig0
) - 15;
3026 if ( 0 <= shiftCount
) {
3028 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3031 shift128ExtraRightJamming(
3032 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
3035 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
3040 /*----------------------------------------------------------------------------
3041 | Returns the result of converting the 32-bit two's complement integer `a'
3042 | to the extended double-precision floating-point format. The conversion
3043 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3045 *----------------------------------------------------------------------------*/
3047 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
3054 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
3056 absA
= zSign
? - a
: a
;
3057 shiftCount
= countLeadingZeros32( absA
) + 32;
3059 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
3063 /*----------------------------------------------------------------------------
3064 | Returns the result of converting the 32-bit two's complement integer `a' to
3065 | the quadruple-precision floating-point format. The conversion is performed
3066 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3067 *----------------------------------------------------------------------------*/
3069 float128
int32_to_float128(int32_t a
, float_status
*status
)
3076 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
3078 absA
= zSign
? - a
: a
;
3079 shiftCount
= countLeadingZeros32( absA
) + 17;
3081 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
3085 /*----------------------------------------------------------------------------
3086 | Returns the result of converting the 64-bit two's complement integer `a'
3087 | to the extended double-precision floating-point format. The conversion
3088 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3090 *----------------------------------------------------------------------------*/
3092 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
3098 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
3100 absA
= zSign
? - a
: a
;
3101 shiftCount
= countLeadingZeros64( absA
);
3102 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
3106 /*----------------------------------------------------------------------------
3107 | Returns the result of converting the 64-bit two's complement integer `a' to
3108 | the quadruple-precision floating-point format. The conversion is performed
3109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3110 *----------------------------------------------------------------------------*/
3112 float128
int64_to_float128(int64_t a
, float_status
*status
)
3118 uint64_t zSig0
, zSig1
;
3120 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
3122 absA
= zSign
? - a
: a
;
3123 shiftCount
= countLeadingZeros64( absA
) + 49;
3124 zExp
= 0x406E - shiftCount
;
3125 if ( 64 <= shiftCount
) {
3134 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3135 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
3139 /*----------------------------------------------------------------------------
3140 | Returns the result of converting the 64-bit unsigned integer `a'
3141 | to the quadruple-precision floating-point format. The conversion is performed
3142 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3143 *----------------------------------------------------------------------------*/
3145 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
3148 return float128_zero
;
3150 return normalizeRoundAndPackFloat128(0, 0x406E, a
, 0, status
);
3156 /*----------------------------------------------------------------------------
3157 | Returns the result of converting the single-precision floating-point value
3158 | `a' to the double-precision floating-point format. The conversion is
3159 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3161 *----------------------------------------------------------------------------*/
3163 float64
float32_to_float64(float32 a
, float_status
*status
)
3168 a
= float32_squash_input_denormal(a
, status
);
3170 aSig
= extractFloat32Frac( a
);
3171 aExp
= extractFloat32Exp( a
);
3172 aSign
= extractFloat32Sign( a
);
3173 if ( aExp
== 0xFF ) {
3175 return commonNaNToFloat64(float32ToCommonNaN(a
, status
), status
);
3177 return packFloat64( aSign
, 0x7FF, 0 );
3180 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
3181 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3184 return packFloat64( aSign
, aExp
+ 0x380, ( (uint64_t) aSig
)<<29 );
3188 /*----------------------------------------------------------------------------
3189 | Returns the result of converting the single-precision floating-point value
3190 | `a' to the extended double-precision floating-point format. The conversion
3191 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3193 *----------------------------------------------------------------------------*/
3195 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
3201 a
= float32_squash_input_denormal(a
, status
);
3202 aSig
= extractFloat32Frac( a
);
3203 aExp
= extractFloat32Exp( a
);
3204 aSign
= extractFloat32Sign( a
);
3205 if ( aExp
== 0xFF ) {
3207 return commonNaNToFloatx80(float32ToCommonNaN(a
, status
), status
);
3209 return packFloatx80(aSign
,
3210 floatx80_infinity_high
,
3211 floatx80_infinity_low
);
3214 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
3215 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3218 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
3222 /*----------------------------------------------------------------------------
3223 | Returns the result of converting the single-precision floating-point value
3224 | `a' to the double-precision floating-point format. The conversion is
3225 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3227 *----------------------------------------------------------------------------*/
3229 float128
float32_to_float128(float32 a
, float_status
*status
)
3235 a
= float32_squash_input_denormal(a
, status
);
3236 aSig
= extractFloat32Frac( a
);
3237 aExp
= extractFloat32Exp( a
);
3238 aSign
= extractFloat32Sign( a
);
3239 if ( aExp
== 0xFF ) {
3241 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
3243 return packFloat128( aSign
, 0x7FFF, 0, 0 );
3246 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
3247 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3250 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
3254 /*----------------------------------------------------------------------------
3255 | Returns the remainder of the single-precision floating-point value `a'
3256 | with respect to the corresponding value `b'. The operation is performed
3257 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3258 *----------------------------------------------------------------------------*/
3260 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
3263 int aExp
, bExp
, expDiff
;
3264 uint32_t aSig
, bSig
;
3266 uint64_t aSig64
, bSig64
, q64
;
3267 uint32_t alternateASig
;
3269 a
= float32_squash_input_denormal(a
, status
);
3270 b
= float32_squash_input_denormal(b
, status
);
3272 aSig
= extractFloat32Frac( a
);
3273 aExp
= extractFloat32Exp( a
);
3274 aSign
= extractFloat32Sign( a
);
3275 bSig
= extractFloat32Frac( b
);
3276 bExp
= extractFloat32Exp( b
);
3277 if ( aExp
== 0xFF ) {
3278 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
3279 return propagateFloat32NaN(a
, b
, status
);
3281 float_raise(float_flag_invalid
, status
);
3282 return float32_default_nan(status
);
3284 if ( bExp
== 0xFF ) {
3286 return propagateFloat32NaN(a
, b
, status
);
3292 float_raise(float_flag_invalid
, status
);
3293 return float32_default_nan(status
);
3295 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
3298 if ( aSig
== 0 ) return a
;
3299 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3301 expDiff
= aExp
- bExp
;
3304 if ( expDiff
< 32 ) {
3307 if ( expDiff
< 0 ) {
3308 if ( expDiff
< -1 ) return a
;
3311 q
= ( bSig
<= aSig
);
3312 if ( q
) aSig
-= bSig
;
3313 if ( 0 < expDiff
) {
3314 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
3317 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3325 if ( bSig
<= aSig
) aSig
-= bSig
;
3326 aSig64
= ( (uint64_t) aSig
)<<40;
3327 bSig64
= ( (uint64_t) bSig
)<<40;
3329 while ( 0 < expDiff
) {
3330 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
3331 q64
= ( 2 < q64
) ? q64
- 2 : 0;
3332 aSig64
= - ( ( bSig
* q64
)<<38 );
3336 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
3337 q64
= ( 2 < q64
) ? q64
- 2 : 0;
3338 q
= q64
>>( 64 - expDiff
);
3340 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
3343 alternateASig
= aSig
;
3346 } while ( 0 <= (int32_t) aSig
);
3347 sigMean
= aSig
+ alternateASig
;
3348 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3349 aSig
= alternateASig
;
3351 zSign
= ( (int32_t) aSig
< 0 );
3352 if ( zSign
) aSig
= - aSig
;
3353 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
3358 /*----------------------------------------------------------------------------
3359 | Returns the binary exponential of the single-precision floating-point value
3360 | `a'. The operation is performed according to the IEC/IEEE Standard for
3361 | Binary Floating-Point Arithmetic.
3363 | Uses the following identities:
3365 | 1. -------------------------------------------------------------------------
3369 | 2. -------------------------------------------------------------------------
3372 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3374 *----------------------------------------------------------------------------*/
3376 static const float64 float32_exp2_coefficients
[15] =
3378 const_float64( 0x3ff0000000000000ll
), /* 1 */
3379 const_float64( 0x3fe0000000000000ll
), /* 2 */
3380 const_float64( 0x3fc5555555555555ll
), /* 3 */
3381 const_float64( 0x3fa5555555555555ll
), /* 4 */
3382 const_float64( 0x3f81111111111111ll
), /* 5 */
3383 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
3384 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
3385 const_float64( 0x3efa01a01a01a01all
), /* 8 */
3386 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
3387 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
3388 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
3389 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
3390 const_float64( 0x3de6124613a86d09ll
), /* 13 */
3391 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
3392 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
3395 float32
float32_exp2(float32 a
, float_status
*status
)
3402 a
= float32_squash_input_denormal(a
, status
);
3404 aSig
= extractFloat32Frac( a
);
3405 aExp
= extractFloat32Exp( a
);
3406 aSign
= extractFloat32Sign( a
);
3408 if ( aExp
== 0xFF) {
3410 return propagateFloat32NaN(a
, float32_zero
, status
);
3412 return (aSign
) ? float32_zero
: a
;
3415 if (aSig
== 0) return float32_one
;
3418 float_raise(float_flag_inexact
, status
);
3420 /* ******************************* */
3421 /* using float64 for approximation */
3422 /* ******************************* */
3423 x
= float32_to_float64(a
, status
);
3424 x
= float64_mul(x
, float64_ln2
, status
);
3428 for (i
= 0 ; i
< 15 ; i
++) {
3431 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
3432 r
= float64_add(r
, f
, status
);
3434 xn
= float64_mul(xn
, x
, status
);
3437 return float64_to_float32(r
, status
);
3440 /*----------------------------------------------------------------------------
3441 | Returns the binary log of the single-precision floating-point value `a'.
3442 | The operation is performed according to the IEC/IEEE Standard for Binary
3443 | Floating-Point Arithmetic.
3444 *----------------------------------------------------------------------------*/
3445 float32
float32_log2(float32 a
, float_status
*status
)
3449 uint32_t aSig
, zSig
, i
;
3451 a
= float32_squash_input_denormal(a
, status
);
3452 aSig
= extractFloat32Frac( a
);
3453 aExp
= extractFloat32Exp( a
);
3454 aSign
= extractFloat32Sign( a
);
3457 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
3458 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3461 float_raise(float_flag_invalid
, status
);
3462 return float32_default_nan(status
);
3464 if ( aExp
== 0xFF ) {
3466 return propagateFloat32NaN(a
, float32_zero
, status
);
3476 for (i
= 1 << 22; i
> 0; i
>>= 1) {
3477 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
3478 if ( aSig
& 0x01000000 ) {
3487 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
3490 /*----------------------------------------------------------------------------
3491 | Returns 1 if the single-precision floating-point value `a' is equal to
3492 | the corresponding value `b', and 0 otherwise. The invalid exception is
3493 | raised if either operand is a NaN. Otherwise, the comparison is performed
3494 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3495 *----------------------------------------------------------------------------*/
3497 int float32_eq(float32 a
, float32 b
, float_status
*status
)
3500 a
= float32_squash_input_denormal(a
, status
);
3501 b
= float32_squash_input_denormal(b
, status
);
3503 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3504 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3506 float_raise(float_flag_invalid
, status
);
3509 av
= float32_val(a
);
3510 bv
= float32_val(b
);
3511 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3514 /*----------------------------------------------------------------------------
3515 | Returns 1 if the single-precision floating-point value `a' is less than
3516 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3517 | exception is raised if either operand is a NaN. The comparison is performed
3518 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3519 *----------------------------------------------------------------------------*/
3521 int float32_le(float32 a
, float32 b
, float_status
*status
)
3525 a
= float32_squash_input_denormal(a
, status
);
3526 b
= float32_squash_input_denormal(b
, status
);
3528 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3529 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3531 float_raise(float_flag_invalid
, status
);
3534 aSign
= extractFloat32Sign( a
);
3535 bSign
= extractFloat32Sign( b
);
3536 av
= float32_val(a
);
3537 bv
= float32_val(b
);
3538 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3539 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3543 /*----------------------------------------------------------------------------
3544 | Returns 1 if the single-precision floating-point value `a' is less than
3545 | the corresponding value `b', and 0 otherwise. The invalid exception is
3546 | raised if either operand is a NaN. The comparison is performed according
3547 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3548 *----------------------------------------------------------------------------*/
3550 int float32_lt(float32 a
, float32 b
, float_status
*status
)
3554 a
= float32_squash_input_denormal(a
, status
);
3555 b
= float32_squash_input_denormal(b
, status
);
3557 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3558 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3560 float_raise(float_flag_invalid
, status
);
3563 aSign
= extractFloat32Sign( a
);
3564 bSign
= extractFloat32Sign( b
);
3565 av
= float32_val(a
);
3566 bv
= float32_val(b
);
3567 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
3568 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3572 /*----------------------------------------------------------------------------
3573 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3574 | be compared, and 0 otherwise. The invalid exception is raised if either
3575 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3576 | Standard for Binary Floating-Point Arithmetic.
3577 *----------------------------------------------------------------------------*/
3579 int float32_unordered(float32 a
, float32 b
, float_status
*status
)
3581 a
= float32_squash_input_denormal(a
, status
);
3582 b
= float32_squash_input_denormal(b
, status
);
3584 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3585 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3587 float_raise(float_flag_invalid
, status
);
3593 /*----------------------------------------------------------------------------
3594 | Returns 1 if the single-precision floating-point value `a' is equal to
3595 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3596 | exception. The comparison is performed according to the IEC/IEEE Standard
3597 | for Binary Floating-Point Arithmetic.
3598 *----------------------------------------------------------------------------*/
3600 int float32_eq_quiet(float32 a
, float32 b
, float_status
*status
)
3602 a
= float32_squash_input_denormal(a
, status
);
3603 b
= float32_squash_input_denormal(b
, status
);
3605 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3606 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3608 if (float32_is_signaling_nan(a
, status
)
3609 || float32_is_signaling_nan(b
, status
)) {
3610 float_raise(float_flag_invalid
, status
);
3614 return ( float32_val(a
) == float32_val(b
) ) ||
3615 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
3618 /*----------------------------------------------------------------------------
3619 | Returns 1 if the single-precision floating-point value `a' is less than or
3620 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3621 | cause an exception. Otherwise, the comparison is performed according to the
3622 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3623 *----------------------------------------------------------------------------*/
3625 int float32_le_quiet(float32 a
, float32 b
, float_status
*status
)
3629 a
= float32_squash_input_denormal(a
, status
);
3630 b
= float32_squash_input_denormal(b
, status
);
3632 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3633 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3635 if (float32_is_signaling_nan(a
, status
)
3636 || float32_is_signaling_nan(b
, status
)) {
3637 float_raise(float_flag_invalid
, status
);
3641 aSign
= extractFloat32Sign( a
);
3642 bSign
= extractFloat32Sign( b
);
3643 av
= float32_val(a
);
3644 bv
= float32_val(b
);
3645 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3646 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3650 /*----------------------------------------------------------------------------
3651 | Returns 1 if the single-precision floating-point value `a' is less than
3652 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3653 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3654 | Standard for Binary Floating-Point Arithmetic.
3655 *----------------------------------------------------------------------------*/
3657 int float32_lt_quiet(float32 a
, float32 b
, float_status
*status
)
3661 a
= float32_squash_input_denormal(a
, status
);
3662 b
= float32_squash_input_denormal(b
, status
);
3664 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3665 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3667 if (float32_is_signaling_nan(a
, status
)
3668 || float32_is_signaling_nan(b
, status
)) {
3669 float_raise(float_flag_invalid
, status
);
3673 aSign
= extractFloat32Sign( a
);
3674 bSign
= extractFloat32Sign( b
);
3675 av
= float32_val(a
);
3676 bv
= float32_val(b
);
3677 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
3678 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3682 /*----------------------------------------------------------------------------
3683 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3684 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3685 | comparison is performed according to the IEC/IEEE Standard for Binary
3686 | Floating-Point Arithmetic.
3687 *----------------------------------------------------------------------------*/
3689 int float32_unordered_quiet(float32 a
, float32 b
, float_status
*status
)
3691 a
= float32_squash_input_denormal(a
, status
);
3692 b
= float32_squash_input_denormal(b
, status
);
3694 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3695 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3697 if (float32_is_signaling_nan(a
, status
)
3698 || float32_is_signaling_nan(b
, status
)) {
3699 float_raise(float_flag_invalid
, status
);
3707 /*----------------------------------------------------------------------------
3708 | Returns the result of converting the double-precision floating-point value
3709 | `a' to the single-precision floating-point format. The conversion is
3710 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3712 *----------------------------------------------------------------------------*/
3714 float32
float64_to_float32(float64 a
, float_status
*status
)
3720 a
= float64_squash_input_denormal(a
, status
);
3722 aSig
= extractFloat64Frac( a
);
3723 aExp
= extractFloat64Exp( a
);
3724 aSign
= extractFloat64Sign( a
);
3725 if ( aExp
== 0x7FF ) {
3727 return commonNaNToFloat32(float64ToCommonNaN(a
, status
), status
);
3729 return packFloat32( aSign
, 0xFF, 0 );
3731 shift64RightJamming( aSig
, 22, &aSig
);
3733 if ( aExp
|| zSig
) {
3737 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
3742 /*----------------------------------------------------------------------------
3743 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3744 | half-precision floating-point value, returning the result. After being
3745 | shifted into the proper positions, the three fields are simply added
3746 | together to form the result. This means that any integer portion of `zSig'
3747 | will be added into the exponent. Since a properly normalized significand
3748 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3749 | than the desired result exponent whenever `zSig' is a complete, normalized
3751 *----------------------------------------------------------------------------*/
3752 static float16
packFloat16(flag zSign
, int zExp
, uint16_t zSig
)
3754 return make_float16(
3755 (((uint32_t)zSign
) << 15) + (((uint32_t)zExp
) << 10) + zSig
);
3758 /*----------------------------------------------------------------------------
3759 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3760 | and significand `zSig', and returns the proper half-precision floating-
3761 | point value corresponding to the abstract input. Ordinarily, the abstract
3762 | value is simply rounded and packed into the half-precision format, with
3763 | the inexact exception raised if the abstract input cannot be represented
3764 | exactly. However, if the abstract value is too large, the overflow and
3765 | inexact exceptions are raised and an infinity or maximal finite value is
3766 | returned. If the abstract value is too small, the input value is rounded to
3767 | a subnormal number, and the underflow and inexact exceptions are raised if
3768 | the abstract input cannot be represented exactly as a subnormal half-
3769 | precision floating-point number.
3770 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3771 | ARM-style "alternative representation", which omits the NaN and Inf
3772 | encodings in order to raise the maximum representable exponent by one.
3773 | The input significand `zSig' has its binary point between bits 22
3774 | and 23, which is 13 bits to the left of the usual location. This shifted
3775 | significand must be normalized or smaller. If `zSig' is not normalized,
3776 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3777 | and it must not require rounding. In the usual case that `zSig' is
3778 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3779 | Note the slightly odd position of the binary point in zSig compared with the
3780 | other roundAndPackFloat functions. This should probably be fixed if we
3781 | need to implement more float16 routines than just conversion.
3782 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3783 | Binary Floating-Point Arithmetic.
3784 *----------------------------------------------------------------------------*/
3786 static float16
roundAndPackFloat16(flag zSign
, int zExp
,
3787 uint32_t zSig
, flag ieee
,
3788 float_status
*status
)
3790 int maxexp
= ieee
? 29 : 30;
3793 bool rounding_bumps_exp
;
3794 bool is_tiny
= false;
3796 /* Calculate the mask of bits of the mantissa which are not
3797 * representable in half-precision and will be lost.
3800 /* Will be denormal in halfprec */
3806 /* Normal number in halfprec */
3810 switch (status
->float_rounding_mode
) {
3811 case float_round_nearest_even
:
3812 increment
= (mask
+ 1) >> 1;
3813 if ((zSig
& mask
) == increment
) {
3814 increment
= zSig
& (increment
<< 1);
3817 case float_round_ties_away
:
3818 increment
= (mask
+ 1) >> 1;
3820 case float_round_up
:
3821 increment
= zSign
? 0 : mask
;
3823 case float_round_down
:
3824 increment
= zSign
? mask
: 0;
3826 default: /* round_to_zero */
3831 rounding_bumps_exp
= (zSig
+ increment
>= 0x01000000);
3833 if (zExp
> maxexp
|| (zExp
== maxexp
&& rounding_bumps_exp
)) {
3835 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3836 return packFloat16(zSign
, 0x1f, 0);
3838 float_raise(float_flag_invalid
, status
);
3839 return packFloat16(zSign
, 0x1f, 0x3ff);
3844 /* Note that flush-to-zero does not affect half-precision results */
3846 (status
->float_detect_tininess
== float_tininess_before_rounding
)
3848 || (!rounding_bumps_exp
);
3851 float_raise(float_flag_inexact
, status
);
3853 float_raise(float_flag_underflow
, status
);
3858 if (rounding_bumps_exp
) {
3864 return packFloat16(zSign
, 0, 0);
3870 return packFloat16(zSign
, zExp
, zSig
>> 13);
3873 /*----------------------------------------------------------------------------
3874 | If `a' is denormal and we are in flush-to-zero mode then set the
3875 | input-denormal exception and return zero. Otherwise just return the value.
3876 *----------------------------------------------------------------------------*/
3877 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3879 if (status
->flush_inputs_to_zero
) {
3880 if (extractFloat16Exp(a
) == 0 && extractFloat16Frac(a
) != 0) {
3881 float_raise(float_flag_input_denormal
, status
);
3882 return make_float16(float16_val(a
) & 0x8000);
3888 static void normalizeFloat16Subnormal(uint32_t aSig
, int *zExpPtr
,
3891 int8_t shiftCount
= countLeadingZeros32(aSig
) - 21;
3892 *zSigPtr
= aSig
<< shiftCount
;
3893 *zExpPtr
= 1 - shiftCount
;
3896 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3897 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3899 float32
float16_to_float32(float16 a
, flag ieee
, float_status
*status
)
3905 aSign
= extractFloat16Sign(a
);
3906 aExp
= extractFloat16Exp(a
);
3907 aSig
= extractFloat16Frac(a
);
3909 if (aExp
== 0x1f && ieee
) {
3911 return commonNaNToFloat32(float16ToCommonNaN(a
, status
), status
);
3913 return packFloat32(aSign
, 0xff, 0);
3917 return packFloat32(aSign
, 0, 0);
3920 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3923 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
3926 float16
float32_to_float16(float32 a
, flag ieee
, float_status
*status
)
3932 a
= float32_squash_input_denormal(a
, status
);
3934 aSig
= extractFloat32Frac( a
);
3935 aExp
= extractFloat32Exp( a
);
3936 aSign
= extractFloat32Sign( a
);
3937 if ( aExp
== 0xFF ) {
3939 /* Input is a NaN */
3941 float_raise(float_flag_invalid
, status
);
3942 return packFloat16(aSign
, 0, 0);
3944 return commonNaNToFloat16(
3945 float32ToCommonNaN(a
, status
), status
);
3949 float_raise(float_flag_invalid
, status
);
3950 return packFloat16(aSign
, 0x1f, 0x3ff);
3952 return packFloat16(aSign
, 0x1f, 0);
3954 if (aExp
== 0 && aSig
== 0) {
3955 return packFloat16(aSign
, 0, 0);
3957 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3958 * even if the input is denormal; however this is harmless because
3959 * the largest possible single-precision denormal is still smaller
3960 * than the smallest representable half-precision denormal, and so we
3961 * will end up ignoring aSig and returning via the "always return zero"
3967 return roundAndPackFloat16(aSign
, aExp
, aSig
, ieee
, status
);
3970 float64
float16_to_float64(float16 a
, flag ieee
, float_status
*status
)
3976 aSign
= extractFloat16Sign(a
);
3977 aExp
= extractFloat16Exp(a
);
3978 aSig
= extractFloat16Frac(a
);
3980 if (aExp
== 0x1f && ieee
) {
3982 return commonNaNToFloat64(
3983 float16ToCommonNaN(a
, status
), status
);
3985 return packFloat64(aSign
, 0x7ff, 0);
3989 return packFloat64(aSign
, 0, 0);
3992 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3995 return packFloat64(aSign
, aExp
+ 0x3f0, ((uint64_t)aSig
) << 42);
3998 float16
float64_to_float16(float64 a
, flag ieee
, float_status
*status
)
4005 a
= float64_squash_input_denormal(a
, status
);
4007 aSig
= extractFloat64Frac(a
);
4008 aExp
= extractFloat64Exp(a
);
4009 aSign
= extractFloat64Sign(a
);
4010 if (aExp
== 0x7FF) {
4012 /* Input is a NaN */
4014 float_raise(float_flag_invalid
, status
);
4015 return packFloat16(aSign
, 0, 0);
4017 return commonNaNToFloat16(
4018 float64ToCommonNaN(a
, status
), status
);
4022 float_raise(float_flag_invalid
, status
);
4023 return packFloat16(aSign
, 0x1f, 0x3ff);
4025 return packFloat16(aSign
, 0x1f, 0);
4027 shift64RightJamming(aSig
, 29, &aSig
);
4029 if (aExp
== 0 && zSig
== 0) {
4030 return packFloat16(aSign
, 0, 0);
4032 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4033 * even if the input is denormal; however this is harmless because
4034 * the largest possible single-precision denormal is still smaller
4035 * than the smallest representable half-precision denormal, and so we
4036 * will end up ignoring aSig and returning via the "always return zero"
4042 return roundAndPackFloat16(aSign
, aExp
, zSig
, ieee
, status
);
4045 /*----------------------------------------------------------------------------
4046 | Returns the result of converting the double-precision floating-point value
4047 | `a' to the extended double-precision floating-point format. The conversion
4048 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4050 *----------------------------------------------------------------------------*/
4052 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
4058 a
= float64_squash_input_denormal(a
, status
);
4059 aSig
= extractFloat64Frac( a
);
4060 aExp
= extractFloat64Exp( a
);
4061 aSign
= extractFloat64Sign( a
);
4062 if ( aExp
== 0x7FF ) {
4064 return commonNaNToFloatx80(float64ToCommonNaN(a
, status
), status
);
4066 return packFloatx80(aSign
,
4067 floatx80_infinity_high
,
4068 floatx80_infinity_low
);
4071 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4072 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4076 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
4080 /*----------------------------------------------------------------------------
4081 | Returns the result of converting the double-precision floating-point value
4082 | `a' to the quadruple-precision floating-point format. The conversion is
4083 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4085 *----------------------------------------------------------------------------*/
4087 float128
float64_to_float128(float64 a
, float_status
*status
)
4091 uint64_t aSig
, zSig0
, zSig1
;
4093 a
= float64_squash_input_denormal(a
, status
);
4094 aSig
= extractFloat64Frac( a
);
4095 aExp
= extractFloat64Exp( a
);
4096 aSign
= extractFloat64Sign( a
);
4097 if ( aExp
== 0x7FF ) {
4099 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
4101 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4104 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4105 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4108 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
4109 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
4114 /*----------------------------------------------------------------------------
4115 | Returns the remainder of the double-precision floating-point value `a'
4116 | with respect to the corresponding value `b'. The operation is performed
4117 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4118 *----------------------------------------------------------------------------*/
4120 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
4123 int aExp
, bExp
, expDiff
;
4124 uint64_t aSig
, bSig
;
4125 uint64_t q
, alternateASig
;
4128 a
= float64_squash_input_denormal(a
, status
);
4129 b
= float64_squash_input_denormal(b
, status
);
4130 aSig
= extractFloat64Frac( a
);
4131 aExp
= extractFloat64Exp( a
);
4132 aSign
= extractFloat64Sign( a
);
4133 bSig
= extractFloat64Frac( b
);
4134 bExp
= extractFloat64Exp( b
);
4135 if ( aExp
== 0x7FF ) {
4136 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4137 return propagateFloat64NaN(a
, b
, status
);
4139 float_raise(float_flag_invalid
, status
);
4140 return float64_default_nan(status
);
4142 if ( bExp
== 0x7FF ) {
4144 return propagateFloat64NaN(a
, b
, status
);
4150 float_raise(float_flag_invalid
, status
);
4151 return float64_default_nan(status
);
4153 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4156 if ( aSig
== 0 ) return a
;
4157 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4159 expDiff
= aExp
- bExp
;
4160 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
4161 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4162 if ( expDiff
< 0 ) {
4163 if ( expDiff
< -1 ) return a
;
4166 q
= ( bSig
<= aSig
);
4167 if ( q
) aSig
-= bSig
;
4169 while ( 0 < expDiff
) {
4170 q
= estimateDiv128To64( aSig
, 0, bSig
);
4171 q
= ( 2 < q
) ? q
- 2 : 0;
4172 aSig
= - ( ( bSig
>>2 ) * q
);
4176 if ( 0 < expDiff
) {
4177 q
= estimateDiv128To64( aSig
, 0, bSig
);
4178 q
= ( 2 < q
) ? q
- 2 : 0;
4181 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4188 alternateASig
= aSig
;
4191 } while ( 0 <= (int64_t) aSig
);
4192 sigMean
= aSig
+ alternateASig
;
4193 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4194 aSig
= alternateASig
;
4196 zSign
= ( (int64_t) aSig
< 0 );
4197 if ( zSign
) aSig
= - aSig
;
4198 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
4202 /*----------------------------------------------------------------------------
4203 | Returns the binary log of the double-precision floating-point value `a'.
4204 | The operation is performed according to the IEC/IEEE Standard for Binary
4205 | Floating-Point Arithmetic.
4206 *----------------------------------------------------------------------------*/
4207 float64
float64_log2(float64 a
, float_status
*status
)
4211 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
4212 a
= float64_squash_input_denormal(a
, status
);
4214 aSig
= extractFloat64Frac( a
);
4215 aExp
= extractFloat64Exp( a
);
4216 aSign
= extractFloat64Sign( a
);
4219 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
4220 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4223 float_raise(float_flag_invalid
, status
);
4224 return float64_default_nan(status
);
4226 if ( aExp
== 0x7FF ) {
4228 return propagateFloat64NaN(a
, float64_zero
, status
);
4234 aSig
|= LIT64( 0x0010000000000000 );
4236 zSig
= (uint64_t)aExp
<< 52;
4237 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
4238 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
4239 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
4240 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
4248 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
4251 /*----------------------------------------------------------------------------
4252 | Returns 1 if the double-precision floating-point value `a' is equal to the
4253 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4254 | if either operand is a NaN. Otherwise, the comparison is performed
4255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4256 *----------------------------------------------------------------------------*/
4258 int float64_eq(float64 a
, float64 b
, float_status
*status
)
4261 a
= float64_squash_input_denormal(a
, status
);
4262 b
= float64_squash_input_denormal(b
, status
);
4264 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4265 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4267 float_raise(float_flag_invalid
, status
);
4270 av
= float64_val(a
);
4271 bv
= float64_val(b
);
4272 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4276 /*----------------------------------------------------------------------------
4277 | Returns 1 if the double-precision floating-point value `a' is less than or
4278 | equal to the corresponding value `b', and 0 otherwise. The invalid
4279 | exception is raised if either operand is a NaN. The comparison is performed
4280 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4281 *----------------------------------------------------------------------------*/
4283 int float64_le(float64 a
, float64 b
, float_status
*status
)
4287 a
= float64_squash_input_denormal(a
, status
);
4288 b
= float64_squash_input_denormal(b
, status
);
4290 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4291 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4293 float_raise(float_flag_invalid
, status
);
4296 aSign
= extractFloat64Sign( a
);
4297 bSign
= extractFloat64Sign( b
);
4298 av
= float64_val(a
);
4299 bv
= float64_val(b
);
4300 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4301 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4305 /*----------------------------------------------------------------------------
4306 | Returns 1 if the double-precision floating-point value `a' is less than
4307 | the corresponding value `b', and 0 otherwise. The invalid exception is
4308 | raised if either operand is a NaN. The comparison is performed according
4309 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4310 *----------------------------------------------------------------------------*/
4312 int float64_lt(float64 a
, float64 b
, float_status
*status
)
4317 a
= float64_squash_input_denormal(a
, status
);
4318 b
= float64_squash_input_denormal(b
, status
);
4319 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4320 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4322 float_raise(float_flag_invalid
, status
);
4325 aSign
= extractFloat64Sign( a
);
4326 bSign
= extractFloat64Sign( b
);
4327 av
= float64_val(a
);
4328 bv
= float64_val(b
);
4329 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4330 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4334 /*----------------------------------------------------------------------------
4335 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4336 | be compared, and 0 otherwise. The invalid exception is raised if either
4337 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4338 | Standard for Binary Floating-Point Arithmetic.
4339 *----------------------------------------------------------------------------*/
4341 int float64_unordered(float64 a
, float64 b
, float_status
*status
)
4343 a
= float64_squash_input_denormal(a
, status
);
4344 b
= float64_squash_input_denormal(b
, status
);
4346 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4347 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4349 float_raise(float_flag_invalid
, status
);
4355 /*----------------------------------------------------------------------------
4356 | Returns 1 if the double-precision floating-point value `a' is equal to the
4357 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4358 | exception.The comparison is performed according to the IEC/IEEE Standard
4359 | for Binary Floating-Point Arithmetic.
4360 *----------------------------------------------------------------------------*/
4362 int float64_eq_quiet(float64 a
, float64 b
, float_status
*status
)
4365 a
= float64_squash_input_denormal(a
, status
);
4366 b
= float64_squash_input_denormal(b
, status
);
4368 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4369 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4371 if (float64_is_signaling_nan(a
, status
)
4372 || float64_is_signaling_nan(b
, status
)) {
4373 float_raise(float_flag_invalid
, status
);
4377 av
= float64_val(a
);
4378 bv
= float64_val(b
);
4379 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4383 /*----------------------------------------------------------------------------
4384 | Returns 1 if the double-precision floating-point value `a' is less than or
4385 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4386 | cause an exception. Otherwise, the comparison is performed according to the
4387 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4388 *----------------------------------------------------------------------------*/
4390 int float64_le_quiet(float64 a
, float64 b
, float_status
*status
)
4394 a
= float64_squash_input_denormal(a
, status
);
4395 b
= float64_squash_input_denormal(b
, status
);
4397 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4398 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4400 if (float64_is_signaling_nan(a
, status
)
4401 || float64_is_signaling_nan(b
, status
)) {
4402 float_raise(float_flag_invalid
, status
);
4406 aSign
= extractFloat64Sign( a
);
4407 bSign
= extractFloat64Sign( b
);
4408 av
= float64_val(a
);
4409 bv
= float64_val(b
);
4410 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4411 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4415 /*----------------------------------------------------------------------------
4416 | Returns 1 if the double-precision floating-point value `a' is less than
4417 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4418 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4419 | Standard for Binary Floating-Point Arithmetic.
4420 *----------------------------------------------------------------------------*/
4422 int float64_lt_quiet(float64 a
, float64 b
, float_status
*status
)
4426 a
= float64_squash_input_denormal(a
, status
);
4427 b
= float64_squash_input_denormal(b
, status
);
4429 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4430 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4432 if (float64_is_signaling_nan(a
, status
)
4433 || float64_is_signaling_nan(b
, status
)) {
4434 float_raise(float_flag_invalid
, status
);
4438 aSign
= extractFloat64Sign( a
);
4439 bSign
= extractFloat64Sign( b
);
4440 av
= float64_val(a
);
4441 bv
= float64_val(b
);
4442 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4443 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4447 /*----------------------------------------------------------------------------
4448 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4449 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4450 | comparison is performed according to the IEC/IEEE Standard for Binary
4451 | Floating-Point Arithmetic.
4452 *----------------------------------------------------------------------------*/
4454 int float64_unordered_quiet(float64 a
, float64 b
, float_status
*status
)
4456 a
= float64_squash_input_denormal(a
, status
);
4457 b
= float64_squash_input_denormal(b
, status
);
4459 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4460 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4462 if (float64_is_signaling_nan(a
, status
)
4463 || float64_is_signaling_nan(b
, status
)) {
4464 float_raise(float_flag_invalid
, status
);
4471 /*----------------------------------------------------------------------------
4472 | Returns the result of converting the extended double-precision floating-
4473 | point value `a' to the 32-bit two's complement integer format. The
4474 | conversion is performed according to the IEC/IEEE Standard for Binary
4475 | Floating-Point Arithmetic---which means in particular that the conversion
4476 | is rounded according to the current rounding mode. If `a' is a NaN, the
4477 | largest positive integer is returned. Otherwise, if the conversion
4478 | overflows, the largest integer with the same sign as `a' is returned.
4479 *----------------------------------------------------------------------------*/
4481 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
4484 int32_t aExp
, shiftCount
;
4487 if (floatx80_invalid_encoding(a
)) {
4488 float_raise(float_flag_invalid
, status
);
4491 aSig
= extractFloatx80Frac( a
);
4492 aExp
= extractFloatx80Exp( a
);
4493 aSign
= extractFloatx80Sign( a
);
4494 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4495 shiftCount
= 0x4037 - aExp
;
4496 if ( shiftCount
<= 0 ) shiftCount
= 1;
4497 shift64RightJamming( aSig
, shiftCount
, &aSig
);
4498 return roundAndPackInt32(aSign
, aSig
, status
);
4502 /*----------------------------------------------------------------------------
4503 | Returns the result of converting the extended double-precision floating-
4504 | point value `a' to the 32-bit two's complement integer format. The
4505 | conversion is performed according to the IEC/IEEE Standard for Binary
4506 | Floating-Point Arithmetic, except that the conversion is always rounded
4507 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4508 | Otherwise, if the conversion overflows, the largest integer with the same
4509 | sign as `a' is returned.
4510 *----------------------------------------------------------------------------*/
4512 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
4515 int32_t aExp
, shiftCount
;
4516 uint64_t aSig
, savedASig
;
4519 if (floatx80_invalid_encoding(a
)) {
4520 float_raise(float_flag_invalid
, status
);
4523 aSig
= extractFloatx80Frac( a
);
4524 aExp
= extractFloatx80Exp( a
);
4525 aSign
= extractFloatx80Sign( a
);
4526 if ( 0x401E < aExp
) {
4527 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4530 else if ( aExp
< 0x3FFF ) {
4532 status
->float_exception_flags
|= float_flag_inexact
;
4536 shiftCount
= 0x403E - aExp
;
4538 aSig
>>= shiftCount
;
4540 if ( aSign
) z
= - z
;
4541 if ( ( z
< 0 ) ^ aSign
) {
4543 float_raise(float_flag_invalid
, status
);
4544 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4546 if ( ( aSig
<<shiftCount
) != savedASig
) {
4547 status
->float_exception_flags
|= float_flag_inexact
;
4553 /*----------------------------------------------------------------------------
4554 | Returns the result of converting the extended double-precision floating-
4555 | point value `a' to the 64-bit two's complement integer format. The
4556 | conversion is performed according to the IEC/IEEE Standard for Binary
4557 | Floating-Point Arithmetic---which means in particular that the conversion
4558 | is rounded according to the current rounding mode. If `a' is a NaN,
4559 | the largest positive integer is returned. Otherwise, if the conversion
4560 | overflows, the largest integer with the same sign as `a' is returned.
4561 *----------------------------------------------------------------------------*/
4563 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
4566 int32_t aExp
, shiftCount
;
4567 uint64_t aSig
, aSigExtra
;
4569 if (floatx80_invalid_encoding(a
)) {
4570 float_raise(float_flag_invalid
, status
);
4573 aSig
= extractFloatx80Frac( a
);
4574 aExp
= extractFloatx80Exp( a
);
4575 aSign
= extractFloatx80Sign( a
);
4576 shiftCount
= 0x403E - aExp
;
4577 if ( shiftCount
<= 0 ) {
4579 float_raise(float_flag_invalid
, status
);
4580 if (!aSign
|| floatx80_is_any_nan(a
)) {
4581 return LIT64( 0x7FFFFFFFFFFFFFFF );
4583 return (int64_t) LIT64( 0x8000000000000000 );
4588 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
4590 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
4594 /*----------------------------------------------------------------------------
4595 | Returns the result of converting the extended double-precision floating-
4596 | point value `a' to the 64-bit two's complement integer format. The
4597 | conversion is performed according to the IEC/IEEE Standard for Binary
4598 | Floating-Point Arithmetic, except that the conversion is always rounded
4599 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4600 | Otherwise, if the conversion overflows, the largest integer with the same
4601 | sign as `a' is returned.
4602 *----------------------------------------------------------------------------*/
4604 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
4607 int32_t aExp
, shiftCount
;
4611 if (floatx80_invalid_encoding(a
)) {
4612 float_raise(float_flag_invalid
, status
);
4615 aSig
= extractFloatx80Frac( a
);
4616 aExp
= extractFloatx80Exp( a
);
4617 aSign
= extractFloatx80Sign( a
);
4618 shiftCount
= aExp
- 0x403E;
4619 if ( 0 <= shiftCount
) {
4620 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
4621 if ( ( a
.high
!= 0xC03E ) || aSig
) {
4622 float_raise(float_flag_invalid
, status
);
4623 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
4624 return LIT64( 0x7FFFFFFFFFFFFFFF );
4627 return (int64_t) LIT64( 0x8000000000000000 );
4629 else if ( aExp
< 0x3FFF ) {
4631 status
->float_exception_flags
|= float_flag_inexact
;
4635 z
= aSig
>>( - shiftCount
);
4636 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
4637 status
->float_exception_flags
|= float_flag_inexact
;
4639 if ( aSign
) z
= - z
;
4644 /*----------------------------------------------------------------------------
4645 | Returns the result of converting the extended double-precision floating-
4646 | point value `a' to the single-precision floating-point format. The
4647 | conversion is performed according to the IEC/IEEE Standard for Binary
4648 | Floating-Point Arithmetic.
4649 *----------------------------------------------------------------------------*/
4651 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
4657 if (floatx80_invalid_encoding(a
)) {
4658 float_raise(float_flag_invalid
, status
);
4659 return float32_default_nan(status
);
4661 aSig
= extractFloatx80Frac( a
);
4662 aExp
= extractFloatx80Exp( a
);
4663 aSign
= extractFloatx80Sign( a
);
4664 if ( aExp
== 0x7FFF ) {
4665 if ( (uint64_t) ( aSig
<<1 ) ) {
4666 return commonNaNToFloat32(floatx80ToCommonNaN(a
, status
), status
);
4668 return packFloat32( aSign
, 0xFF, 0 );
4670 shift64RightJamming( aSig
, 33, &aSig
);
4671 if ( aExp
|| aSig
) aExp
-= 0x3F81;
4672 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
4676 /*----------------------------------------------------------------------------
4677 | Returns the result of converting the extended double-precision floating-
4678 | point value `a' to the double-precision floating-point format. The
4679 | conversion is performed according to the IEC/IEEE Standard for Binary
4680 | Floating-Point Arithmetic.
4681 *----------------------------------------------------------------------------*/
4683 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
4687 uint64_t aSig
, zSig
;
4689 if (floatx80_invalid_encoding(a
)) {
4690 float_raise(float_flag_invalid
, status
);
4691 return float64_default_nan(status
);
4693 aSig
= extractFloatx80Frac( a
);
4694 aExp
= extractFloatx80Exp( a
);
4695 aSign
= extractFloatx80Sign( a
);
4696 if ( aExp
== 0x7FFF ) {
4697 if ( (uint64_t) ( aSig
<<1 ) ) {
4698 return commonNaNToFloat64(floatx80ToCommonNaN(a
, status
), status
);
4700 return packFloat64( aSign
, 0x7FF, 0 );
4702 shift64RightJamming( aSig
, 1, &zSig
);
4703 if ( aExp
|| aSig
) aExp
-= 0x3C01;
4704 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
4708 /*----------------------------------------------------------------------------
4709 | Returns the result of converting the extended double-precision floating-
4710 | point value `a' to the quadruple-precision floating-point format. The
4711 | conversion is performed according to the IEC/IEEE Standard for Binary
4712 | Floating-Point Arithmetic.
4713 *----------------------------------------------------------------------------*/
4715 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
4719 uint64_t aSig
, zSig0
, zSig1
;
4721 if (floatx80_invalid_encoding(a
)) {
4722 float_raise(float_flag_invalid
, status
);
4723 return float128_default_nan(status
);
4725 aSig
= extractFloatx80Frac( a
);
4726 aExp
= extractFloatx80Exp( a
);
4727 aSign
= extractFloatx80Sign( a
);
4728 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
4729 return commonNaNToFloat128(floatx80ToCommonNaN(a
, status
), status
);
4731 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
4732 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
4736 /*----------------------------------------------------------------------------
4737 | Rounds the extended double-precision floating-point value `a'
4738 | to the precision provided by floatx80_rounding_precision and returns the
4739 | result as an extended double-precision floating-point value.
4740 | The operation is performed according to the IEC/IEEE Standard for Binary
4741 | Floating-Point Arithmetic.
4742 *----------------------------------------------------------------------------*/
4744 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
4746 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
4747 extractFloatx80Sign(a
),
4748 extractFloatx80Exp(a
),
4749 extractFloatx80Frac(a
), 0, status
);
4752 /*----------------------------------------------------------------------------
4753 | Rounds the extended double-precision floating-point value `a' to an integer,
4754 | and returns the result as an extended quadruple-precision floating-point
4755 | value. The operation is performed according to the IEC/IEEE Standard for
4756 | Binary Floating-Point Arithmetic.
4757 *----------------------------------------------------------------------------*/
4759 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
4763 uint64_t lastBitMask
, roundBitsMask
;
4766 if (floatx80_invalid_encoding(a
)) {
4767 float_raise(float_flag_invalid
, status
);
4768 return floatx80_default_nan(status
);
4770 aExp
= extractFloatx80Exp( a
);
4771 if ( 0x403E <= aExp
) {
4772 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
4773 return propagateFloatx80NaN(a
, a
, status
);
4777 if ( aExp
< 0x3FFF ) {
4779 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
4782 status
->float_exception_flags
|= float_flag_inexact
;
4783 aSign
= extractFloatx80Sign( a
);
4784 switch (status
->float_rounding_mode
) {
4785 case float_round_nearest_even
:
4786 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
4789 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
4792 case float_round_ties_away
:
4793 if (aExp
== 0x3FFE) {
4794 return packFloatx80(aSign
, 0x3FFF, LIT64(0x8000000000000000));
4797 case float_round_down
:
4800 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4801 : packFloatx80( 0, 0, 0 );
4802 case float_round_up
:
4804 aSign
? packFloatx80( 1, 0, 0 )
4805 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4807 return packFloatx80( aSign
, 0, 0 );
4810 lastBitMask
<<= 0x403E - aExp
;
4811 roundBitsMask
= lastBitMask
- 1;
4813 switch (status
->float_rounding_mode
) {
4814 case float_round_nearest_even
:
4815 z
.low
+= lastBitMask
>>1;
4816 if ((z
.low
& roundBitsMask
) == 0) {
4817 z
.low
&= ~lastBitMask
;
4820 case float_round_ties_away
:
4821 z
.low
+= lastBitMask
>> 1;
4823 case float_round_to_zero
:
4825 case float_round_up
:
4826 if (!extractFloatx80Sign(z
)) {
4827 z
.low
+= roundBitsMask
;
4830 case float_round_down
:
4831 if (extractFloatx80Sign(z
)) {
4832 z
.low
+= roundBitsMask
;
4838 z
.low
&= ~ roundBitsMask
;
4841 z
.low
= LIT64( 0x8000000000000000 );
4843 if (z
.low
!= a
.low
) {
4844 status
->float_exception_flags
|= float_flag_inexact
;
4850 /*----------------------------------------------------------------------------
4851 | Returns the result of adding the absolute values of the extended double-
4852 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4853 | negated before being returned. `zSign' is ignored if the result is a NaN.
4854 | The addition is performed according to the IEC/IEEE Standard for Binary
4855 | Floating-Point Arithmetic.
4856 *----------------------------------------------------------------------------*/
4858 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
4859 float_status
*status
)
4861 int32_t aExp
, bExp
, zExp
;
4862 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4865 aSig
= extractFloatx80Frac( a
);
4866 aExp
= extractFloatx80Exp( a
);
4867 bSig
= extractFloatx80Frac( b
);
4868 bExp
= extractFloatx80Exp( b
);
4869 expDiff
= aExp
- bExp
;
4870 if ( 0 < expDiff
) {
4871 if ( aExp
== 0x7FFF ) {
4872 if ((uint64_t)(aSig
<< 1)) {
4873 return propagateFloatx80NaN(a
, b
, status
);
4877 if ( bExp
== 0 ) --expDiff
;
4878 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4881 else if ( expDiff
< 0 ) {
4882 if ( bExp
== 0x7FFF ) {
4883 if ((uint64_t)(bSig
<< 1)) {
4884 return propagateFloatx80NaN(a
, b
, status
);
4886 return packFloatx80(zSign
,
4887 floatx80_infinity_high
,
4888 floatx80_infinity_low
);
4890 if ( aExp
== 0 ) ++expDiff
;
4891 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4895 if ( aExp
== 0x7FFF ) {
4896 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4897 return propagateFloatx80NaN(a
, b
, status
);
4902 zSig0
= aSig
+ bSig
;
4904 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
4910 zSig0
= aSig
+ bSig
;
4911 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
4913 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4914 zSig0
|= LIT64( 0x8000000000000000 );
4917 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
4918 zSign
, zExp
, zSig0
, zSig1
, status
);
4921 /*----------------------------------------------------------------------------
4922 | Returns the result of subtracting the absolute values of the extended
4923 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4924 | difference is negated before being returned. `zSign' is ignored if the
4925 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4926 | Standard for Binary Floating-Point Arithmetic.
4927 *----------------------------------------------------------------------------*/
4929 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
4930 float_status
*status
)
4932 int32_t aExp
, bExp
, zExp
;
4933 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4936 aSig
= extractFloatx80Frac( a
);
4937 aExp
= extractFloatx80Exp( a
);
4938 bSig
= extractFloatx80Frac( b
);
4939 bExp
= extractFloatx80Exp( b
);
4940 expDiff
= aExp
- bExp
;
4941 if ( 0 < expDiff
) goto aExpBigger
;
4942 if ( expDiff
< 0 ) goto bExpBigger
;
4943 if ( aExp
== 0x7FFF ) {
4944 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4945 return propagateFloatx80NaN(a
, b
, status
);
4947 float_raise(float_flag_invalid
, status
);
4948 return floatx80_default_nan(status
);
4955 if ( bSig
< aSig
) goto aBigger
;
4956 if ( aSig
< bSig
) goto bBigger
;
4957 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
4959 if ( bExp
== 0x7FFF ) {
4960 if ((uint64_t)(bSig
<< 1)) {
4961 return propagateFloatx80NaN(a
, b
, status
);
4963 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
4964 floatx80_infinity_low
);
4966 if ( aExp
== 0 ) ++expDiff
;
4967 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4969 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
4972 goto normalizeRoundAndPack
;
4974 if ( aExp
== 0x7FFF ) {
4975 if ((uint64_t)(aSig
<< 1)) {
4976 return propagateFloatx80NaN(a
, b
, status
);
4980 if ( bExp
== 0 ) --expDiff
;
4981 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4983 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
4985 normalizeRoundAndPack
:
4986 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
4987 zSign
, zExp
, zSig0
, zSig1
, status
);
4990 /*----------------------------------------------------------------------------
4991 | Returns the result of adding the extended double-precision floating-point
4992 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4993 | Standard for Binary Floating-Point Arithmetic.
4994 *----------------------------------------------------------------------------*/
4996 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5000 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5001 float_raise(float_flag_invalid
, status
);
5002 return floatx80_default_nan(status
);
5004 aSign
= extractFloatx80Sign( a
);
5005 bSign
= extractFloatx80Sign( b
);
5006 if ( aSign
== bSign
) {
5007 return addFloatx80Sigs(a
, b
, aSign
, status
);
5010 return subFloatx80Sigs(a
, b
, aSign
, status
);
5015 /*----------------------------------------------------------------------------
5016 | Returns the result of subtracting the extended double-precision floating-
5017 | point values `a' and `b'. The operation is performed according to the
5018 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5019 *----------------------------------------------------------------------------*/
5021 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5025 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5026 float_raise(float_flag_invalid
, status
);
5027 return floatx80_default_nan(status
);
5029 aSign
= extractFloatx80Sign( a
);
5030 bSign
= extractFloatx80Sign( b
);
5031 if ( aSign
== bSign
) {
5032 return subFloatx80Sigs(a
, b
, aSign
, status
);
5035 return addFloatx80Sigs(a
, b
, aSign
, status
);
5040 /*----------------------------------------------------------------------------
5041 | Returns the result of multiplying the extended double-precision floating-
5042 | point values `a' and `b'. The operation is performed according to the
5043 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5044 *----------------------------------------------------------------------------*/
5046 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5048 flag aSign
, bSign
, zSign
;
5049 int32_t aExp
, bExp
, zExp
;
5050 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5052 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5053 float_raise(float_flag_invalid
, status
);
5054 return floatx80_default_nan(status
);
5056 aSig
= extractFloatx80Frac( a
);
5057 aExp
= extractFloatx80Exp( a
);
5058 aSign
= extractFloatx80Sign( a
);
5059 bSig
= extractFloatx80Frac( b
);
5060 bExp
= extractFloatx80Exp( b
);
5061 bSign
= extractFloatx80Sign( b
);
5062 zSign
= aSign
^ bSign
;
5063 if ( aExp
== 0x7FFF ) {
5064 if ( (uint64_t) ( aSig
<<1 )
5065 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5066 return propagateFloatx80NaN(a
, b
, status
);
5068 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5069 return packFloatx80(zSign
, floatx80_infinity_high
,
5070 floatx80_infinity_low
);
5072 if ( bExp
== 0x7FFF ) {
5073 if ((uint64_t)(bSig
<< 1)) {
5074 return propagateFloatx80NaN(a
, b
, status
);
5076 if ( ( aExp
| aSig
) == 0 ) {
5078 float_raise(float_flag_invalid
, status
);
5079 return floatx80_default_nan(status
);
5081 return packFloatx80(zSign
, floatx80_infinity_high
,
5082 floatx80_infinity_low
);
5085 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5086 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5089 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5090 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5092 zExp
= aExp
+ bExp
- 0x3FFE;
5093 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5094 if ( 0 < (int64_t) zSig0
) {
5095 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5098 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5099 zSign
, zExp
, zSig0
, zSig1
, status
);
5102 /*----------------------------------------------------------------------------
5103 | Returns the result of dividing the extended double-precision floating-point
5104 | value `a' by the corresponding value `b'. The operation is performed
5105 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5106 *----------------------------------------------------------------------------*/
5108 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
5110 flag aSign
, bSign
, zSign
;
5111 int32_t aExp
, bExp
, zExp
;
5112 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5113 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
5115 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5116 float_raise(float_flag_invalid
, status
);
5117 return floatx80_default_nan(status
);
5119 aSig
= extractFloatx80Frac( a
);
5120 aExp
= extractFloatx80Exp( a
);
5121 aSign
= extractFloatx80Sign( a
);
5122 bSig
= extractFloatx80Frac( b
);
5123 bExp
= extractFloatx80Exp( b
);
5124 bSign
= extractFloatx80Sign( b
);
5125 zSign
= aSign
^ bSign
;
5126 if ( aExp
== 0x7FFF ) {
5127 if ((uint64_t)(aSig
<< 1)) {
5128 return propagateFloatx80NaN(a
, b
, status
);
5130 if ( bExp
== 0x7FFF ) {
5131 if ((uint64_t)(bSig
<< 1)) {
5132 return propagateFloatx80NaN(a
, b
, status
);
5136 return packFloatx80(zSign
, floatx80_infinity_high
,
5137 floatx80_infinity_low
);
5139 if ( bExp
== 0x7FFF ) {
5140 if ((uint64_t)(bSig
<< 1)) {
5141 return propagateFloatx80NaN(a
, b
, status
);
5143 return packFloatx80( zSign
, 0, 0 );
5147 if ( ( aExp
| aSig
) == 0 ) {
5149 float_raise(float_flag_invalid
, status
);
5150 return floatx80_default_nan(status
);
5152 float_raise(float_flag_divbyzero
, status
);
5153 return packFloatx80(zSign
, floatx80_infinity_high
,
5154 floatx80_infinity_low
);
5156 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5159 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5160 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5162 zExp
= aExp
- bExp
+ 0x3FFE;
5164 if ( bSig
<= aSig
) {
5165 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
5168 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
5169 mul64To128( bSig
, zSig0
, &term0
, &term1
);
5170 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
5171 while ( (int64_t) rem0
< 0 ) {
5173 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
5175 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
5176 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
5177 mul64To128( bSig
, zSig1
, &term1
, &term2
);
5178 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5179 while ( (int64_t) rem1
< 0 ) {
5181 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
5183 zSig1
|= ( ( rem1
| rem2
) != 0 );
5185 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5186 zSign
, zExp
, zSig0
, zSig1
, status
);
5189 /*----------------------------------------------------------------------------
5190 | Returns the remainder of the extended double-precision floating-point value
5191 | `a' with respect to the corresponding value `b'. The operation is performed
5192 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5193 *----------------------------------------------------------------------------*/
5195 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
5198 int32_t aExp
, bExp
, expDiff
;
5199 uint64_t aSig0
, aSig1
, bSig
;
5200 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
5202 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5203 float_raise(float_flag_invalid
, status
);
5204 return floatx80_default_nan(status
);
5206 aSig0
= extractFloatx80Frac( a
);
5207 aExp
= extractFloatx80Exp( a
);
5208 aSign
= extractFloatx80Sign( a
);
5209 bSig
= extractFloatx80Frac( b
);
5210 bExp
= extractFloatx80Exp( b
);
5211 if ( aExp
== 0x7FFF ) {
5212 if ( (uint64_t) ( aSig0
<<1 )
5213 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5214 return propagateFloatx80NaN(a
, b
, status
);
5218 if ( bExp
== 0x7FFF ) {
5219 if ((uint64_t)(bSig
<< 1)) {
5220 return propagateFloatx80NaN(a
, b
, status
);
5227 float_raise(float_flag_invalid
, status
);
5228 return floatx80_default_nan(status
);
5230 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5233 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
5234 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5236 bSig
|= LIT64( 0x8000000000000000 );
5238 expDiff
= aExp
- bExp
;
5240 if ( expDiff
< 0 ) {
5241 if ( expDiff
< -1 ) return a
;
5242 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
5245 q
= ( bSig
<= aSig0
);
5246 if ( q
) aSig0
-= bSig
;
5248 while ( 0 < expDiff
) {
5249 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5250 q
= ( 2 < q
) ? q
- 2 : 0;
5251 mul64To128( bSig
, q
, &term0
, &term1
);
5252 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5253 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
5257 if ( 0 < expDiff
) {
5258 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5259 q
= ( 2 < q
) ? q
- 2 : 0;
5261 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
5262 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5263 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
5264 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
5266 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5273 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
5274 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5275 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5278 aSig0
= alternateASig0
;
5279 aSig1
= alternateASig1
;
5283 normalizeRoundAndPackFloatx80(
5284 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
5288 /*----------------------------------------------------------------------------
5289 | Returns the square root of the extended double-precision floating-point
5290 | value `a'. The operation is performed according to the IEC/IEEE Standard
5291 | for Binary Floating-Point Arithmetic.
5292 *----------------------------------------------------------------------------*/
5294 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
5298 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
5299 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5301 if (floatx80_invalid_encoding(a
)) {
5302 float_raise(float_flag_invalid
, status
);
5303 return floatx80_default_nan(status
);
5305 aSig0
= extractFloatx80Frac( a
);
5306 aExp
= extractFloatx80Exp( a
);
5307 aSign
= extractFloatx80Sign( a
);
5308 if ( aExp
== 0x7FFF ) {
5309 if ((uint64_t)(aSig0
<< 1)) {
5310 return propagateFloatx80NaN(a
, a
, status
);
5312 if ( ! aSign
) return a
;
5316 if ( ( aExp
| aSig0
) == 0 ) return a
;
5318 float_raise(float_flag_invalid
, status
);
5319 return floatx80_default_nan(status
);
5322 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
5323 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5325 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
5326 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
5327 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
5328 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5329 doubleZSig0
= zSig0
<<1;
5330 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5331 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5332 while ( (int64_t) rem0
< 0 ) {
5335 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5337 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5338 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5339 if ( zSig1
== 0 ) zSig1
= 1;
5340 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5341 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5342 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5343 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5344 while ( (int64_t) rem1
< 0 ) {
5346 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5348 term2
|= doubleZSig0
;
5349 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5351 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5353 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
5354 zSig0
|= doubleZSig0
;
5355 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5356 0, zExp
, zSig0
, zSig1
, status
);
5359 /*----------------------------------------------------------------------------
5360 | Returns 1 if the extended double-precision floating-point value `a' is equal
5361 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5362 | raised if either operand is a NaN. Otherwise, the comparison is performed
5363 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5364 *----------------------------------------------------------------------------*/
5366 int floatx80_eq(floatx80 a
, floatx80 b
, float_status
*status
)
5369 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5370 || (extractFloatx80Exp(a
) == 0x7FFF
5371 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5372 || (extractFloatx80Exp(b
) == 0x7FFF
5373 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5375 float_raise(float_flag_invalid
, status
);
5380 && ( ( a
.high
== b
.high
)
5382 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5387 /*----------------------------------------------------------------------------
5388 | Returns 1 if the extended double-precision floating-point value `a' is
5389 | less than or equal to the corresponding value `b', and 0 otherwise. The
5390 | invalid exception is raised if either operand is a NaN. The comparison is
5391 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5393 *----------------------------------------------------------------------------*/
5395 int floatx80_le(floatx80 a
, floatx80 b
, float_status
*status
)
5399 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5400 || (extractFloatx80Exp(a
) == 0x7FFF
5401 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5402 || (extractFloatx80Exp(b
) == 0x7FFF
5403 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5405 float_raise(float_flag_invalid
, status
);
5408 aSign
= extractFloatx80Sign( a
);
5409 bSign
= extractFloatx80Sign( b
);
5410 if ( aSign
!= bSign
) {
5413 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5417 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5418 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5422 /*----------------------------------------------------------------------------
5423 | Returns 1 if the extended double-precision floating-point value `a' is
5424 | less than the corresponding value `b', and 0 otherwise. The invalid
5425 | exception is raised if either operand is a NaN. The comparison is performed
5426 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5427 *----------------------------------------------------------------------------*/
5429 int floatx80_lt(floatx80 a
, floatx80 b
, float_status
*status
)
5433 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5434 || (extractFloatx80Exp(a
) == 0x7FFF
5435 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5436 || (extractFloatx80Exp(b
) == 0x7FFF
5437 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5439 float_raise(float_flag_invalid
, status
);
5442 aSign
= extractFloatx80Sign( a
);
5443 bSign
= extractFloatx80Sign( b
);
5444 if ( aSign
!= bSign
) {
5447 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5451 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5452 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5456 /*----------------------------------------------------------------------------
5457 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5458 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5459 | either operand is a NaN. The comparison is performed according to the
5460 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5461 *----------------------------------------------------------------------------*/
5462 int floatx80_unordered(floatx80 a
, floatx80 b
, float_status
*status
)
5464 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5465 || (extractFloatx80Exp(a
) == 0x7FFF
5466 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5467 || (extractFloatx80Exp(b
) == 0x7FFF
5468 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5470 float_raise(float_flag_invalid
, status
);
5476 /*----------------------------------------------------------------------------
5477 | Returns 1 if the extended double-precision floating-point value `a' is
5478 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5479 | cause an exception. The comparison is performed according to the IEC/IEEE
5480 | Standard for Binary Floating-Point Arithmetic.
5481 *----------------------------------------------------------------------------*/
5483 int floatx80_eq_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5486 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5487 float_raise(float_flag_invalid
, status
);
5490 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5491 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5492 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5493 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5495 if (floatx80_is_signaling_nan(a
, status
)
5496 || floatx80_is_signaling_nan(b
, status
)) {
5497 float_raise(float_flag_invalid
, status
);
5503 && ( ( a
.high
== b
.high
)
5505 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5510 /*----------------------------------------------------------------------------
5511 | Returns 1 if the extended double-precision floating-point value `a' is less
5512 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5513 | do not cause an exception. Otherwise, the comparison is performed according
5514 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5515 *----------------------------------------------------------------------------*/
5517 int floatx80_le_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5521 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5522 float_raise(float_flag_invalid
, status
);
5525 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5526 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5527 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5528 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5530 if (floatx80_is_signaling_nan(a
, status
)
5531 || floatx80_is_signaling_nan(b
, status
)) {
5532 float_raise(float_flag_invalid
, status
);
5536 aSign
= extractFloatx80Sign( a
);
5537 bSign
= extractFloatx80Sign( b
);
5538 if ( aSign
!= bSign
) {
5541 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5545 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5546 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5550 /*----------------------------------------------------------------------------
5551 | Returns 1 if the extended double-precision floating-point value `a' is less
5552 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5553 | an exception. Otherwise, the comparison is performed according to the
5554 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5555 *----------------------------------------------------------------------------*/
5557 int floatx80_lt_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5561 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5562 float_raise(float_flag_invalid
, status
);
5565 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5566 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5567 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5568 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5570 if (floatx80_is_signaling_nan(a
, status
)
5571 || floatx80_is_signaling_nan(b
, status
)) {
5572 float_raise(float_flag_invalid
, status
);
5576 aSign
= extractFloatx80Sign( a
);
5577 bSign
= extractFloatx80Sign( b
);
5578 if ( aSign
!= bSign
) {
5581 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5585 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5586 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5590 /*----------------------------------------------------------------------------
5591 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5592 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5593 | The comparison is performed according to the IEC/IEEE Standard for Binary
5594 | Floating-Point Arithmetic.
5595 *----------------------------------------------------------------------------*/
5596 int floatx80_unordered_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5598 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5599 float_raise(float_flag_invalid
, status
);
5602 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5603 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5604 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5605 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5607 if (floatx80_is_signaling_nan(a
, status
)
5608 || floatx80_is_signaling_nan(b
, status
)) {
5609 float_raise(float_flag_invalid
, status
);
5616 /*----------------------------------------------------------------------------
5617 | Returns the result of converting the quadruple-precision floating-point
5618 | value `a' to the 32-bit two's complement integer format. The conversion
5619 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5620 | Arithmetic---which means in particular that the conversion is rounded
5621 | according to the current rounding mode. If `a' is a NaN, the largest
5622 | positive integer is returned. Otherwise, if the conversion overflows, the
5623 | largest integer with the same sign as `a' is returned.
5624 *----------------------------------------------------------------------------*/
5626 int32_t float128_to_int32(float128 a
, float_status
*status
)
5629 int32_t aExp
, shiftCount
;
5630 uint64_t aSig0
, aSig1
;
5632 aSig1
= extractFloat128Frac1( a
);
5633 aSig0
= extractFloat128Frac0( a
);
5634 aExp
= extractFloat128Exp( a
);
5635 aSign
= extractFloat128Sign( a
);
5636 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
5637 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5638 aSig0
|= ( aSig1
!= 0 );
5639 shiftCount
= 0x4028 - aExp
;
5640 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
5641 return roundAndPackInt32(aSign
, aSig0
, status
);
5645 /*----------------------------------------------------------------------------
5646 | Returns the result of converting the quadruple-precision floating-point
5647 | value `a' to the 32-bit two's complement integer format. The conversion
5648 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5649 | Arithmetic, except that the conversion is always rounded toward zero. If
5650 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5651 | conversion overflows, the largest integer with the same sign as `a' is
5653 *----------------------------------------------------------------------------*/
5655 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
5658 int32_t aExp
, shiftCount
;
5659 uint64_t aSig0
, aSig1
, savedASig
;
5662 aSig1
= extractFloat128Frac1( a
);
5663 aSig0
= extractFloat128Frac0( a
);
5664 aExp
= extractFloat128Exp( a
);
5665 aSign
= extractFloat128Sign( a
);
5666 aSig0
|= ( aSig1
!= 0 );
5667 if ( 0x401E < aExp
) {
5668 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
5671 else if ( aExp
< 0x3FFF ) {
5672 if (aExp
|| aSig0
) {
5673 status
->float_exception_flags
|= float_flag_inexact
;
5677 aSig0
|= LIT64( 0x0001000000000000 );
5678 shiftCount
= 0x402F - aExp
;
5680 aSig0
>>= shiftCount
;
5682 if ( aSign
) z
= - z
;
5683 if ( ( z
< 0 ) ^ aSign
) {
5685 float_raise(float_flag_invalid
, status
);
5686 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5688 if ( ( aSig0
<<shiftCount
) != savedASig
) {
5689 status
->float_exception_flags
|= float_flag_inexact
;
5695 /*----------------------------------------------------------------------------
5696 | Returns the result of converting the quadruple-precision floating-point
5697 | value `a' to the 64-bit two's complement integer format. The conversion
5698 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5699 | Arithmetic---which means in particular that the conversion is rounded
5700 | according to the current rounding mode. If `a' is a NaN, the largest
5701 | positive integer is returned. Otherwise, if the conversion overflows, the
5702 | largest integer with the same sign as `a' is returned.
5703 *----------------------------------------------------------------------------*/
5705 int64_t float128_to_int64(float128 a
, float_status
*status
)
5708 int32_t aExp
, shiftCount
;
5709 uint64_t aSig0
, aSig1
;
5711 aSig1
= extractFloat128Frac1( a
);
5712 aSig0
= extractFloat128Frac0( a
);
5713 aExp
= extractFloat128Exp( a
);
5714 aSign
= extractFloat128Sign( a
);
5715 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5716 shiftCount
= 0x402F - aExp
;
5717 if ( shiftCount
<= 0 ) {
5718 if ( 0x403E < aExp
) {
5719 float_raise(float_flag_invalid
, status
);
5721 || ( ( aExp
== 0x7FFF )
5722 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
5725 return LIT64( 0x7FFFFFFFFFFFFFFF );
5727 return (int64_t) LIT64( 0x8000000000000000 );
5729 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
5732 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5734 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
5738 /*----------------------------------------------------------------------------
5739 | Returns the result of converting the quadruple-precision floating-point
5740 | value `a' to the 64-bit two's complement integer format. The conversion
5741 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5742 | Arithmetic, except that the conversion is always rounded toward zero.
5743 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5744 | the conversion overflows, the largest integer with the same sign as `a' is
5746 *----------------------------------------------------------------------------*/
5748 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
5751 int32_t aExp
, shiftCount
;
5752 uint64_t aSig0
, aSig1
;
5755 aSig1
= extractFloat128Frac1( a
);
5756 aSig0
= extractFloat128Frac0( a
);
5757 aExp
= extractFloat128Exp( a
);
5758 aSign
= extractFloat128Sign( a
);
5759 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5760 shiftCount
= aExp
- 0x402F;
5761 if ( 0 < shiftCount
) {
5762 if ( 0x403E <= aExp
) {
5763 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
5764 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
5765 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
5767 status
->float_exception_flags
|= float_flag_inexact
;
5771 float_raise(float_flag_invalid
, status
);
5772 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
5773 return LIT64( 0x7FFFFFFFFFFFFFFF );
5776 return (int64_t) LIT64( 0x8000000000000000 );
5778 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
5779 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
5780 status
->float_exception_flags
|= float_flag_inexact
;
5784 if ( aExp
< 0x3FFF ) {
5785 if ( aExp
| aSig0
| aSig1
) {
5786 status
->float_exception_flags
|= float_flag_inexact
;
5790 z
= aSig0
>>( - shiftCount
);
5792 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
5793 status
->float_exception_flags
|= float_flag_inexact
;
5796 if ( aSign
) z
= - z
;
5801 /*----------------------------------------------------------------------------
5802 | Returns the result of converting the quadruple-precision floating-point value
5803 | `a' to the 64-bit unsigned integer format. The conversion is
5804 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5805 | Arithmetic---which means in particular that the conversion is rounded
5806 | according to the current rounding mode. If `a' is a NaN, the largest
5807 | positive integer is returned. If the conversion overflows, the
5808 | largest unsigned integer is returned. If 'a' is negative, the value is
5809 | rounded and zero is returned; negative values that do not round to zero
5810 | will raise the inexact exception.
5811 *----------------------------------------------------------------------------*/
5813 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
5818 uint64_t aSig0
, aSig1
;
5820 aSig0
= extractFloat128Frac0(a
);
5821 aSig1
= extractFloat128Frac1(a
);
5822 aExp
= extractFloat128Exp(a
);
5823 aSign
= extractFloat128Sign(a
);
5824 if (aSign
&& (aExp
> 0x3FFE)) {
5825 float_raise(float_flag_invalid
, status
);
5826 if (float128_is_any_nan(a
)) {
5827 return LIT64(0xFFFFFFFFFFFFFFFF);
5833 aSig0
|= LIT64(0x0001000000000000);
5835 shiftCount
= 0x402F - aExp
;
5836 if (shiftCount
<= 0) {
5837 if (0x403E < aExp
) {
5838 float_raise(float_flag_invalid
, status
);
5839 return LIT64(0xFFFFFFFFFFFFFFFF);
5841 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
5843 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5845 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
5848 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
5851 signed char current_rounding_mode
= status
->float_rounding_mode
;
5853 set_float_rounding_mode(float_round_to_zero
, status
);
5854 v
= float128_to_uint64(a
, status
);
5855 set_float_rounding_mode(current_rounding_mode
, status
);
5860 /*----------------------------------------------------------------------------
5861 | Returns the result of converting the quadruple-precision floating-point
5862 | value `a' to the 32-bit unsigned integer format. The conversion
5863 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5864 | Arithmetic except that the conversion is always rounded toward zero.
5865 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5866 | if the conversion overflows, the largest unsigned integer is returned.
5867 | If 'a' is negative, the value is rounded and zero is returned; negative
5868 | values that do not round to zero will raise the inexact exception.
5869 *----------------------------------------------------------------------------*/
5871 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
5875 int old_exc_flags
= get_float_exception_flags(status
);
5877 v
= float128_to_uint64_round_to_zero(a
, status
);
5878 if (v
> 0xffffffff) {
5883 set_float_exception_flags(old_exc_flags
, status
);
5884 float_raise(float_flag_invalid
, status
);
5888 /*----------------------------------------------------------------------------
5889 | Returns the result of converting the quadruple-precision floating-point
5890 | value `a' to the single-precision floating-point format. The conversion
5891 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5893 *----------------------------------------------------------------------------*/
5895 float32
float128_to_float32(float128 a
, float_status
*status
)
5899 uint64_t aSig0
, aSig1
;
5902 aSig1
= extractFloat128Frac1( a
);
5903 aSig0
= extractFloat128Frac0( a
);
5904 aExp
= extractFloat128Exp( a
);
5905 aSign
= extractFloat128Sign( a
);
5906 if ( aExp
== 0x7FFF ) {
5907 if ( aSig0
| aSig1
) {
5908 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
5910 return packFloat32( aSign
, 0xFF, 0 );
5912 aSig0
|= ( aSig1
!= 0 );
5913 shift64RightJamming( aSig0
, 18, &aSig0
);
5915 if ( aExp
|| zSig
) {
5919 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
5923 /*----------------------------------------------------------------------------
5924 | Returns the result of converting the quadruple-precision floating-point
5925 | value `a' to the double-precision floating-point format. The conversion
5926 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5928 *----------------------------------------------------------------------------*/
5930 float64
float128_to_float64(float128 a
, float_status
*status
)
5934 uint64_t aSig0
, aSig1
;
5936 aSig1
= extractFloat128Frac1( a
);
5937 aSig0
= extractFloat128Frac0( a
);
5938 aExp
= extractFloat128Exp( a
);
5939 aSign
= extractFloat128Sign( a
);
5940 if ( aExp
== 0x7FFF ) {
5941 if ( aSig0
| aSig1
) {
5942 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
5944 return packFloat64( aSign
, 0x7FF, 0 );
5946 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5947 aSig0
|= ( aSig1
!= 0 );
5948 if ( aExp
|| aSig0
) {
5949 aSig0
|= LIT64( 0x4000000000000000 );
5952 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
5956 /*----------------------------------------------------------------------------
5957 | Returns the result of converting the quadruple-precision floating-point
5958 | value `a' to the extended double-precision floating-point format. The
5959 | conversion is performed according to the IEC/IEEE Standard for Binary
5960 | Floating-Point Arithmetic.
5961 *----------------------------------------------------------------------------*/
5963 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
5967 uint64_t aSig0
, aSig1
;
5969 aSig1
= extractFloat128Frac1( a
);
5970 aSig0
= extractFloat128Frac0( a
);
5971 aExp
= extractFloat128Exp( a
);
5972 aSign
= extractFloat128Sign( a
);
5973 if ( aExp
== 0x7FFF ) {
5974 if ( aSig0
| aSig1
) {
5975 return commonNaNToFloatx80(float128ToCommonNaN(a
, status
), status
);
5977 return packFloatx80(aSign
, floatx80_infinity_high
,
5978 floatx80_infinity_low
);
5981 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
5982 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5985 aSig0
|= LIT64( 0x0001000000000000 );
5987 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
5988 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
5992 /*----------------------------------------------------------------------------
5993 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5994 | returns the result as a quadruple-precision floating-point value. The
5995 | operation is performed according to the IEC/IEEE Standard for Binary
5996 | Floating-Point Arithmetic.
5997 *----------------------------------------------------------------------------*/
5999 float128
float128_round_to_int(float128 a
, float_status
*status
)
6003 uint64_t lastBitMask
, roundBitsMask
;
6006 aExp
= extractFloat128Exp( a
);
6007 if ( 0x402F <= aExp
) {
6008 if ( 0x406F <= aExp
) {
6009 if ( ( aExp
== 0x7FFF )
6010 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6012 return propagateFloat128NaN(a
, a
, status
);
6017 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6018 roundBitsMask
= lastBitMask
- 1;
6020 switch (status
->float_rounding_mode
) {
6021 case float_round_nearest_even
:
6022 if ( lastBitMask
) {
6023 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6024 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6027 if ( (int64_t) z
.low
< 0 ) {
6029 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6033 case float_round_ties_away
:
6035 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6037 if ((int64_t) z
.low
< 0) {
6042 case float_round_to_zero
:
6044 case float_round_up
:
6045 if (!extractFloat128Sign(z
)) {
6046 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6049 case float_round_down
:
6050 if (extractFloat128Sign(z
)) {
6051 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6057 z
.low
&= ~ roundBitsMask
;
6060 if ( aExp
< 0x3FFF ) {
6061 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6062 status
->float_exception_flags
|= float_flag_inexact
;
6063 aSign
= extractFloat128Sign( a
);
6064 switch (status
->float_rounding_mode
) {
6065 case float_round_nearest_even
:
6066 if ( ( aExp
== 0x3FFE )
6067 && ( extractFloat128Frac0( a
)
6068 | extractFloat128Frac1( a
) )
6070 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6073 case float_round_ties_away
:
6074 if (aExp
== 0x3FFE) {
6075 return packFloat128(aSign
, 0x3FFF, 0, 0);
6078 case float_round_down
:
6080 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6081 : packFloat128( 0, 0, 0, 0 );
6082 case float_round_up
:
6084 aSign
? packFloat128( 1, 0, 0, 0 )
6085 : packFloat128( 0, 0x3FFF, 0, 0 );
6087 return packFloat128( aSign
, 0, 0, 0 );
6090 lastBitMask
<<= 0x402F - aExp
;
6091 roundBitsMask
= lastBitMask
- 1;
6094 switch (status
->float_rounding_mode
) {
6095 case float_round_nearest_even
:
6096 z
.high
+= lastBitMask
>>1;
6097 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6098 z
.high
&= ~ lastBitMask
;
6101 case float_round_ties_away
:
6102 z
.high
+= lastBitMask
>>1;
6104 case float_round_to_zero
:
6106 case float_round_up
:
6107 if (!extractFloat128Sign(z
)) {
6108 z
.high
|= ( a
.low
!= 0 );
6109 z
.high
+= roundBitsMask
;
6112 case float_round_down
:
6113 if (extractFloat128Sign(z
)) {
6114 z
.high
|= (a
.low
!= 0);
6115 z
.high
+= roundBitsMask
;
6121 z
.high
&= ~ roundBitsMask
;
6123 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6124 status
->float_exception_flags
|= float_flag_inexact
;
6130 /*----------------------------------------------------------------------------
6131 | Returns the result of adding the absolute values of the quadruple-precision
6132 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6133 | before being returned. `zSign' is ignored if the result is a NaN.
6134 | The addition is performed according to the IEC/IEEE Standard for Binary
6135 | Floating-Point Arithmetic.
6136 *----------------------------------------------------------------------------*/
6138 static float128
addFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6139 float_status
*status
)
6141 int32_t aExp
, bExp
, zExp
;
6142 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6145 aSig1
= extractFloat128Frac1( a
);
6146 aSig0
= extractFloat128Frac0( a
);
6147 aExp
= extractFloat128Exp( a
);
6148 bSig1
= extractFloat128Frac1( b
);
6149 bSig0
= extractFloat128Frac0( b
);
6150 bExp
= extractFloat128Exp( b
);
6151 expDiff
= aExp
- bExp
;
6152 if ( 0 < expDiff
) {
6153 if ( aExp
== 0x7FFF ) {
6154 if (aSig0
| aSig1
) {
6155 return propagateFloat128NaN(a
, b
, status
);
6163 bSig0
|= LIT64( 0x0001000000000000 );
6165 shift128ExtraRightJamming(
6166 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6169 else if ( expDiff
< 0 ) {
6170 if ( bExp
== 0x7FFF ) {
6171 if (bSig0
| bSig1
) {
6172 return propagateFloat128NaN(a
, b
, status
);
6174 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6180 aSig0
|= LIT64( 0x0001000000000000 );
6182 shift128ExtraRightJamming(
6183 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6187 if ( aExp
== 0x7FFF ) {
6188 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6189 return propagateFloat128NaN(a
, b
, status
);
6193 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6195 if (status
->flush_to_zero
) {
6196 if (zSig0
| zSig1
) {
6197 float_raise(float_flag_output_denormal
, status
);
6199 return packFloat128(zSign
, 0, 0, 0);
6201 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6204 zSig0
|= LIT64( 0x0002000000000000 );
6208 aSig0
|= LIT64( 0x0001000000000000 );
6209 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6211 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
6214 shift128ExtraRightJamming(
6215 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6217 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6221 /*----------------------------------------------------------------------------
6222 | Returns the result of subtracting the absolute values of the quadruple-
6223 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6224 | difference is negated before being returned. `zSign' is ignored if the
6225 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6226 | Standard for Binary Floating-Point Arithmetic.
6227 *----------------------------------------------------------------------------*/
6229 static float128
subFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6230 float_status
*status
)
6232 int32_t aExp
, bExp
, zExp
;
6233 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6236 aSig1
= extractFloat128Frac1( a
);
6237 aSig0
= extractFloat128Frac0( a
);
6238 aExp
= extractFloat128Exp( a
);
6239 bSig1
= extractFloat128Frac1( b
);
6240 bSig0
= extractFloat128Frac0( b
);
6241 bExp
= extractFloat128Exp( b
);
6242 expDiff
= aExp
- bExp
;
6243 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6244 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6245 if ( 0 < expDiff
) goto aExpBigger
;
6246 if ( expDiff
< 0 ) goto bExpBigger
;
6247 if ( aExp
== 0x7FFF ) {
6248 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6249 return propagateFloat128NaN(a
, b
, status
);
6251 float_raise(float_flag_invalid
, status
);
6252 return float128_default_nan(status
);
6258 if ( bSig0
< aSig0
) goto aBigger
;
6259 if ( aSig0
< bSig0
) goto bBigger
;
6260 if ( bSig1
< aSig1
) goto aBigger
;
6261 if ( aSig1
< bSig1
) goto bBigger
;
6262 return packFloat128(status
->float_rounding_mode
== float_round_down
,
6265 if ( bExp
== 0x7FFF ) {
6266 if (bSig0
| bSig1
) {
6267 return propagateFloat128NaN(a
, b
, status
);
6269 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
6275 aSig0
|= LIT64( 0x4000000000000000 );
6277 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6278 bSig0
|= LIT64( 0x4000000000000000 );
6280 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6283 goto normalizeRoundAndPack
;
6285 if ( aExp
== 0x7FFF ) {
6286 if (aSig0
| aSig1
) {
6287 return propagateFloat128NaN(a
, b
, status
);
6295 bSig0
|= LIT64( 0x4000000000000000 );
6297 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
6298 aSig0
|= LIT64( 0x4000000000000000 );
6300 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6302 normalizeRoundAndPack
:
6304 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
6309 /*----------------------------------------------------------------------------
6310 | Returns the result of adding the quadruple-precision floating-point values
6311 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6312 | for Binary Floating-Point Arithmetic.
6313 *----------------------------------------------------------------------------*/
6315 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
6319 aSign
= extractFloat128Sign( a
);
6320 bSign
= extractFloat128Sign( b
);
6321 if ( aSign
== bSign
) {
6322 return addFloat128Sigs(a
, b
, aSign
, status
);
6325 return subFloat128Sigs(a
, b
, aSign
, status
);
6330 /*----------------------------------------------------------------------------
6331 | Returns the result of subtracting the quadruple-precision floating-point
6332 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6333 | Standard for Binary Floating-Point Arithmetic.
6334 *----------------------------------------------------------------------------*/
6336 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
6340 aSign
= extractFloat128Sign( a
);
6341 bSign
= extractFloat128Sign( b
);
6342 if ( aSign
== bSign
) {
6343 return subFloat128Sigs(a
, b
, aSign
, status
);
6346 return addFloat128Sigs(a
, b
, aSign
, status
);
6351 /*----------------------------------------------------------------------------
6352 | Returns the result of multiplying the quadruple-precision floating-point
6353 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6354 | Standard for Binary Floating-Point Arithmetic.
6355 *----------------------------------------------------------------------------*/
6357 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
6359 flag aSign
, bSign
, zSign
;
6360 int32_t aExp
, bExp
, zExp
;
6361 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
6363 aSig1
= extractFloat128Frac1( a
);
6364 aSig0
= extractFloat128Frac0( a
);
6365 aExp
= extractFloat128Exp( a
);
6366 aSign
= extractFloat128Sign( a
);
6367 bSig1
= extractFloat128Frac1( b
);
6368 bSig0
= extractFloat128Frac0( b
);
6369 bExp
= extractFloat128Exp( b
);
6370 bSign
= extractFloat128Sign( b
);
6371 zSign
= aSign
^ bSign
;
6372 if ( aExp
== 0x7FFF ) {
6373 if ( ( aSig0
| aSig1
)
6374 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6375 return propagateFloat128NaN(a
, b
, status
);
6377 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
6378 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6380 if ( bExp
== 0x7FFF ) {
6381 if (bSig0
| bSig1
) {
6382 return propagateFloat128NaN(a
, b
, status
);
6384 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6386 float_raise(float_flag_invalid
, status
);
6387 return float128_default_nan(status
);
6389 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6392 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6393 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6396 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6397 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6399 zExp
= aExp
+ bExp
- 0x4000;
6400 aSig0
|= LIT64( 0x0001000000000000 );
6401 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
6402 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
6403 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6404 zSig2
|= ( zSig3
!= 0 );
6405 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
6406 shift128ExtraRightJamming(
6407 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6410 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6414 /*----------------------------------------------------------------------------
6415 | Returns the result of dividing the quadruple-precision floating-point value
6416 | `a' by the corresponding value `b'. The operation is performed according to
6417 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6418 *----------------------------------------------------------------------------*/
6420 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
6422 flag aSign
, bSign
, zSign
;
6423 int32_t aExp
, bExp
, zExp
;
6424 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6425 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6427 aSig1
= extractFloat128Frac1( a
);
6428 aSig0
= extractFloat128Frac0( a
);
6429 aExp
= extractFloat128Exp( a
);
6430 aSign
= extractFloat128Sign( a
);
6431 bSig1
= extractFloat128Frac1( b
);
6432 bSig0
= extractFloat128Frac0( b
);
6433 bExp
= extractFloat128Exp( b
);
6434 bSign
= extractFloat128Sign( b
);
6435 zSign
= aSign
^ bSign
;
6436 if ( aExp
== 0x7FFF ) {
6437 if (aSig0
| aSig1
) {
6438 return propagateFloat128NaN(a
, b
, status
);
6440 if ( bExp
== 0x7FFF ) {
6441 if (bSig0
| bSig1
) {
6442 return propagateFloat128NaN(a
, b
, status
);
6446 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6448 if ( bExp
== 0x7FFF ) {
6449 if (bSig0
| bSig1
) {
6450 return propagateFloat128NaN(a
, b
, status
);
6452 return packFloat128( zSign
, 0, 0, 0 );
6455 if ( ( bSig0
| bSig1
) == 0 ) {
6456 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6458 float_raise(float_flag_invalid
, status
);
6459 return float128_default_nan(status
);
6461 float_raise(float_flag_divbyzero
, status
);
6462 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6464 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6467 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6468 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6470 zExp
= aExp
- bExp
+ 0x3FFD;
6472 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
6474 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6475 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
6476 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
6479 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6480 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
6481 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
6482 while ( (int64_t) rem0
< 0 ) {
6484 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
6486 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
6487 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
6488 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
6489 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
6490 while ( (int64_t) rem1
< 0 ) {
6492 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
6494 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6496 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
6497 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6501 /*----------------------------------------------------------------------------
6502 | Returns the remainder of the quadruple-precision floating-point value `a'
6503 | with respect to the corresponding value `b'. The operation is performed
6504 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6505 *----------------------------------------------------------------------------*/
6507 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
6510 int32_t aExp
, bExp
, expDiff
;
6511 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6512 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6515 aSig1
= extractFloat128Frac1( a
);
6516 aSig0
= extractFloat128Frac0( a
);
6517 aExp
= extractFloat128Exp( a
);
6518 aSign
= extractFloat128Sign( a
);
6519 bSig1
= extractFloat128Frac1( b
);
6520 bSig0
= extractFloat128Frac0( b
);
6521 bExp
= extractFloat128Exp( b
);
6522 if ( aExp
== 0x7FFF ) {
6523 if ( ( aSig0
| aSig1
)
6524 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6525 return propagateFloat128NaN(a
, b
, status
);
6529 if ( bExp
== 0x7FFF ) {
6530 if (bSig0
| bSig1
) {
6531 return propagateFloat128NaN(a
, b
, status
);
6536 if ( ( bSig0
| bSig1
) == 0 ) {
6538 float_raise(float_flag_invalid
, status
);
6539 return float128_default_nan(status
);
6541 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6544 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6545 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6547 expDiff
= aExp
- bExp
;
6548 if ( expDiff
< -1 ) return a
;
6550 aSig0
| LIT64( 0x0001000000000000 ),
6552 15 - ( expDiff
< 0 ),
6557 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6558 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6559 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6561 while ( 0 < expDiff
) {
6562 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6563 q
= ( 4 < q
) ? q
- 4 : 0;
6564 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6565 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6566 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6567 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6570 if ( -64 < expDiff
) {
6571 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6572 q
= ( 4 < q
) ? q
- 4 : 0;
6574 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6576 if ( expDiff
< 0 ) {
6577 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6580 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6582 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6583 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6586 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6587 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6590 alternateASig0
= aSig0
;
6591 alternateASig1
= aSig1
;
6593 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6594 } while ( 0 <= (int64_t) aSig0
);
6596 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6597 if ( ( sigMean0
< 0 )
6598 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6599 aSig0
= alternateASig0
;
6600 aSig1
= alternateASig1
;
6602 zSign
= ( (int64_t) aSig0
< 0 );
6603 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6604 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
6608 /*----------------------------------------------------------------------------
6609 | Returns the square root of the quadruple-precision floating-point value `a'.
6610 | The operation is performed according to the IEC/IEEE Standard for Binary
6611 | Floating-Point Arithmetic.
6612 *----------------------------------------------------------------------------*/
6614 float128
float128_sqrt(float128 a
, float_status
*status
)
6618 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6619 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6621 aSig1
= extractFloat128Frac1( a
);
6622 aSig0
= extractFloat128Frac0( a
);
6623 aExp
= extractFloat128Exp( a
);
6624 aSign
= extractFloat128Sign( a
);
6625 if ( aExp
== 0x7FFF ) {
6626 if (aSig0
| aSig1
) {
6627 return propagateFloat128NaN(a
, a
, status
);
6629 if ( ! aSign
) return a
;
6633 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6635 float_raise(float_flag_invalid
, status
);
6636 return float128_default_nan(status
);
6639 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6640 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6642 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6643 aSig0
|= LIT64( 0x0001000000000000 );
6644 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6645 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6646 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6647 doubleZSig0
= zSig0
<<1;
6648 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6649 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6650 while ( (int64_t) rem0
< 0 ) {
6653 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6655 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6656 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
6657 if ( zSig1
== 0 ) zSig1
= 1;
6658 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6659 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6660 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6661 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6662 while ( (int64_t) rem1
< 0 ) {
6664 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6666 term2
|= doubleZSig0
;
6667 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6669 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6671 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
6672 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
6676 /*----------------------------------------------------------------------------
6677 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6678 | the corresponding value `b', and 0 otherwise. The invalid exception is
6679 | raised if either operand is a NaN. Otherwise, the comparison is performed
6680 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6681 *----------------------------------------------------------------------------*/
6683 int float128_eq(float128 a
, float128 b
, float_status
*status
)
6686 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6687 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6688 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6689 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6691 float_raise(float_flag_invalid
, status
);
6696 && ( ( a
.high
== b
.high
)
6698 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6703 /*----------------------------------------------------------------------------
6704 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6705 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6706 | exception is raised if either operand is a NaN. The comparison is performed
6707 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6708 *----------------------------------------------------------------------------*/
6710 int float128_le(float128 a
, float128 b
, float_status
*status
)
6714 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6715 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6716 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6717 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6719 float_raise(float_flag_invalid
, status
);
6722 aSign
= extractFloat128Sign( a
);
6723 bSign
= extractFloat128Sign( b
);
6724 if ( aSign
!= bSign
) {
6727 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6731 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6732 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6736 /*----------------------------------------------------------------------------
6737 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6738 | the corresponding value `b', and 0 otherwise. The invalid exception is
6739 | raised if either operand is a NaN. The comparison is performed according
6740 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6741 *----------------------------------------------------------------------------*/
6743 int float128_lt(float128 a
, float128 b
, float_status
*status
)
6747 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6748 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6749 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6750 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6752 float_raise(float_flag_invalid
, status
);
6755 aSign
= extractFloat128Sign( a
);
6756 bSign
= extractFloat128Sign( b
);
6757 if ( aSign
!= bSign
) {
6760 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6764 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6765 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6769 /*----------------------------------------------------------------------------
6770 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6771 | be compared, and 0 otherwise. The invalid exception is raised if either
6772 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6773 | Standard for Binary Floating-Point Arithmetic.
6774 *----------------------------------------------------------------------------*/
6776 int float128_unordered(float128 a
, float128 b
, float_status
*status
)
6778 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6779 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6780 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6781 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6783 float_raise(float_flag_invalid
, status
);
6789 /*----------------------------------------------------------------------------
6790 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6791 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6792 | exception. The comparison is performed according to the IEC/IEEE Standard
6793 | for Binary Floating-Point Arithmetic.
6794 *----------------------------------------------------------------------------*/
6796 int float128_eq_quiet(float128 a
, float128 b
, float_status
*status
)
6799 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6800 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6801 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6802 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6804 if (float128_is_signaling_nan(a
, status
)
6805 || float128_is_signaling_nan(b
, status
)) {
6806 float_raise(float_flag_invalid
, status
);
6812 && ( ( a
.high
== b
.high
)
6814 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6819 /*----------------------------------------------------------------------------
6820 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6821 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6822 | cause an exception. Otherwise, the comparison is performed according to the
6823 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6824 *----------------------------------------------------------------------------*/
6826 int float128_le_quiet(float128 a
, float128 b
, float_status
*status
)
6830 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6831 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6832 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6833 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6835 if (float128_is_signaling_nan(a
, status
)
6836 || float128_is_signaling_nan(b
, status
)) {
6837 float_raise(float_flag_invalid
, status
);
6841 aSign
= extractFloat128Sign( a
);
6842 bSign
= extractFloat128Sign( b
);
6843 if ( aSign
!= bSign
) {
6846 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6850 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6851 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6855 /*----------------------------------------------------------------------------
6856 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6857 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6858 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6859 | Standard for Binary Floating-Point Arithmetic.
6860 *----------------------------------------------------------------------------*/
6862 int float128_lt_quiet(float128 a
, float128 b
, float_status
*status
)
6866 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6867 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6868 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6869 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6871 if (float128_is_signaling_nan(a
, status
)
6872 || float128_is_signaling_nan(b
, status
)) {
6873 float_raise(float_flag_invalid
, status
);
6877 aSign
= extractFloat128Sign( a
);
6878 bSign
= extractFloat128Sign( b
);
6879 if ( aSign
!= bSign
) {
6882 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6886 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6887 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6891 /*----------------------------------------------------------------------------
6892 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6893 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6894 | comparison is performed according to the IEC/IEEE Standard for Binary
6895 | Floating-Point Arithmetic.
6896 *----------------------------------------------------------------------------*/
6898 int float128_unordered_quiet(float128 a
, float128 b
, float_status
*status
)
6900 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6901 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6902 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6903 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6905 if (float128_is_signaling_nan(a
, status
)
6906 || float128_is_signaling_nan(b
, status
)) {
6907 float_raise(float_flag_invalid
, status
);
6914 static inline int floatx80_compare_internal(floatx80 a
, floatx80 b
,
6915 int is_quiet
, float_status
*status
)
6919 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6920 float_raise(float_flag_invalid
, status
);
6921 return float_relation_unordered
;
6923 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
6924 ( extractFloatx80Frac( a
)<<1 ) ) ||
6925 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
6926 ( extractFloatx80Frac( b
)<<1 ) )) {
6928 floatx80_is_signaling_nan(a
, status
) ||
6929 floatx80_is_signaling_nan(b
, status
)) {
6930 float_raise(float_flag_invalid
, status
);
6932 return float_relation_unordered
;
6934 aSign
= extractFloatx80Sign( a
);
6935 bSign
= extractFloatx80Sign( b
);
6936 if ( aSign
!= bSign
) {
6938 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
6939 ( ( a
.low
| b
.low
) == 0 ) ) {
6941 return float_relation_equal
;
6943 return 1 - (2 * aSign
);
6946 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6947 return float_relation_equal
;
6949 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6954 int floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
6956 return floatx80_compare_internal(a
, b
, 0, status
);
6959 int floatx80_compare_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6961 return floatx80_compare_internal(a
, b
, 1, status
);
6964 static inline int float128_compare_internal(float128 a
, float128 b
,
6965 int is_quiet
, float_status
*status
)
6969 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
6970 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
6971 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
6972 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
6974 float128_is_signaling_nan(a
, status
) ||
6975 float128_is_signaling_nan(b
, status
)) {
6976 float_raise(float_flag_invalid
, status
);
6978 return float_relation_unordered
;
6980 aSign
= extractFloat128Sign( a
);
6981 bSign
= extractFloat128Sign( b
);
6982 if ( aSign
!= bSign
) {
6983 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
6985 return float_relation_equal
;
6987 return 1 - (2 * aSign
);
6990 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6991 return float_relation_equal
;
6993 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6998 int float128_compare(float128 a
, float128 b
, float_status
*status
)
7000 return float128_compare_internal(a
, b
, 0, status
);
7003 int float128_compare_quiet(float128 a
, float128 b
, float_status
*status
)
7005 return float128_compare_internal(a
, b
, 1, status
);
7008 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7014 if (floatx80_invalid_encoding(a
)) {
7015 float_raise(float_flag_invalid
, status
);
7016 return floatx80_default_nan(status
);
7018 aSig
= extractFloatx80Frac( a
);
7019 aExp
= extractFloatx80Exp( a
);
7020 aSign
= extractFloatx80Sign( a
);
7022 if ( aExp
== 0x7FFF ) {
7024 return propagateFloatx80NaN(a
, a
, status
);
7038 } else if (n
< -0x10000) {
7043 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7044 aSign
, aExp
, aSig
, 0, status
);
7047 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7051 uint64_t aSig0
, aSig1
;
7053 aSig1
= extractFloat128Frac1( a
);
7054 aSig0
= extractFloat128Frac0( a
);
7055 aExp
= extractFloat128Exp( a
);
7056 aSign
= extractFloat128Sign( a
);
7057 if ( aExp
== 0x7FFF ) {
7058 if ( aSig0
| aSig1
) {
7059 return propagateFloat128NaN(a
, a
, status
);
7064 aSig0
|= LIT64( 0x0001000000000000 );
7065 } else if (aSig0
== 0 && aSig1
== 0) {
7073 } else if (n
< -0x10000) {
7078 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1