4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Returns the fraction bits of the half-precision floating-point value `a'.
100 *----------------------------------------------------------------------------*/
102 static inline uint32_t extractFloat16Frac(float16 a
)
104 return float16_val(a
) & 0x3ff;
107 /*----------------------------------------------------------------------------
108 | Returns the exponent bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
111 static inline int extractFloat16Exp(float16 a
)
113 return (float16_val(a
) >> 10) & 0x1f;
116 /*----------------------------------------------------------------------------
117 | Returns the fraction bits of the single-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
120 static inline uint32_t extractFloat32Frac(float32 a
)
122 return float32_val(a
) & 0x007FFFFF;
125 /*----------------------------------------------------------------------------
126 | Returns the exponent bits of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
129 static inline int extractFloat32Exp(float32 a
)
131 return (float32_val(a
) >> 23) & 0xFF;
134 /*----------------------------------------------------------------------------
135 | Returns the sign bit of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
138 static inline flag
extractFloat32Sign(float32 a
)
140 return float32_val(a
) >> 31;
143 /*----------------------------------------------------------------------------
144 | Returns the fraction bits of the double-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
147 static inline uint64_t extractFloat64Frac(float64 a
)
149 return float64_val(a
) & LIT64(0x000FFFFFFFFFFFFF);
152 /*----------------------------------------------------------------------------
153 | Returns the exponent bits of the double-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
156 static inline int extractFloat64Exp(float64 a
)
158 return (float64_val(a
) >> 52) & 0x7FF;
161 /*----------------------------------------------------------------------------
162 | Returns the sign bit of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
165 static inline flag
extractFloat64Sign(float64 a
)
167 return float64_val(a
) >> 63;
171 * Classify a floating point number. Everything above float_class_qnan
172 * is a NaN so cls >= float_class_qnan is any NaN.
175 typedef enum __attribute__ ((__packed__
)) {
176 float_class_unclassified
,
180 float_class_qnan
, /* all NaNs from here */
184 /* Simple helpers for checking if, or what kind of, NaN we have */
185 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
187 return unlikely(c
>= float_class_qnan
);
190 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
192 return c
== float_class_snan
;
195 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
197 return c
== float_class_qnan
;
201 * Structure holding all of the decomposed parts of a float. The
202 * exponent is unbiased and the fraction is normalized. All
203 * calculations are done with a 64 bit fraction and then rounded as
204 * appropriate for the final format.
206 * Thanks to the packed FloatClass a decent compiler should be able to
207 * fit the whole structure into registers and avoid using the stack
208 * for parameter passing.
218 #define DECOMPOSED_BINARY_POINT (64 - 2)
219 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
220 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
222 /* Structure holding all of the relevant parameters for a format.
223 * exp_size: the size of the exponent field
224 * exp_bias: the offset applied to the exponent field
225 * exp_max: the maximum normalised exponent
226 * frac_size: the size of the fraction field
227 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
228 * The following are computed based the size of fraction
229 * frac_lsb: least significant bit of fraction
230 * frac_lsbm1: the bit below the least significant bit (for rounding)
231 * round_mask/roundeven_mask: masks used for rounding
232 * The following optional modifiers are available:
233 * arm_althp: handle ARM Alternative Half Precision
244 uint64_t roundeven_mask
;
248 /* Expand fields based on the size of exponent and fraction */
249 #define FLOAT_PARAMS(E, F) \
251 .exp_bias = ((1 << E) - 1) >> 1, \
252 .exp_max = (1 << E) - 1, \
254 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
255 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
256 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
257 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
258 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
260 static const FloatFmt float16_params
= {
264 static const FloatFmt float16_params_ahp
= {
269 static const FloatFmt float32_params
= {
273 static const FloatFmt float64_params
= {
277 /* Unpack a float to parts, but do not canonicalize. */
278 static inline FloatParts
unpack_raw(FloatFmt fmt
, uint64_t raw
)
280 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
282 return (FloatParts
) {
283 .cls
= float_class_unclassified
,
284 .sign
= extract64(raw
, sign_pos
, 1),
285 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
286 .frac
= extract64(raw
, 0, fmt
.frac_size
),
290 static inline FloatParts
float16_unpack_raw(float16 f
)
292 return unpack_raw(float16_params
, f
);
295 static inline FloatParts
float32_unpack_raw(float32 f
)
297 return unpack_raw(float32_params
, f
);
300 static inline FloatParts
float64_unpack_raw(float64 f
)
302 return unpack_raw(float64_params
, f
);
305 /* Pack a float from parts, but do not canonicalize. */
306 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts p
)
308 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
309 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
310 return deposit64(ret
, sign_pos
, 1, p
.sign
);
313 static inline float16
float16_pack_raw(FloatParts p
)
315 return make_float16(pack_raw(float16_params
, p
));
318 static inline float32
float32_pack_raw(FloatParts p
)
320 return make_float32(pack_raw(float32_params
, p
));
323 static inline float64
float64_pack_raw(FloatParts p
)
325 return make_float64(pack_raw(float64_params
, p
));
328 /*----------------------------------------------------------------------------
329 | Functions and definitions to determine: (1) whether tininess for underflow
330 | is detected before or after rounding by default, (2) what (if anything)
331 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
332 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
333 | are propagated from function inputs to output. These details are target-
335 *----------------------------------------------------------------------------*/
336 #include "softfloat-specialize.h"
338 /* Canonicalize EXP and FRAC, setting CLS. */
339 static FloatParts
canonicalize(FloatParts part
, const FloatFmt
*parm
,
340 float_status
*status
)
342 if (part
.exp
== parm
->exp_max
&& !parm
->arm_althp
) {
343 if (part
.frac
== 0) {
344 part
.cls
= float_class_inf
;
346 part
.frac
<<= parm
->frac_shift
;
347 part
.cls
= (parts_is_snan_frac(part
.frac
, status
)
348 ? float_class_snan
: float_class_qnan
);
350 } else if (part
.exp
== 0) {
351 if (likely(part
.frac
== 0)) {
352 part
.cls
= float_class_zero
;
353 } else if (status
->flush_inputs_to_zero
) {
354 float_raise(float_flag_input_denormal
, status
);
355 part
.cls
= float_class_zero
;
358 int shift
= clz64(part
.frac
) - 1;
359 part
.cls
= float_class_normal
;
360 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
364 part
.cls
= float_class_normal
;
365 part
.exp
-= parm
->exp_bias
;
366 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
371 /* Round and uncanonicalize a floating-point number by parts. There
372 * are FRAC_SHIFT bits that may require rounding at the bottom of the
373 * fraction; these bits will be removed. The exponent will be biased
374 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
377 static FloatParts
round_canonical(FloatParts p
, float_status
*s
,
378 const FloatFmt
*parm
)
380 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
381 const uint64_t round_mask
= parm
->round_mask
;
382 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
383 const int exp_max
= parm
->exp_max
;
384 const int frac_shift
= parm
->frac_shift
;
393 case float_class_normal
:
394 switch (s
->float_rounding_mode
) {
395 case float_round_nearest_even
:
396 overflow_norm
= false;
397 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
399 case float_round_ties_away
:
400 overflow_norm
= false;
403 case float_round_to_zero
:
404 overflow_norm
= true;
408 inc
= p
.sign
? 0 : round_mask
;
409 overflow_norm
= p
.sign
;
411 case float_round_down
:
412 inc
= p
.sign
? round_mask
: 0;
413 overflow_norm
= !p
.sign
;
416 g_assert_not_reached();
419 exp
+= parm
->exp_bias
;
420 if (likely(exp
> 0)) {
421 if (frac
& round_mask
) {
422 flags
|= float_flag_inexact
;
424 if (frac
& DECOMPOSED_OVERFLOW_BIT
) {
431 if (parm
->arm_althp
) {
432 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
433 if (unlikely(exp
> exp_max
)) {
434 /* Overflow. Return the maximum normal. */
435 flags
= float_flag_invalid
;
439 } else if (unlikely(exp
>= exp_max
)) {
440 flags
|= float_flag_overflow
| float_flag_inexact
;
445 p
.cls
= float_class_inf
;
449 } else if (s
->flush_to_zero
) {
450 flags
|= float_flag_output_denormal
;
451 p
.cls
= float_class_zero
;
454 bool is_tiny
= (s
->float_detect_tininess
455 == float_tininess_before_rounding
)
457 || !((frac
+ inc
) & DECOMPOSED_OVERFLOW_BIT
);
459 shift64RightJamming(frac
, 1 - exp
, &frac
);
460 if (frac
& round_mask
) {
461 /* Need to recompute round-to-even. */
462 if (s
->float_rounding_mode
== float_round_nearest_even
) {
463 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
466 flags
|= float_flag_inexact
;
470 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
473 if (is_tiny
&& (flags
& float_flag_inexact
)) {
474 flags
|= float_flag_underflow
;
476 if (exp
== 0 && frac
== 0) {
477 p
.cls
= float_class_zero
;
482 case float_class_zero
:
488 case float_class_inf
:
490 assert(!parm
->arm_althp
);
495 case float_class_qnan
:
496 case float_class_snan
:
497 assert(!parm
->arm_althp
);
499 frac
>>= parm
->frac_shift
;
503 g_assert_not_reached();
506 float_raise(flags
, s
);
512 /* Explicit FloatFmt version */
513 static FloatParts
float16a_unpack_canonical(float16 f
, float_status
*s
,
514 const FloatFmt
*params
)
516 return canonicalize(float16_unpack_raw(f
), params
, s
);
519 static FloatParts
float16_unpack_canonical(float16 f
, float_status
*s
)
521 return float16a_unpack_canonical(f
, s
, &float16_params
);
524 static float16
float16a_round_pack_canonical(FloatParts p
, float_status
*s
,
525 const FloatFmt
*params
)
527 return float16_pack_raw(round_canonical(p
, s
, params
));
530 static float16
float16_round_pack_canonical(FloatParts p
, float_status
*s
)
532 return float16a_round_pack_canonical(p
, s
, &float16_params
);
535 static FloatParts
float32_unpack_canonical(float32 f
, float_status
*s
)
537 return canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
540 static float32
float32_round_pack_canonical(FloatParts p
, float_status
*s
)
542 return float32_pack_raw(round_canonical(p
, s
, &float32_params
));
545 static FloatParts
float64_unpack_canonical(float64 f
, float_status
*s
)
547 return canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
550 static float64
float64_round_pack_canonical(FloatParts p
, float_status
*s
)
552 return float64_pack_raw(round_canonical(p
, s
, &float64_params
));
555 static FloatParts
return_nan(FloatParts a
, float_status
*s
)
558 case float_class_snan
:
559 s
->float_exception_flags
|= float_flag_invalid
;
560 a
= parts_silence_nan(a
, s
);
562 case float_class_qnan
:
563 if (s
->default_nan_mode
) {
564 return parts_default_nan(s
);
569 g_assert_not_reached();
574 static FloatParts
pick_nan(FloatParts a
, FloatParts b
, float_status
*s
)
576 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
577 s
->float_exception_flags
|= float_flag_invalid
;
580 if (s
->default_nan_mode
) {
581 return parts_default_nan(s
);
583 if (pickNaN(a
.cls
, b
.cls
,
585 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
))) {
588 if (is_snan(a
.cls
)) {
589 return parts_silence_nan(a
, s
);
595 static FloatParts
pick_nan_muladd(FloatParts a
, FloatParts b
, FloatParts c
,
596 bool inf_zero
, float_status
*s
)
600 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
601 s
->float_exception_flags
|= float_flag_invalid
;
604 which
= pickNaNMulAdd(a
.cls
, b
.cls
, c
.cls
, inf_zero
, s
);
606 if (s
->default_nan_mode
) {
607 /* Note that this check is after pickNaNMulAdd so that function
608 * has an opportunity to set the Invalid flag.
623 return parts_default_nan(s
);
625 g_assert_not_reached();
628 if (is_snan(a
.cls
)) {
629 return parts_silence_nan(a
, s
);
635 * Returns the result of adding or subtracting the values of the
636 * floating-point values `a' and `b'. The operation is performed
637 * according to the IEC/IEEE Standard for Binary Floating-Point
641 static FloatParts
addsub_floats(FloatParts a
, FloatParts b
, bool subtract
,
644 bool a_sign
= a
.sign
;
645 bool b_sign
= b
.sign
^ subtract
;
647 if (a_sign
!= b_sign
) {
650 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
651 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
652 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
653 a
.frac
= a
.frac
- b
.frac
;
655 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
656 a
.frac
= b
.frac
- a
.frac
;
662 a
.cls
= float_class_zero
;
663 a
.sign
= s
->float_rounding_mode
== float_round_down
;
665 int shift
= clz64(a
.frac
) - 1;
666 a
.frac
= a
.frac
<< shift
;
667 a
.exp
= a
.exp
- shift
;
672 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
673 return pick_nan(a
, b
, s
);
675 if (a
.cls
== float_class_inf
) {
676 if (b
.cls
== float_class_inf
) {
677 float_raise(float_flag_invalid
, s
);
678 return parts_default_nan(s
);
682 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
683 a
.sign
= s
->float_rounding_mode
== float_round_down
;
686 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
690 if (b
.cls
== float_class_zero
) {
695 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
697 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
698 } else if (a
.exp
< b
.exp
) {
699 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
703 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
704 shift64RightJamming(a
.frac
, 1, &a
.frac
);
709 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
710 return pick_nan(a
, b
, s
);
712 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
715 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
720 g_assert_not_reached();
724 * Returns the result of adding or subtracting the floating-point
725 * values `a' and `b'. The operation is performed according to the
726 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
729 float16
__attribute__((flatten
)) float16_add(float16 a
, float16 b
,
730 float_status
*status
)
732 FloatParts pa
= float16_unpack_canonical(a
, status
);
733 FloatParts pb
= float16_unpack_canonical(b
, status
);
734 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
736 return float16_round_pack_canonical(pr
, status
);
739 float32
__attribute__((flatten
)) float32_add(float32 a
, float32 b
,
740 float_status
*status
)
742 FloatParts pa
= float32_unpack_canonical(a
, status
);
743 FloatParts pb
= float32_unpack_canonical(b
, status
);
744 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
746 return float32_round_pack_canonical(pr
, status
);
749 float64
__attribute__((flatten
)) float64_add(float64 a
, float64 b
,
750 float_status
*status
)
752 FloatParts pa
= float64_unpack_canonical(a
, status
);
753 FloatParts pb
= float64_unpack_canonical(b
, status
);
754 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
756 return float64_round_pack_canonical(pr
, status
);
759 float16
__attribute__((flatten
)) float16_sub(float16 a
, float16 b
,
760 float_status
*status
)
762 FloatParts pa
= float16_unpack_canonical(a
, status
);
763 FloatParts pb
= float16_unpack_canonical(b
, status
);
764 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
766 return float16_round_pack_canonical(pr
, status
);
769 float32
__attribute__((flatten
)) float32_sub(float32 a
, float32 b
,
770 float_status
*status
)
772 FloatParts pa
= float32_unpack_canonical(a
, status
);
773 FloatParts pb
= float32_unpack_canonical(b
, status
);
774 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
776 return float32_round_pack_canonical(pr
, status
);
779 float64
__attribute__((flatten
)) float64_sub(float64 a
, float64 b
,
780 float_status
*status
)
782 FloatParts pa
= float64_unpack_canonical(a
, status
);
783 FloatParts pb
= float64_unpack_canonical(b
, status
);
784 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
786 return float64_round_pack_canonical(pr
, status
);
790 * Returns the result of multiplying the floating-point values `a' and
791 * `b'. The operation is performed according to the IEC/IEEE Standard
792 * for Binary Floating-Point Arithmetic.
795 static FloatParts
mul_floats(FloatParts a
, FloatParts b
, float_status
*s
)
797 bool sign
= a
.sign
^ b
.sign
;
799 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
801 int exp
= a
.exp
+ b
.exp
;
803 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
804 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
805 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
806 shift64RightJamming(lo
, 1, &lo
);
816 /* handle all the NaN cases */
817 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
818 return pick_nan(a
, b
, s
);
820 /* Inf * Zero == NaN */
821 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
822 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
823 s
->float_exception_flags
|= float_flag_invalid
;
824 return parts_default_nan(s
);
826 /* Multiply by 0 or Inf */
827 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
831 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
835 g_assert_not_reached();
838 float16
__attribute__((flatten
)) float16_mul(float16 a
, float16 b
,
839 float_status
*status
)
841 FloatParts pa
= float16_unpack_canonical(a
, status
);
842 FloatParts pb
= float16_unpack_canonical(b
, status
);
843 FloatParts pr
= mul_floats(pa
, pb
, status
);
845 return float16_round_pack_canonical(pr
, status
);
848 float32
__attribute__((flatten
)) float32_mul(float32 a
, float32 b
,
849 float_status
*status
)
851 FloatParts pa
= float32_unpack_canonical(a
, status
);
852 FloatParts pb
= float32_unpack_canonical(b
, status
);
853 FloatParts pr
= mul_floats(pa
, pb
, status
);
855 return float32_round_pack_canonical(pr
, status
);
858 float64
__attribute__((flatten
)) float64_mul(float64 a
, float64 b
,
859 float_status
*status
)
861 FloatParts pa
= float64_unpack_canonical(a
, status
);
862 FloatParts pb
= float64_unpack_canonical(b
, status
);
863 FloatParts pr
= mul_floats(pa
, pb
, status
);
865 return float64_round_pack_canonical(pr
, status
);
869 * Returns the result of multiplying the floating-point values `a' and
870 * `b' then adding 'c', with no intermediate rounding step after the
871 * multiplication. The operation is performed according to the
872 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
873 * The flags argument allows the caller to select negation of the
874 * addend, the intermediate product, or the final result. (The
875 * difference between this and having the caller do a separate
876 * negation is that negating externally will flip the sign bit on
880 static FloatParts
muladd_floats(FloatParts a
, FloatParts b
, FloatParts c
,
881 int flags
, float_status
*s
)
883 bool inf_zero
= ((1 << a
.cls
) | (1 << b
.cls
)) ==
884 ((1 << float_class_inf
) | (1 << float_class_zero
));
886 bool sign_flip
= flags
& float_muladd_negate_result
;
891 /* It is implementation-defined whether the cases of (0,inf,qnan)
892 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
893 * they return if they do), so we have to hand this information
894 * off to the target-specific pick-a-NaN routine.
896 if (is_nan(a
.cls
) || is_nan(b
.cls
) || is_nan(c
.cls
)) {
897 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
901 s
->float_exception_flags
|= float_flag_invalid
;
902 return parts_default_nan(s
);
905 if (flags
& float_muladd_negate_c
) {
909 p_sign
= a
.sign
^ b
.sign
;
911 if (flags
& float_muladd_negate_product
) {
915 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_inf
) {
916 p_class
= float_class_inf
;
917 } else if (a
.cls
== float_class_zero
|| b
.cls
== float_class_zero
) {
918 p_class
= float_class_zero
;
920 p_class
= float_class_normal
;
923 if (c
.cls
== float_class_inf
) {
924 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
925 s
->float_exception_flags
|= float_flag_invalid
;
926 return parts_default_nan(s
);
928 a
.cls
= float_class_inf
;
929 a
.sign
= c
.sign
^ sign_flip
;
934 if (p_class
== float_class_inf
) {
935 a
.cls
= float_class_inf
;
936 a
.sign
= p_sign
^ sign_flip
;
940 if (p_class
== float_class_zero
) {
941 if (c
.cls
== float_class_zero
) {
942 if (p_sign
!= c
.sign
) {
943 p_sign
= s
->float_rounding_mode
== float_round_down
;
946 } else if (flags
& float_muladd_halve_result
) {
953 /* a & b should be normals now... */
954 assert(a
.cls
== float_class_normal
&&
955 b
.cls
== float_class_normal
);
957 p_exp
= a
.exp
+ b
.exp
;
959 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
962 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
963 /* binary point now at bit 124 */
965 /* check for overflow */
966 if (hi
& (1ULL << (DECOMPOSED_BINARY_POINT
* 2 + 1 - 64))) {
967 shift128RightJamming(hi
, lo
, 1, &hi
, &lo
);
972 if (c
.cls
== float_class_zero
) {
973 /* move binary point back to 62 */
974 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
976 int exp_diff
= p_exp
- c
.exp
;
977 if (p_sign
== c
.sign
) {
980 shift128RightJamming(hi
, lo
,
981 DECOMPOSED_BINARY_POINT
- exp_diff
,
987 /* shift c to the same binary point as the product (124) */
990 shift128RightJamming(c_hi
, c_lo
,
993 add128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
994 /* move binary point back to 62 */
995 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
998 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
999 shift64RightJamming(lo
, 1, &lo
);
1005 uint64_t c_hi
, c_lo
;
1006 /* make C binary point match product at bit 124 */
1010 if (exp_diff
<= 0) {
1011 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1014 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1015 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1017 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1022 shift128RightJamming(c_hi
, c_lo
,
1025 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1028 if (hi
== 0 && lo
== 0) {
1029 a
.cls
= float_class_zero
;
1030 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1031 a
.sign
^= sign_flip
;
1038 shift
= clz64(lo
) + 64;
1040 /* Normalizing to a binary point of 124 is the
1041 correct adjust for the exponent. However since we're
1042 shifting, we might as well put the binary point back
1043 at 62 where we really want it. Therefore shift as
1044 if we're leaving 1 bit at the top of the word, but
1045 adjust the exponent as if we're leaving 3 bits. */
1048 lo
= lo
<< (shift
- 64);
1050 hi
= (hi
<< shift
) | (lo
>> (64 - shift
));
1051 lo
= hi
| ((lo
<< shift
) != 0);
1058 if (flags
& float_muladd_halve_result
) {
1062 /* finally prepare our result */
1063 a
.cls
= float_class_normal
;
1064 a
.sign
= p_sign
^ sign_flip
;
1071 float16
__attribute__((flatten
)) float16_muladd(float16 a
, float16 b
, float16 c
,
1072 int flags
, float_status
*status
)
1074 FloatParts pa
= float16_unpack_canonical(a
, status
);
1075 FloatParts pb
= float16_unpack_canonical(b
, status
);
1076 FloatParts pc
= float16_unpack_canonical(c
, status
);
1077 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1079 return float16_round_pack_canonical(pr
, status
);
1082 float32
__attribute__((flatten
)) float32_muladd(float32 a
, float32 b
, float32 c
,
1083 int flags
, float_status
*status
)
1085 FloatParts pa
= float32_unpack_canonical(a
, status
);
1086 FloatParts pb
= float32_unpack_canonical(b
, status
);
1087 FloatParts pc
= float32_unpack_canonical(c
, status
);
1088 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1090 return float32_round_pack_canonical(pr
, status
);
1093 float64
__attribute__((flatten
)) float64_muladd(float64 a
, float64 b
, float64 c
,
1094 int flags
, float_status
*status
)
1096 FloatParts pa
= float64_unpack_canonical(a
, status
);
1097 FloatParts pb
= float64_unpack_canonical(b
, status
);
1098 FloatParts pc
= float64_unpack_canonical(c
, status
);
1099 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1101 return float64_round_pack_canonical(pr
, status
);
1105 * Returns the result of dividing the floating-point value `a' by the
1106 * corresponding value `b'. The operation is performed according to
1107 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 static FloatParts
div_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1112 bool sign
= a
.sign
^ b
.sign
;
1114 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1115 uint64_t n0
, n1
, q
, r
;
1116 int exp
= a
.exp
- b
.exp
;
1119 * We want a 2*N / N-bit division to produce exactly an N-bit
1120 * result, so that we do not lose any precision and so that we
1121 * do not have to renormalize afterward. If A.frac < B.frac,
1122 * then division would produce an (N-1)-bit result; shift A left
1123 * by one to produce the an N-bit result, and decrement the
1124 * exponent to match.
1126 * The udiv_qrnnd algorithm that we're using requires normalization,
1127 * i.e. the msb of the denominator must be set. Since we know that
1128 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1129 * by one (more), and the remainder must be shifted right by one.
1131 if (a
.frac
< b
.frac
) {
1133 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 2, &n1
, &n0
);
1135 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1137 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
<< 1);
1140 * Set lsb if there is a remainder, to set inexact.
1141 * As mentioned above, to find the actual value of the remainder we
1142 * would need to shift right, but (1) we are only concerned about
1143 * non-zero-ness, and (2) the remainder will always be even because
1144 * both inputs to the division primitive are even.
1146 a
.frac
= q
| (r
!= 0);
1151 /* handle all the NaN cases */
1152 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1153 return pick_nan(a
, b
, s
);
1155 /* 0/0 or Inf/Inf */
1158 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1159 s
->float_exception_flags
|= float_flag_invalid
;
1160 return parts_default_nan(s
);
1162 /* Inf / x or 0 / x */
1163 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1168 if (b
.cls
== float_class_zero
) {
1169 s
->float_exception_flags
|= float_flag_divbyzero
;
1170 a
.cls
= float_class_inf
;
1175 if (b
.cls
== float_class_inf
) {
1176 a
.cls
= float_class_zero
;
1180 g_assert_not_reached();
1183 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1185 FloatParts pa
= float16_unpack_canonical(a
, status
);
1186 FloatParts pb
= float16_unpack_canonical(b
, status
);
1187 FloatParts pr
= div_floats(pa
, pb
, status
);
1189 return float16_round_pack_canonical(pr
, status
);
1192 float32
float32_div(float32 a
, float32 b
, float_status
*status
)
1194 FloatParts pa
= float32_unpack_canonical(a
, status
);
1195 FloatParts pb
= float32_unpack_canonical(b
, status
);
1196 FloatParts pr
= div_floats(pa
, pb
, status
);
1198 return float32_round_pack_canonical(pr
, status
);
1201 float64
float64_div(float64 a
, float64 b
, float_status
*status
)
1203 FloatParts pa
= float64_unpack_canonical(a
, status
);
1204 FloatParts pb
= float64_unpack_canonical(b
, status
);
1205 FloatParts pr
= div_floats(pa
, pb
, status
);
1207 return float64_round_pack_canonical(pr
, status
);
1211 * Float to Float conversions
1213 * Returns the result of converting one float format to another. The
1214 * conversion is performed according to the IEC/IEEE Standard for
1215 * Binary Floating-Point Arithmetic.
1217 * The float_to_float helper only needs to take care of raising
1218 * invalid exceptions and handling the conversion on NaNs.
1221 static FloatParts
float_to_float(FloatParts a
, const FloatFmt
*dstf
,
1224 if (dstf
->arm_althp
) {
1226 case float_class_qnan
:
1227 case float_class_snan
:
1228 /* There is no NaN in the destination format. Raise Invalid
1229 * and return a zero with the sign of the input NaN.
1231 s
->float_exception_flags
|= float_flag_invalid
;
1232 a
.cls
= float_class_zero
;
1237 case float_class_inf
:
1238 /* There is no Inf in the destination format. Raise Invalid
1239 * and return the maximum normal with the correct sign.
1241 s
->float_exception_flags
|= float_flag_invalid
;
1242 a
.cls
= float_class_normal
;
1243 a
.exp
= dstf
->exp_max
;
1244 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1250 } else if (is_nan(a
.cls
)) {
1251 if (is_snan(a
.cls
)) {
1252 s
->float_exception_flags
|= float_flag_invalid
;
1253 a
= parts_silence_nan(a
, s
);
1255 if (s
->default_nan_mode
) {
1256 return parts_default_nan(s
);
1262 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1264 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1265 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1266 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1267 return float32_round_pack_canonical(pr
, s
);
1270 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
1272 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1273 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1274 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1275 return float64_round_pack_canonical(pr
, s
);
1278 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
1280 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1281 FloatParts p
= float32_unpack_canonical(a
, s
);
1282 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1283 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1286 float64
float32_to_float64(float32 a
, float_status
*s
)
1288 FloatParts p
= float32_unpack_canonical(a
, s
);
1289 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1290 return float64_round_pack_canonical(pr
, s
);
1293 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
1295 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1296 FloatParts p
= float64_unpack_canonical(a
, s
);
1297 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1298 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1301 float32
float64_to_float32(float64 a
, float_status
*s
)
1303 FloatParts p
= float64_unpack_canonical(a
, s
);
1304 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1305 return float32_round_pack_canonical(pr
, s
);
1309 * Rounds the floating-point value `a' to an integer, and returns the
1310 * result as a floating-point value. The operation is performed
1311 * according to the IEC/IEEE Standard for Binary Floating-Point
1315 static FloatParts
round_to_int(FloatParts a
, int rmode
,
1316 int scale
, float_status
*s
)
1319 case float_class_qnan
:
1320 case float_class_snan
:
1321 return return_nan(a
, s
);
1323 case float_class_zero
:
1324 case float_class_inf
:
1325 /* already "integral" */
1328 case float_class_normal
:
1329 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
1332 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
1333 /* already integral */
1338 /* all fractional */
1339 s
->float_exception_flags
|= float_flag_inexact
;
1341 case float_round_nearest_even
:
1342 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
1344 case float_round_ties_away
:
1345 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
1347 case float_round_to_zero
:
1350 case float_round_up
:
1353 case float_round_down
:
1357 g_assert_not_reached();
1361 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
1364 a
.cls
= float_class_zero
;
1367 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
1368 uint64_t frac_lsbm1
= frac_lsb
>> 1;
1369 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
1370 uint64_t rnd_mask
= rnd_even_mask
>> 1;
1374 case float_round_nearest_even
:
1375 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
1377 case float_round_ties_away
:
1380 case float_round_to_zero
:
1383 case float_round_up
:
1384 inc
= a
.sign
? 0 : rnd_mask
;
1386 case float_round_down
:
1387 inc
= a
.sign
? rnd_mask
: 0;
1390 g_assert_not_reached();
1393 if (a
.frac
& rnd_mask
) {
1394 s
->float_exception_flags
|= float_flag_inexact
;
1396 a
.frac
&= ~rnd_mask
;
1397 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
1405 g_assert_not_reached();
1410 float16
float16_round_to_int(float16 a
, float_status
*s
)
1412 FloatParts pa
= float16_unpack_canonical(a
, s
);
1413 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
1414 return float16_round_pack_canonical(pr
, s
);
1417 float32
float32_round_to_int(float32 a
, float_status
*s
)
1419 FloatParts pa
= float32_unpack_canonical(a
, s
);
1420 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
1421 return float32_round_pack_canonical(pr
, s
);
1424 float64
float64_round_to_int(float64 a
, float_status
*s
)
1426 FloatParts pa
= float64_unpack_canonical(a
, s
);
1427 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
1428 return float64_round_pack_canonical(pr
, s
);
1432 * Returns the result of converting the floating-point value `a' to
1433 * the two's complement integer format. The conversion is performed
1434 * according to the IEC/IEEE Standard for Binary Floating-Point
1435 * Arithmetic---which means in particular that the conversion is
1436 * rounded according to the current rounding mode. If `a' is a NaN,
1437 * the largest positive integer is returned. Otherwise, if the
1438 * conversion overflows, the largest integer with the same sign as `a'
1442 static int64_t round_to_int_and_pack(FloatParts in
, int rmode
, int scale
,
1443 int64_t min
, int64_t max
,
1447 int orig_flags
= get_float_exception_flags(s
);
1448 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
1451 case float_class_snan
:
1452 case float_class_qnan
:
1453 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1455 case float_class_inf
:
1456 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1457 return p
.sign
? min
: max
;
1458 case float_class_zero
:
1460 case float_class_normal
:
1461 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
1462 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
1463 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
1464 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
1469 if (r
<= -(uint64_t) min
) {
1472 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1479 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1484 g_assert_not_reached();
1488 int16_t float16_to_int16_scalbn(float16 a
, int rmode
, int scale
,
1491 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
1492 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
1495 int32_t float16_to_int32_scalbn(float16 a
, int rmode
, int scale
,
1498 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
1499 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
1502 int64_t float16_to_int64_scalbn(float16 a
, int rmode
, int scale
,
1505 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
1506 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
1509 int16_t float32_to_int16_scalbn(float32 a
, int rmode
, int scale
,
1512 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
1513 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
1516 int32_t float32_to_int32_scalbn(float32 a
, int rmode
, int scale
,
1519 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
1520 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
1523 int64_t float32_to_int64_scalbn(float32 a
, int rmode
, int scale
,
1526 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
1527 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
1530 int16_t float64_to_int16_scalbn(float64 a
, int rmode
, int scale
,
1533 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
1534 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
1537 int32_t float64_to_int32_scalbn(float64 a
, int rmode
, int scale
,
1540 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
1541 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
1544 int64_t float64_to_int64_scalbn(float64 a
, int rmode
, int scale
,
1547 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
1548 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
1551 int16_t float16_to_int16(float16 a
, float_status
*s
)
1553 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1556 int32_t float16_to_int32(float16 a
, float_status
*s
)
1558 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1561 int64_t float16_to_int64(float16 a
, float_status
*s
)
1563 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1566 int16_t float32_to_int16(float32 a
, float_status
*s
)
1568 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1571 int32_t float32_to_int32(float32 a
, float_status
*s
)
1573 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1576 int64_t float32_to_int64(float32 a
, float_status
*s
)
1578 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1581 int16_t float64_to_int16(float64 a
, float_status
*s
)
1583 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1586 int32_t float64_to_int32(float64 a
, float_status
*s
)
1588 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1591 int64_t float64_to_int64(float64 a
, float_status
*s
)
1593 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1596 int16_t float16_to_int16_round_to_zero(float16 a
, float_status
*s
)
1598 return float16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
1601 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
1603 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
1606 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
1608 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
1611 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
1613 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
1616 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
1618 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
1621 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
1623 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
1626 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
1628 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
1631 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
1633 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
1636 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
1638 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
1642 * Returns the result of converting the floating-point value `a' to
1643 * the unsigned integer format. The conversion is performed according
1644 * to the IEC/IEEE Standard for Binary Floating-Point
1645 * Arithmetic---which means in particular that the conversion is
1646 * rounded according to the current rounding mode. If `a' is a NaN,
1647 * the largest unsigned integer is returned. Otherwise, if the
1648 * conversion overflows, the largest unsigned integer is returned. If
1649 * the 'a' is negative, the result is rounded and zero is returned;
1650 * values that do not round to zero will raise the inexact exception
1654 static uint64_t round_to_uint_and_pack(FloatParts in
, int rmode
, int scale
,
1655 uint64_t max
, float_status
*s
)
1657 int orig_flags
= get_float_exception_flags(s
);
1658 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
1662 case float_class_snan
:
1663 case float_class_qnan
:
1664 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1666 case float_class_inf
:
1667 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1668 return p
.sign
? 0 : max
;
1669 case float_class_zero
:
1671 case float_class_normal
:
1673 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1677 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
1678 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
1679 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
1680 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
1682 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1686 /* For uint64 this will never trip, but if p.exp is too large
1687 * to shift a decomposed fraction we shall have exited via the
1691 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1696 g_assert_not_reached();
1700 uint16_t float16_to_uint16_scalbn(float16 a
, int rmode
, int scale
,
1703 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
1704 rmode
, scale
, UINT16_MAX
, s
);
1707 uint32_t float16_to_uint32_scalbn(float16 a
, int rmode
, int scale
,
1710 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
1711 rmode
, scale
, UINT32_MAX
, s
);
1714 uint64_t float16_to_uint64_scalbn(float16 a
, int rmode
, int scale
,
1717 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
1718 rmode
, scale
, UINT64_MAX
, s
);
1721 uint16_t float32_to_uint16_scalbn(float32 a
, int rmode
, int scale
,
1724 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
1725 rmode
, scale
, UINT16_MAX
, s
);
1728 uint32_t float32_to_uint32_scalbn(float32 a
, int rmode
, int scale
,
1731 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
1732 rmode
, scale
, UINT32_MAX
, s
);
1735 uint64_t float32_to_uint64_scalbn(float32 a
, int rmode
, int scale
,
1738 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
1739 rmode
, scale
, UINT64_MAX
, s
);
1742 uint16_t float64_to_uint16_scalbn(float64 a
, int rmode
, int scale
,
1745 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
1746 rmode
, scale
, UINT16_MAX
, s
);
1749 uint32_t float64_to_uint32_scalbn(float64 a
, int rmode
, int scale
,
1752 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
1753 rmode
, scale
, UINT32_MAX
, s
);
1756 uint64_t float64_to_uint64_scalbn(float64 a
, int rmode
, int scale
,
1759 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
1760 rmode
, scale
, UINT64_MAX
, s
);
1763 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
1765 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1768 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
1770 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1773 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
1775 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1778 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
1780 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1783 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
1785 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1788 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
1790 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1793 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
1795 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1798 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
1800 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1803 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
1805 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1808 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
1810 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
1813 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
1815 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
1818 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
1820 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
1823 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
1825 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
1828 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
1830 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
1833 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
1835 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
1838 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
1840 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
1843 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
1845 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
1848 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
1850 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
1854 * Integer to float conversions
1856 * Returns the result of converting the two's complement integer `a'
1857 * to the floating-point format. The conversion is performed according
1858 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1861 static FloatParts
int_to_float(int64_t a
, int scale
, float_status
*status
)
1863 FloatParts r
= { .sign
= false };
1866 r
.cls
= float_class_zero
;
1871 r
.cls
= float_class_normal
;
1876 shift
= clz64(f
) - 1;
1877 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
1879 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
1880 r
.frac
= (shift
< 0 ? DECOMPOSED_IMPLICIT_BIT
: f
<< shift
);
1886 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
1888 FloatParts pa
= int_to_float(a
, scale
, status
);
1889 return float16_round_pack_canonical(pa
, status
);
1892 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
1894 return int64_to_float16_scalbn(a
, scale
, status
);
1897 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
1899 return int64_to_float16_scalbn(a
, scale
, status
);
1902 float16
int64_to_float16(int64_t a
, float_status
*status
)
1904 return int64_to_float16_scalbn(a
, 0, status
);
1907 float16
int32_to_float16(int32_t a
, float_status
*status
)
1909 return int64_to_float16_scalbn(a
, 0, status
);
1912 float16
int16_to_float16(int16_t a
, float_status
*status
)
1914 return int64_to_float16_scalbn(a
, 0, status
);
1917 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
1919 FloatParts pa
= int_to_float(a
, scale
, status
);
1920 return float32_round_pack_canonical(pa
, status
);
1923 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
1925 return int64_to_float32_scalbn(a
, scale
, status
);
1928 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
1930 return int64_to_float32_scalbn(a
, scale
, status
);
1933 float32
int64_to_float32(int64_t a
, float_status
*status
)
1935 return int64_to_float32_scalbn(a
, 0, status
);
1938 float32
int32_to_float32(int32_t a
, float_status
*status
)
1940 return int64_to_float32_scalbn(a
, 0, status
);
1943 float32
int16_to_float32(int16_t a
, float_status
*status
)
1945 return int64_to_float32_scalbn(a
, 0, status
);
1948 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
1950 FloatParts pa
= int_to_float(a
, scale
, status
);
1951 return float64_round_pack_canonical(pa
, status
);
1954 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
1956 return int64_to_float64_scalbn(a
, scale
, status
);
1959 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
1961 return int64_to_float64_scalbn(a
, scale
, status
);
1964 float64
int64_to_float64(int64_t a
, float_status
*status
)
1966 return int64_to_float64_scalbn(a
, 0, status
);
1969 float64
int32_to_float64(int32_t a
, float_status
*status
)
1971 return int64_to_float64_scalbn(a
, 0, status
);
1974 float64
int16_to_float64(int16_t a
, float_status
*status
)
1976 return int64_to_float64_scalbn(a
, 0, status
);
1981 * Unsigned Integer to float conversions
1983 * Returns the result of converting the unsigned integer `a' to the
1984 * floating-point format. The conversion is performed according to the
1985 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1988 static FloatParts
uint_to_float(uint64_t a
, int scale
, float_status
*status
)
1990 FloatParts r
= { .sign
= false };
1993 r
.cls
= float_class_zero
;
1995 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
1996 r
.cls
= float_class_normal
;
1997 if ((int64_t)a
< 0) {
1998 r
.exp
= DECOMPOSED_BINARY_POINT
+ 1 + scale
;
1999 shift64RightJamming(a
, 1, &a
);
2002 int shift
= clz64(a
) - 1;
2003 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2004 r
.frac
= a
<< shift
;
2011 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
2013 FloatParts pa
= uint_to_float(a
, scale
, status
);
2014 return float16_round_pack_canonical(pa
, status
);
2017 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
2019 return uint64_to_float16_scalbn(a
, scale
, status
);
2022 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
2024 return uint64_to_float16_scalbn(a
, scale
, status
);
2027 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
2029 return uint64_to_float16_scalbn(a
, 0, status
);
2032 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
2034 return uint64_to_float16_scalbn(a
, 0, status
);
2037 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
2039 return uint64_to_float16_scalbn(a
, 0, status
);
2042 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
2044 FloatParts pa
= uint_to_float(a
, scale
, status
);
2045 return float32_round_pack_canonical(pa
, status
);
2048 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
2050 return uint64_to_float32_scalbn(a
, scale
, status
);
2053 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
2055 return uint64_to_float32_scalbn(a
, scale
, status
);
2058 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
2060 return uint64_to_float32_scalbn(a
, 0, status
);
2063 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
2065 return uint64_to_float32_scalbn(a
, 0, status
);
2068 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
2070 return uint64_to_float32_scalbn(a
, 0, status
);
2073 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
2075 FloatParts pa
= uint_to_float(a
, scale
, status
);
2076 return float64_round_pack_canonical(pa
, status
);
2079 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
2081 return uint64_to_float64_scalbn(a
, scale
, status
);
2084 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
2086 return uint64_to_float64_scalbn(a
, scale
, status
);
2089 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
2091 return uint64_to_float64_scalbn(a
, 0, status
);
2094 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
2096 return uint64_to_float64_scalbn(a
, 0, status
);
2099 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
2101 return uint64_to_float64_scalbn(a
, 0, status
);
2105 /* min() and max() functions. These can't be implemented as
2106 * 'compare and pick one input' because that would mishandle
2107 * NaNs and +0 vs -0.
2109 * minnum() and maxnum() functions. These are similar to the min()
2110 * and max() functions but if one of the arguments is a QNaN and
2111 * the other is numerical then the numerical argument is returned.
2112 * SNaNs will get quietened before being returned.
2113 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2114 * and maxNum() operations. min() and max() are the typical min/max
2115 * semantics provided by many CPUs which predate that specification.
2117 * minnummag() and maxnummag() functions correspond to minNumMag()
2118 * and minNumMag() from the IEEE-754 2008.
2120 static FloatParts
minmax_floats(FloatParts a
, FloatParts b
, bool ismin
,
2121 bool ieee
, bool ismag
, float_status
*s
)
2123 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
2125 /* Takes two floating-point values `a' and `b', one of
2126 * which is a NaN, and returns the appropriate NaN
2127 * result. If either `a' or `b' is a signaling NaN,
2128 * the invalid exception is raised.
2130 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
2131 return pick_nan(a
, b
, s
);
2132 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
2134 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
2138 return pick_nan(a
, b
, s
);
2143 case float_class_normal
:
2146 case float_class_inf
:
2149 case float_class_zero
:
2153 g_assert_not_reached();
2157 case float_class_normal
:
2160 case float_class_inf
:
2163 case float_class_zero
:
2167 g_assert_not_reached();
2171 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
2172 bool a_less
= a_exp
< b_exp
;
2173 if (a_exp
== b_exp
) {
2174 a_less
= a
.frac
< b
.frac
;
2176 return a_less
^ ismin
? b
: a
;
2179 if (a
.sign
== b
.sign
) {
2180 bool a_less
= a_exp
< b_exp
;
2181 if (a_exp
== b_exp
) {
2182 a_less
= a
.frac
< b
.frac
;
2184 return a
.sign
^ a_less
^ ismin
? b
: a
;
2186 return a
.sign
^ ismin
? b
: a
;
2191 #define MINMAX(sz, name, ismin, isiee, ismag) \
2192 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
2195 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2196 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2197 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
2199 return float ## sz ## _round_pack_canonical(pr, s); \
2202 MINMAX(16, min
, true, false, false)
2203 MINMAX(16, minnum
, true, true, false)
2204 MINMAX(16, minnummag
, true, true, true)
2205 MINMAX(16, max
, false, false, false)
2206 MINMAX(16, maxnum
, false, true, false)
2207 MINMAX(16, maxnummag
, false, true, true)
2209 MINMAX(32, min
, true, false, false)
2210 MINMAX(32, minnum
, true, true, false)
2211 MINMAX(32, minnummag
, true, true, true)
2212 MINMAX(32, max
, false, false, false)
2213 MINMAX(32, maxnum
, false, true, false)
2214 MINMAX(32, maxnummag
, false, true, true)
2216 MINMAX(64, min
, true, false, false)
2217 MINMAX(64, minnum
, true, true, false)
2218 MINMAX(64, minnummag
, true, true, true)
2219 MINMAX(64, max
, false, false, false)
2220 MINMAX(64, maxnum
, false, true, false)
2221 MINMAX(64, maxnummag
, false, true, true)
2225 /* Floating point compare */
2226 static int compare_floats(FloatParts a
, FloatParts b
, bool is_quiet
,
2229 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
2231 a
.cls
== float_class_snan
||
2232 b
.cls
== float_class_snan
) {
2233 s
->float_exception_flags
|= float_flag_invalid
;
2235 return float_relation_unordered
;
2238 if (a
.cls
== float_class_zero
) {
2239 if (b
.cls
== float_class_zero
) {
2240 return float_relation_equal
;
2242 return b
.sign
? float_relation_greater
: float_relation_less
;
2243 } else if (b
.cls
== float_class_zero
) {
2244 return a
.sign
? float_relation_less
: float_relation_greater
;
2247 /* The only really important thing about infinity is its sign. If
2248 * both are infinities the sign marks the smallest of the two.
2250 if (a
.cls
== float_class_inf
) {
2251 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
2252 return float_relation_equal
;
2254 return a
.sign
? float_relation_less
: float_relation_greater
;
2255 } else if (b
.cls
== float_class_inf
) {
2256 return b
.sign
? float_relation_greater
: float_relation_less
;
2259 if (a
.sign
!= b
.sign
) {
2260 return a
.sign
? float_relation_less
: float_relation_greater
;
2263 if (a
.exp
== b
.exp
) {
2264 if (a
.frac
== b
.frac
) {
2265 return float_relation_equal
;
2268 return a
.frac
> b
.frac
?
2269 float_relation_less
: float_relation_greater
;
2271 return a
.frac
> b
.frac
?
2272 float_relation_greater
: float_relation_less
;
2276 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
2278 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
2283 #define COMPARE(sz) \
2284 int float ## sz ## _compare(float ## sz a, float ## sz b, \
2287 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2288 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2289 return compare_floats(pa, pb, false, s); \
2291 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
2294 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2295 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2296 return compare_floats(pa, pb, true, s); \
2305 /* Multiply A by 2 raised to the power N. */
2306 static FloatParts
scalbn_decomposed(FloatParts a
, int n
, float_status
*s
)
2308 if (unlikely(is_nan(a
.cls
))) {
2309 return return_nan(a
, s
);
2311 if (a
.cls
== float_class_normal
) {
2312 /* The largest float type (even though not supported by FloatParts)
2313 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
2314 * still allows rounding to infinity, without allowing overflow
2315 * within the int32_t that backs FloatParts.exp.
2317 n
= MIN(MAX(n
, -0x10000), 0x10000);
2323 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
2325 FloatParts pa
= float16_unpack_canonical(a
, status
);
2326 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
2327 return float16_round_pack_canonical(pr
, status
);
2330 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
2332 FloatParts pa
= float32_unpack_canonical(a
, status
);
2333 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
2334 return float32_round_pack_canonical(pr
, status
);
2337 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
2339 FloatParts pa
= float64_unpack_canonical(a
, status
);
2340 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
2341 return float64_round_pack_canonical(pr
, status
);
2347 * The old softfloat code did an approximation step before zeroing in
2348 * on the final result. However for simpleness we just compute the
2349 * square root by iterating down from the implicit bit to enough extra
2350 * bits to ensure we get a correctly rounded result.
2352 * This does mean however the calculation is slower than before,
2353 * especially for 64 bit floats.
2356 static FloatParts
sqrt_float(FloatParts a
, float_status
*s
, const FloatFmt
*p
)
2358 uint64_t a_frac
, r_frac
, s_frac
;
2361 if (is_nan(a
.cls
)) {
2362 return return_nan(a
, s
);
2364 if (a
.cls
== float_class_zero
) {
2365 return a
; /* sqrt(+-0) = +-0 */
2368 s
->float_exception_flags
|= float_flag_invalid
;
2369 return parts_default_nan(s
);
2371 if (a
.cls
== float_class_inf
) {
2372 return a
; /* sqrt(+inf) = +inf */
2375 assert(a
.cls
== float_class_normal
);
2377 /* We need two overflow bits at the top. Adding room for that is a
2378 * right shift. If the exponent is odd, we can discard the low bit
2379 * by multiplying the fraction by 2; that's a left shift. Combine
2380 * those and we shift right if the exponent is even.
2388 /* Bit-by-bit computation of sqrt. */
2392 /* Iterate from implicit bit down to the 3 extra bits to compute a
2393 * properly rounded result. Remember we've inserted one more bit
2394 * at the top, so these positions are one less.
2396 bit
= DECOMPOSED_BINARY_POINT
- 1;
2397 last_bit
= MAX(p
->frac_shift
- 4, 0);
2399 uint64_t q
= 1ULL << bit
;
2400 uint64_t t_frac
= s_frac
+ q
;
2401 if (t_frac
<= a_frac
) {
2402 s_frac
= t_frac
+ q
;
2407 } while (--bit
>= last_bit
);
2409 /* Undo the right shift done above. If there is any remaining
2410 * fraction, the result is inexact. Set the sticky bit.
2412 a
.frac
= (r_frac
<< 1) + (a_frac
!= 0);
2417 float16
__attribute__((flatten
)) float16_sqrt(float16 a
, float_status
*status
)
2419 FloatParts pa
= float16_unpack_canonical(a
, status
);
2420 FloatParts pr
= sqrt_float(pa
, status
, &float16_params
);
2421 return float16_round_pack_canonical(pr
, status
);
2424 float32
__attribute__((flatten
)) float32_sqrt(float32 a
, float_status
*status
)
2426 FloatParts pa
= float32_unpack_canonical(a
, status
);
2427 FloatParts pr
= sqrt_float(pa
, status
, &float32_params
);
2428 return float32_round_pack_canonical(pr
, status
);
2431 float64
__attribute__((flatten
)) float64_sqrt(float64 a
, float_status
*status
)
2433 FloatParts pa
= float64_unpack_canonical(a
, status
);
2434 FloatParts pr
= sqrt_float(pa
, status
, &float64_params
);
2435 return float64_round_pack_canonical(pr
, status
);
2438 /*----------------------------------------------------------------------------
2439 | The pattern for a default generated NaN.
2440 *----------------------------------------------------------------------------*/
2442 float16
float16_default_nan(float_status
*status
)
2444 FloatParts p
= parts_default_nan(status
);
2445 p
.frac
>>= float16_params
.frac_shift
;
2446 return float16_pack_raw(p
);
2449 float32
float32_default_nan(float_status
*status
)
2451 FloatParts p
= parts_default_nan(status
);
2452 p
.frac
>>= float32_params
.frac_shift
;
2453 return float32_pack_raw(p
);
2456 float64
float64_default_nan(float_status
*status
)
2458 FloatParts p
= parts_default_nan(status
);
2459 p
.frac
>>= float64_params
.frac_shift
;
2460 return float64_pack_raw(p
);
2463 float128
float128_default_nan(float_status
*status
)
2465 FloatParts p
= parts_default_nan(status
);
2468 /* Extrapolate from the choices made by parts_default_nan to fill
2469 * in the quad-floating format. If the low bit is set, assume we
2470 * want to set all non-snan bits.
2472 r
.low
= -(p
.frac
& 1);
2473 r
.high
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- 48);
2474 r
.high
|= LIT64(0x7FFF000000000000);
2475 r
.high
|= (uint64_t)p
.sign
<< 63;
2480 /*----------------------------------------------------------------------------
2481 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2482 *----------------------------------------------------------------------------*/
2484 float16
float16_silence_nan(float16 a
, float_status
*status
)
2486 FloatParts p
= float16_unpack_raw(a
);
2487 p
.frac
<<= float16_params
.frac_shift
;
2488 p
= parts_silence_nan(p
, status
);
2489 p
.frac
>>= float16_params
.frac_shift
;
2490 return float16_pack_raw(p
);
2493 float32
float32_silence_nan(float32 a
, float_status
*status
)
2495 FloatParts p
= float32_unpack_raw(a
);
2496 p
.frac
<<= float32_params
.frac_shift
;
2497 p
= parts_silence_nan(p
, status
);
2498 p
.frac
>>= float32_params
.frac_shift
;
2499 return float32_pack_raw(p
);
2502 float64
float64_silence_nan(float64 a
, float_status
*status
)
2504 FloatParts p
= float64_unpack_raw(a
);
2505 p
.frac
<<= float64_params
.frac_shift
;
2506 p
= parts_silence_nan(p
, status
);
2507 p
.frac
>>= float64_params
.frac_shift
;
2508 return float64_pack_raw(p
);
2511 /*----------------------------------------------------------------------------
2512 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2513 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2514 | input. If `zSign' is 1, the input is negated before being converted to an
2515 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2516 | is simply rounded to an integer, with the inexact exception raised if the
2517 | input cannot be represented exactly as an integer. However, if the fixed-
2518 | point input is too large, the invalid exception is raised and the largest
2519 | positive or negative integer is returned.
2520 *----------------------------------------------------------------------------*/
2522 static int32_t roundAndPackInt32(flag zSign
, uint64_t absZ
, float_status
*status
)
2524 int8_t roundingMode
;
2525 flag roundNearestEven
;
2526 int8_t roundIncrement
, roundBits
;
2529 roundingMode
= status
->float_rounding_mode
;
2530 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2531 switch (roundingMode
) {
2532 case float_round_nearest_even
:
2533 case float_round_ties_away
:
2534 roundIncrement
= 0x40;
2536 case float_round_to_zero
:
2539 case float_round_up
:
2540 roundIncrement
= zSign
? 0 : 0x7f;
2542 case float_round_down
:
2543 roundIncrement
= zSign
? 0x7f : 0;
2548 roundBits
= absZ
& 0x7F;
2549 absZ
= ( absZ
+ roundIncrement
)>>7;
2550 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
2552 if ( zSign
) z
= - z
;
2553 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
2554 float_raise(float_flag_invalid
, status
);
2555 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
2558 status
->float_exception_flags
|= float_flag_inexact
;
2564 /*----------------------------------------------------------------------------
2565 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2566 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2567 | and returns the properly rounded 64-bit integer corresponding to the input.
2568 | If `zSign' is 1, the input is negated before being converted to an integer.
2569 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2570 | the inexact exception raised if the input cannot be represented exactly as
2571 | an integer. However, if the fixed-point input is too large, the invalid
2572 | exception is raised and the largest positive or negative integer is
2574 *----------------------------------------------------------------------------*/
2576 static int64_t roundAndPackInt64(flag zSign
, uint64_t absZ0
, uint64_t absZ1
,
2577 float_status
*status
)
2579 int8_t roundingMode
;
2580 flag roundNearestEven
, increment
;
2583 roundingMode
= status
->float_rounding_mode
;
2584 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2585 switch (roundingMode
) {
2586 case float_round_nearest_even
:
2587 case float_round_ties_away
:
2588 increment
= ((int64_t) absZ1
< 0);
2590 case float_round_to_zero
:
2593 case float_round_up
:
2594 increment
= !zSign
&& absZ1
;
2596 case float_round_down
:
2597 increment
= zSign
&& absZ1
;
2604 if ( absZ0
== 0 ) goto overflow
;
2605 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
2608 if ( zSign
) z
= - z
;
2609 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
2611 float_raise(float_flag_invalid
, status
);
2613 zSign
? (int64_t) LIT64( 0x8000000000000000 )
2614 : LIT64( 0x7FFFFFFFFFFFFFFF );
2617 status
->float_exception_flags
|= float_flag_inexact
;
2623 /*----------------------------------------------------------------------------
2624 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2625 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2626 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2627 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2628 | with the inexact exception raised if the input cannot be represented exactly
2629 | as an integer. However, if the fixed-point input is too large, the invalid
2630 | exception is raised and the largest unsigned integer is returned.
2631 *----------------------------------------------------------------------------*/
2633 static int64_t roundAndPackUint64(flag zSign
, uint64_t absZ0
,
2634 uint64_t absZ1
, float_status
*status
)
2636 int8_t roundingMode
;
2637 flag roundNearestEven
, increment
;
2639 roundingMode
= status
->float_rounding_mode
;
2640 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
2641 switch (roundingMode
) {
2642 case float_round_nearest_even
:
2643 case float_round_ties_away
:
2644 increment
= ((int64_t)absZ1
< 0);
2646 case float_round_to_zero
:
2649 case float_round_up
:
2650 increment
= !zSign
&& absZ1
;
2652 case float_round_down
:
2653 increment
= zSign
&& absZ1
;
2661 float_raise(float_flag_invalid
, status
);
2662 return LIT64(0xFFFFFFFFFFFFFFFF);
2664 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
2667 if (zSign
&& absZ0
) {
2668 float_raise(float_flag_invalid
, status
);
2673 status
->float_exception_flags
|= float_flag_inexact
;
2678 /*----------------------------------------------------------------------------
2679 | If `a' is denormal and we are in flush-to-zero mode then set the
2680 | input-denormal exception and return zero. Otherwise just return the value.
2681 *----------------------------------------------------------------------------*/
2682 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
2684 if (status
->flush_inputs_to_zero
) {
2685 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
2686 float_raise(float_flag_input_denormal
, status
);
2687 return make_float32(float32_val(a
) & 0x80000000);
2693 /*----------------------------------------------------------------------------
2694 | Normalizes the subnormal single-precision floating-point value represented
2695 | by the denormalized significand `aSig'. The normalized exponent and
2696 | significand are stored at the locations pointed to by `zExpPtr' and
2697 | `zSigPtr', respectively.
2698 *----------------------------------------------------------------------------*/
2701 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
2705 shiftCount
= clz32(aSig
) - 8;
2706 *zSigPtr
= aSig
<<shiftCount
;
2707 *zExpPtr
= 1 - shiftCount
;
2711 /*----------------------------------------------------------------------------
2712 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2713 | and significand `zSig', and returns the proper single-precision floating-
2714 | point value corresponding to the abstract input. Ordinarily, the abstract
2715 | value is simply rounded and packed into the single-precision format, with
2716 | the inexact exception raised if the abstract input cannot be represented
2717 | exactly. However, if the abstract value is too large, the overflow and
2718 | inexact exceptions are raised and an infinity or maximal finite value is
2719 | returned. If the abstract value is too small, the input value is rounded to
2720 | a subnormal number, and the underflow and inexact exceptions are raised if
2721 | the abstract input cannot be represented exactly as a subnormal single-
2722 | precision floating-point number.
2723 | The input significand `zSig' has its binary point between bits 30
2724 | and 29, which is 7 bits to the left of the usual location. This shifted
2725 | significand must be normalized or smaller. If `zSig' is not normalized,
2726 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2727 | and it must not require rounding. In the usual case that `zSig' is
2728 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2729 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2730 | Binary Floating-Point Arithmetic.
2731 *----------------------------------------------------------------------------*/
2733 static float32
roundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
2734 float_status
*status
)
2736 int8_t roundingMode
;
2737 flag roundNearestEven
;
2738 int8_t roundIncrement
, roundBits
;
2741 roundingMode
= status
->float_rounding_mode
;
2742 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2743 switch (roundingMode
) {
2744 case float_round_nearest_even
:
2745 case float_round_ties_away
:
2746 roundIncrement
= 0x40;
2748 case float_round_to_zero
:
2751 case float_round_up
:
2752 roundIncrement
= zSign
? 0 : 0x7f;
2754 case float_round_down
:
2755 roundIncrement
= zSign
? 0x7f : 0;
2761 roundBits
= zSig
& 0x7F;
2762 if ( 0xFD <= (uint16_t) zExp
) {
2763 if ( ( 0xFD < zExp
)
2764 || ( ( zExp
== 0xFD )
2765 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
2767 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2768 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
2771 if (status
->flush_to_zero
) {
2772 float_raise(float_flag_output_denormal
, status
);
2773 return packFloat32(zSign
, 0, 0);
2776 (status
->float_detect_tininess
2777 == float_tininess_before_rounding
)
2779 || ( zSig
+ roundIncrement
< 0x80000000 );
2780 shift32RightJamming( zSig
, - zExp
, &zSig
);
2782 roundBits
= zSig
& 0x7F;
2783 if (isTiny
&& roundBits
) {
2784 float_raise(float_flag_underflow
, status
);
2789 status
->float_exception_flags
|= float_flag_inexact
;
2791 zSig
= ( zSig
+ roundIncrement
)>>7;
2792 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
2793 if ( zSig
== 0 ) zExp
= 0;
2794 return packFloat32( zSign
, zExp
, zSig
);
2798 /*----------------------------------------------------------------------------
2799 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2800 | and significand `zSig', and returns the proper single-precision floating-
2801 | point value corresponding to the abstract input. This routine is just like
2802 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2803 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2804 | floating-point exponent.
2805 *----------------------------------------------------------------------------*/
2808 normalizeRoundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
2809 float_status
*status
)
2813 shiftCount
= clz32(zSig
) - 1;
2814 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
2819 /*----------------------------------------------------------------------------
2820 | If `a' is denormal and we are in flush-to-zero mode then set the
2821 | input-denormal exception and return zero. Otherwise just return the value.
2822 *----------------------------------------------------------------------------*/
2823 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
2825 if (status
->flush_inputs_to_zero
) {
2826 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
2827 float_raise(float_flag_input_denormal
, status
);
2828 return make_float64(float64_val(a
) & (1ULL << 63));
2834 /*----------------------------------------------------------------------------
2835 | Normalizes the subnormal double-precision floating-point value represented
2836 | by the denormalized significand `aSig'. The normalized exponent and
2837 | significand are stored at the locations pointed to by `zExpPtr' and
2838 | `zSigPtr', respectively.
2839 *----------------------------------------------------------------------------*/
2842 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
2846 shiftCount
= clz64(aSig
) - 11;
2847 *zSigPtr
= aSig
<<shiftCount
;
2848 *zExpPtr
= 1 - shiftCount
;
2852 /*----------------------------------------------------------------------------
2853 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2854 | double-precision floating-point value, returning the result. After being
2855 | shifted into the proper positions, the three fields are simply added
2856 | together to form the result. This means that any integer portion of `zSig'
2857 | will be added into the exponent. Since a properly normalized significand
2858 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2859 | than the desired result exponent whenever `zSig' is a complete, normalized
2861 *----------------------------------------------------------------------------*/
2863 static inline float64
packFloat64(flag zSign
, int zExp
, uint64_t zSig
)
2866 return make_float64(
2867 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
2871 /*----------------------------------------------------------------------------
2872 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2873 | and significand `zSig', and returns the proper double-precision floating-
2874 | point value corresponding to the abstract input. Ordinarily, the abstract
2875 | value is simply rounded and packed into the double-precision format, with
2876 | the inexact exception raised if the abstract input cannot be represented
2877 | exactly. However, if the abstract value is too large, the overflow and
2878 | inexact exceptions are raised and an infinity or maximal finite value is
2879 | returned. If the abstract value is too small, the input value is rounded to
2880 | a subnormal number, and the underflow and inexact exceptions are raised if
2881 | the abstract input cannot be represented exactly as a subnormal double-
2882 | precision floating-point number.
2883 | The input significand `zSig' has its binary point between bits 62
2884 | and 61, which is 10 bits to the left of the usual location. This shifted
2885 | significand must be normalized or smaller. If `zSig' is not normalized,
2886 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2887 | and it must not require rounding. In the usual case that `zSig' is
2888 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2889 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2890 | Binary Floating-Point Arithmetic.
2891 *----------------------------------------------------------------------------*/
2893 static float64
roundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
2894 float_status
*status
)
2896 int8_t roundingMode
;
2897 flag roundNearestEven
;
2898 int roundIncrement
, roundBits
;
2901 roundingMode
= status
->float_rounding_mode
;
2902 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2903 switch (roundingMode
) {
2904 case float_round_nearest_even
:
2905 case float_round_ties_away
:
2906 roundIncrement
= 0x200;
2908 case float_round_to_zero
:
2911 case float_round_up
:
2912 roundIncrement
= zSign
? 0 : 0x3ff;
2914 case float_round_down
:
2915 roundIncrement
= zSign
? 0x3ff : 0;
2917 case float_round_to_odd
:
2918 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
2923 roundBits
= zSig
& 0x3FF;
2924 if ( 0x7FD <= (uint16_t) zExp
) {
2925 if ( ( 0x7FD < zExp
)
2926 || ( ( zExp
== 0x7FD )
2927 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
2929 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
2930 roundIncrement
!= 0;
2931 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2932 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
2935 if (status
->flush_to_zero
) {
2936 float_raise(float_flag_output_denormal
, status
);
2937 return packFloat64(zSign
, 0, 0);
2940 (status
->float_detect_tininess
2941 == float_tininess_before_rounding
)
2943 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
2944 shift64RightJamming( zSig
, - zExp
, &zSig
);
2946 roundBits
= zSig
& 0x3FF;
2947 if (isTiny
&& roundBits
) {
2948 float_raise(float_flag_underflow
, status
);
2950 if (roundingMode
== float_round_to_odd
) {
2952 * For round-to-odd case, the roundIncrement depends on
2953 * zSig which just changed.
2955 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
2960 status
->float_exception_flags
|= float_flag_inexact
;
2962 zSig
= ( zSig
+ roundIncrement
)>>10;
2963 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
2964 if ( zSig
== 0 ) zExp
= 0;
2965 return packFloat64( zSign
, zExp
, zSig
);
2969 /*----------------------------------------------------------------------------
2970 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2971 | and significand `zSig', and returns the proper double-precision floating-
2972 | point value corresponding to the abstract input. This routine is just like
2973 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2974 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2975 | floating-point exponent.
2976 *----------------------------------------------------------------------------*/
2979 normalizeRoundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
2980 float_status
*status
)
2984 shiftCount
= clz64(zSig
) - 1;
2985 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
2990 /*----------------------------------------------------------------------------
2991 | Normalizes the subnormal extended double-precision floating-point value
2992 | represented by the denormalized significand `aSig'. The normalized exponent
2993 | and significand are stored at the locations pointed to by `zExpPtr' and
2994 | `zSigPtr', respectively.
2995 *----------------------------------------------------------------------------*/
2997 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
3002 shiftCount
= clz64(aSig
);
3003 *zSigPtr
= aSig
<<shiftCount
;
3004 *zExpPtr
= 1 - shiftCount
;
3007 /*----------------------------------------------------------------------------
3008 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3009 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3010 | and returns the proper extended double-precision floating-point value
3011 | corresponding to the abstract input. Ordinarily, the abstract value is
3012 | rounded and packed into the extended double-precision format, with the
3013 | inexact exception raised if the abstract input cannot be represented
3014 | exactly. However, if the abstract value is too large, the overflow and
3015 | inexact exceptions are raised and an infinity or maximal finite value is
3016 | returned. If the abstract value is too small, the input value is rounded to
3017 | a subnormal number, and the underflow and inexact exceptions are raised if
3018 | the abstract input cannot be represented exactly as a subnormal extended
3019 | double-precision floating-point number.
3020 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
3021 | number of bits as single or double precision, respectively. Otherwise, the
3022 | result is rounded to the full precision of the extended double-precision
3024 | The input significand must be normalized or smaller. If the input
3025 | significand is not normalized, `zExp' must be 0; in that case, the result
3026 | returned is a subnormal number, and it must not require rounding. The
3027 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3028 | Floating-Point Arithmetic.
3029 *----------------------------------------------------------------------------*/
3031 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, flag zSign
,
3032 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
3033 float_status
*status
)
3035 int8_t roundingMode
;
3036 flag roundNearestEven
, increment
, isTiny
;
3037 int64_t roundIncrement
, roundMask
, roundBits
;
3039 roundingMode
= status
->float_rounding_mode
;
3040 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3041 if ( roundingPrecision
== 80 ) goto precision80
;
3042 if ( roundingPrecision
== 64 ) {
3043 roundIncrement
= LIT64( 0x0000000000000400 );
3044 roundMask
= LIT64( 0x00000000000007FF );
3046 else if ( roundingPrecision
== 32 ) {
3047 roundIncrement
= LIT64( 0x0000008000000000 );
3048 roundMask
= LIT64( 0x000000FFFFFFFFFF );
3053 zSig0
|= ( zSig1
!= 0 );
3054 switch (roundingMode
) {
3055 case float_round_nearest_even
:
3056 case float_round_ties_away
:
3058 case float_round_to_zero
:
3061 case float_round_up
:
3062 roundIncrement
= zSign
? 0 : roundMask
;
3064 case float_round_down
:
3065 roundIncrement
= zSign
? roundMask
: 0;
3070 roundBits
= zSig0
& roundMask
;
3071 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
3072 if ( ( 0x7FFE < zExp
)
3073 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
3078 if (status
->flush_to_zero
) {
3079 float_raise(float_flag_output_denormal
, status
);
3080 return packFloatx80(zSign
, 0, 0);
3083 (status
->float_detect_tininess
3084 == float_tininess_before_rounding
)
3086 || ( zSig0
<= zSig0
+ roundIncrement
);
3087 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
3089 roundBits
= zSig0
& roundMask
;
3090 if (isTiny
&& roundBits
) {
3091 float_raise(float_flag_underflow
, status
);
3094 status
->float_exception_flags
|= float_flag_inexact
;
3096 zSig0
+= roundIncrement
;
3097 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
3098 roundIncrement
= roundMask
+ 1;
3099 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
3100 roundMask
|= roundIncrement
;
3102 zSig0
&= ~ roundMask
;
3103 return packFloatx80( zSign
, zExp
, zSig0
);
3107 status
->float_exception_flags
|= float_flag_inexact
;
3109 zSig0
+= roundIncrement
;
3110 if ( zSig0
< roundIncrement
) {
3112 zSig0
= LIT64( 0x8000000000000000 );
3114 roundIncrement
= roundMask
+ 1;
3115 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
3116 roundMask
|= roundIncrement
;
3118 zSig0
&= ~ roundMask
;
3119 if ( zSig0
== 0 ) zExp
= 0;
3120 return packFloatx80( zSign
, zExp
, zSig0
);
3122 switch (roundingMode
) {
3123 case float_round_nearest_even
:
3124 case float_round_ties_away
:
3125 increment
= ((int64_t)zSig1
< 0);
3127 case float_round_to_zero
:
3130 case float_round_up
:
3131 increment
= !zSign
&& zSig1
;
3133 case float_round_down
:
3134 increment
= zSign
&& zSig1
;
3139 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
3140 if ( ( 0x7FFE < zExp
)
3141 || ( ( zExp
== 0x7FFE )
3142 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
3148 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3149 if ( ( roundingMode
== float_round_to_zero
)
3150 || ( zSign
&& ( roundingMode
== float_round_up
) )
3151 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
3153 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
3155 return packFloatx80(zSign
,
3156 floatx80_infinity_high
,
3157 floatx80_infinity_low
);
3161 (status
->float_detect_tininess
3162 == float_tininess_before_rounding
)
3165 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
3166 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
3168 if (isTiny
&& zSig1
) {
3169 float_raise(float_flag_underflow
, status
);
3172 status
->float_exception_flags
|= float_flag_inexact
;
3174 switch (roundingMode
) {
3175 case float_round_nearest_even
:
3176 case float_round_ties_away
:
3177 increment
= ((int64_t)zSig1
< 0);
3179 case float_round_to_zero
:
3182 case float_round_up
:
3183 increment
= !zSign
&& zSig1
;
3185 case float_round_down
:
3186 increment
= zSign
&& zSig1
;
3194 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
3195 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
3197 return packFloatx80( zSign
, zExp
, zSig0
);
3201 status
->float_exception_flags
|= float_flag_inexact
;
3207 zSig0
= LIT64( 0x8000000000000000 );
3210 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
3214 if ( zSig0
== 0 ) zExp
= 0;
3216 return packFloatx80( zSign
, zExp
, zSig0
);
3220 /*----------------------------------------------------------------------------
3221 | Takes an abstract floating-point value having sign `zSign', exponent
3222 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3223 | and returns the proper extended double-precision floating-point value
3224 | corresponding to the abstract input. This routine is just like
3225 | `roundAndPackFloatx80' except that the input significand does not have to be
3227 *----------------------------------------------------------------------------*/
3229 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
3230 flag zSign
, int32_t zExp
,
3231 uint64_t zSig0
, uint64_t zSig1
,
3232 float_status
*status
)
3241 shiftCount
= clz64(zSig0
);
3242 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3244 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
3245 zSig0
, zSig1
, status
);
3249 /*----------------------------------------------------------------------------
3250 | Returns the least-significant 64 fraction bits of the quadruple-precision
3251 | floating-point value `a'.
3252 *----------------------------------------------------------------------------*/
3254 static inline uint64_t extractFloat128Frac1( float128 a
)
3261 /*----------------------------------------------------------------------------
3262 | Returns the most-significant 48 fraction bits of the quadruple-precision
3263 | floating-point value `a'.
3264 *----------------------------------------------------------------------------*/
3266 static inline uint64_t extractFloat128Frac0( float128 a
)
3269 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
3273 /*----------------------------------------------------------------------------
3274 | Returns the exponent bits of the quadruple-precision floating-point value
3276 *----------------------------------------------------------------------------*/
3278 static inline int32_t extractFloat128Exp( float128 a
)
3281 return ( a
.high
>>48 ) & 0x7FFF;
3285 /*----------------------------------------------------------------------------
3286 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3287 *----------------------------------------------------------------------------*/
3289 static inline flag
extractFloat128Sign( float128 a
)
3296 /*----------------------------------------------------------------------------
3297 | Normalizes the subnormal quadruple-precision floating-point value
3298 | represented by the denormalized significand formed by the concatenation of
3299 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
3300 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
3301 | significand are stored at the location pointed to by `zSig0Ptr', and the
3302 | least significant 64 bits of the normalized significand are stored at the
3303 | location pointed to by `zSig1Ptr'.
3304 *----------------------------------------------------------------------------*/
3307 normalizeFloat128Subnormal(
3318 shiftCount
= clz64(aSig1
) - 15;
3319 if ( shiftCount
< 0 ) {
3320 *zSig0Ptr
= aSig1
>>( - shiftCount
);
3321 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
3324 *zSig0Ptr
= aSig1
<<shiftCount
;
3327 *zExpPtr
= - shiftCount
- 63;
3330 shiftCount
= clz64(aSig0
) - 15;
3331 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
3332 *zExpPtr
= 1 - shiftCount
;
3337 /*----------------------------------------------------------------------------
3338 | Packs the sign `zSign', the exponent `zExp', and the significand formed
3339 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3340 | floating-point value, returning the result. After being shifted into the
3341 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3342 | added together to form the most significant 32 bits of the result. This
3343 | means that any integer portion of `zSig0' will be added into the exponent.
3344 | Since a properly normalized significand will have an integer portion equal
3345 | to 1, the `zExp' input should be 1 less than the desired result exponent
3346 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3348 *----------------------------------------------------------------------------*/
3350 static inline float128
3351 packFloat128( flag zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
3356 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
3361 /*----------------------------------------------------------------------------
3362 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3363 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3364 | and `zSig2', and returns the proper quadruple-precision floating-point value
3365 | corresponding to the abstract input. Ordinarily, the abstract value is
3366 | simply rounded and packed into the quadruple-precision format, with the
3367 | inexact exception raised if the abstract input cannot be represented
3368 | exactly. However, if the abstract value is too large, the overflow and
3369 | inexact exceptions are raised and an infinity or maximal finite value is
3370 | returned. If the abstract value is too small, the input value is rounded to
3371 | a subnormal number, and the underflow and inexact exceptions are raised if
3372 | the abstract input cannot be represented exactly as a subnormal quadruple-
3373 | precision floating-point number.
3374 | The input significand must be normalized or smaller. If the input
3375 | significand is not normalized, `zExp' must be 0; in that case, the result
3376 | returned is a subnormal number, and it must not require rounding. In the
3377 | usual case that the input significand is normalized, `zExp' must be 1 less
3378 | than the ``true'' floating-point exponent. The handling of underflow and
3379 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3380 *----------------------------------------------------------------------------*/
3382 static float128
roundAndPackFloat128(flag zSign
, int32_t zExp
,
3383 uint64_t zSig0
, uint64_t zSig1
,
3384 uint64_t zSig2
, float_status
*status
)
3386 int8_t roundingMode
;
3387 flag roundNearestEven
, increment
, isTiny
;
3389 roundingMode
= status
->float_rounding_mode
;
3390 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3391 switch (roundingMode
) {
3392 case float_round_nearest_even
:
3393 case float_round_ties_away
:
3394 increment
= ((int64_t)zSig2
< 0);
3396 case float_round_to_zero
:
3399 case float_round_up
:
3400 increment
= !zSign
&& zSig2
;
3402 case float_round_down
:
3403 increment
= zSign
&& zSig2
;
3405 case float_round_to_odd
:
3406 increment
= !(zSig1
& 0x1) && zSig2
;
3411 if ( 0x7FFD <= (uint32_t) zExp
) {
3412 if ( ( 0x7FFD < zExp
)
3413 || ( ( zExp
== 0x7FFD )
3415 LIT64( 0x0001FFFFFFFFFFFF ),
3416 LIT64( 0xFFFFFFFFFFFFFFFF ),
3423 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3424 if ( ( roundingMode
== float_round_to_zero
)
3425 || ( zSign
&& ( roundingMode
== float_round_up
) )
3426 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
3427 || (roundingMode
== float_round_to_odd
)
3433 LIT64( 0x0000FFFFFFFFFFFF ),
3434 LIT64( 0xFFFFFFFFFFFFFFFF )
3437 return packFloat128( zSign
, 0x7FFF, 0, 0 );
3440 if (status
->flush_to_zero
) {
3441 float_raise(float_flag_output_denormal
, status
);
3442 return packFloat128(zSign
, 0, 0, 0);
3445 (status
->float_detect_tininess
3446 == float_tininess_before_rounding
)
3452 LIT64( 0x0001FFFFFFFFFFFF ),
3453 LIT64( 0xFFFFFFFFFFFFFFFF )
3455 shift128ExtraRightJamming(
3456 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
3458 if (isTiny
&& zSig2
) {
3459 float_raise(float_flag_underflow
, status
);
3461 switch (roundingMode
) {
3462 case float_round_nearest_even
:
3463 case float_round_ties_away
:
3464 increment
= ((int64_t)zSig2
< 0);
3466 case float_round_to_zero
:
3469 case float_round_up
:
3470 increment
= !zSign
&& zSig2
;
3472 case float_round_down
:
3473 increment
= zSign
&& zSig2
;
3475 case float_round_to_odd
:
3476 increment
= !(zSig1
& 0x1) && zSig2
;
3484 status
->float_exception_flags
|= float_flag_inexact
;
3487 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
3488 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
3491 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
3493 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
3497 /*----------------------------------------------------------------------------
3498 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3499 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3500 | returns the proper quadruple-precision floating-point value corresponding
3501 | to the abstract input. This routine is just like `roundAndPackFloat128'
3502 | except that the input significand has fewer bits and does not have to be
3503 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3505 *----------------------------------------------------------------------------*/
3507 static float128
normalizeRoundAndPackFloat128(flag zSign
, int32_t zExp
,
3508 uint64_t zSig0
, uint64_t zSig1
,
3509 float_status
*status
)
3519 shiftCount
= clz64(zSig0
) - 15;
3520 if ( 0 <= shiftCount
) {
3522 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3525 shift128ExtraRightJamming(
3526 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
3529 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
3534 /*----------------------------------------------------------------------------
3535 | Returns the result of converting the 32-bit two's complement integer `a'
3536 | to the extended double-precision floating-point format. The conversion
3537 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3539 *----------------------------------------------------------------------------*/
3541 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
3548 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
3550 absA
= zSign
? - a
: a
;
3551 shiftCount
= clz32(absA
) + 32;
3553 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
3557 /*----------------------------------------------------------------------------
3558 | Returns the result of converting the 32-bit two's complement integer `a' to
3559 | the quadruple-precision floating-point format. The conversion is performed
3560 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3561 *----------------------------------------------------------------------------*/
3563 float128
int32_to_float128(int32_t a
, float_status
*status
)
3570 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
3572 absA
= zSign
? - a
: a
;
3573 shiftCount
= clz32(absA
) + 17;
3575 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
3579 /*----------------------------------------------------------------------------
3580 | Returns the result of converting the 64-bit two's complement integer `a'
3581 | to the extended double-precision floating-point format. The conversion
3582 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3584 *----------------------------------------------------------------------------*/
3586 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
3592 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
3594 absA
= zSign
? - a
: a
;
3595 shiftCount
= clz64(absA
);
3596 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
3600 /*----------------------------------------------------------------------------
3601 | Returns the result of converting the 64-bit two's complement integer `a' to
3602 | the quadruple-precision floating-point format. The conversion is performed
3603 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3604 *----------------------------------------------------------------------------*/
3606 float128
int64_to_float128(int64_t a
, float_status
*status
)
3612 uint64_t zSig0
, zSig1
;
3614 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
3616 absA
= zSign
? - a
: a
;
3617 shiftCount
= clz64(absA
) + 49;
3618 zExp
= 0x406E - shiftCount
;
3619 if ( 64 <= shiftCount
) {
3628 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3629 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
3633 /*----------------------------------------------------------------------------
3634 | Returns the result of converting the 64-bit unsigned integer `a'
3635 | to the quadruple-precision floating-point format. The conversion is performed
3636 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3637 *----------------------------------------------------------------------------*/
3639 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
3642 return float128_zero
;
3644 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
3647 /*----------------------------------------------------------------------------
3648 | Returns the result of converting the single-precision floating-point value
3649 | `a' to the extended double-precision floating-point format. The conversion
3650 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3652 *----------------------------------------------------------------------------*/
3654 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
3660 a
= float32_squash_input_denormal(a
, status
);
3661 aSig
= extractFloat32Frac( a
);
3662 aExp
= extractFloat32Exp( a
);
3663 aSign
= extractFloat32Sign( a
);
3664 if ( aExp
== 0xFF ) {
3666 return commonNaNToFloatx80(float32ToCommonNaN(a
, status
), status
);
3668 return packFloatx80(aSign
,
3669 floatx80_infinity_high
,
3670 floatx80_infinity_low
);
3673 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
3674 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3677 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
3681 /*----------------------------------------------------------------------------
3682 | Returns the result of converting the single-precision floating-point value
3683 | `a' to the double-precision floating-point format. The conversion is
3684 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3686 *----------------------------------------------------------------------------*/
3688 float128
float32_to_float128(float32 a
, float_status
*status
)
3694 a
= float32_squash_input_denormal(a
, status
);
3695 aSig
= extractFloat32Frac( a
);
3696 aExp
= extractFloat32Exp( a
);
3697 aSign
= extractFloat32Sign( a
);
3698 if ( aExp
== 0xFF ) {
3700 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
3702 return packFloat128( aSign
, 0x7FFF, 0, 0 );
3705 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
3706 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3709 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
3713 /*----------------------------------------------------------------------------
3714 | Returns the remainder of the single-precision floating-point value `a'
3715 | with respect to the corresponding value `b'. The operation is performed
3716 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3717 *----------------------------------------------------------------------------*/
3719 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
3722 int aExp
, bExp
, expDiff
;
3723 uint32_t aSig
, bSig
;
3725 uint64_t aSig64
, bSig64
, q64
;
3726 uint32_t alternateASig
;
3728 a
= float32_squash_input_denormal(a
, status
);
3729 b
= float32_squash_input_denormal(b
, status
);
3731 aSig
= extractFloat32Frac( a
);
3732 aExp
= extractFloat32Exp( a
);
3733 aSign
= extractFloat32Sign( a
);
3734 bSig
= extractFloat32Frac( b
);
3735 bExp
= extractFloat32Exp( b
);
3736 if ( aExp
== 0xFF ) {
3737 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
3738 return propagateFloat32NaN(a
, b
, status
);
3740 float_raise(float_flag_invalid
, status
);
3741 return float32_default_nan(status
);
3743 if ( bExp
== 0xFF ) {
3745 return propagateFloat32NaN(a
, b
, status
);
3751 float_raise(float_flag_invalid
, status
);
3752 return float32_default_nan(status
);
3754 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
3757 if ( aSig
== 0 ) return a
;
3758 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3760 expDiff
= aExp
- bExp
;
3763 if ( expDiff
< 32 ) {
3766 if ( expDiff
< 0 ) {
3767 if ( expDiff
< -1 ) return a
;
3770 q
= ( bSig
<= aSig
);
3771 if ( q
) aSig
-= bSig
;
3772 if ( 0 < expDiff
) {
3773 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
3776 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3784 if ( bSig
<= aSig
) aSig
-= bSig
;
3785 aSig64
= ( (uint64_t) aSig
)<<40;
3786 bSig64
= ( (uint64_t) bSig
)<<40;
3788 while ( 0 < expDiff
) {
3789 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
3790 q64
= ( 2 < q64
) ? q64
- 2 : 0;
3791 aSig64
= - ( ( bSig
* q64
)<<38 );
3795 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
3796 q64
= ( 2 < q64
) ? q64
- 2 : 0;
3797 q
= q64
>>( 64 - expDiff
);
3799 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
3802 alternateASig
= aSig
;
3805 } while ( 0 <= (int32_t) aSig
);
3806 sigMean
= aSig
+ alternateASig
;
3807 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3808 aSig
= alternateASig
;
3810 zSign
= ( (int32_t) aSig
< 0 );
3811 if ( zSign
) aSig
= - aSig
;
3812 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
3817 /*----------------------------------------------------------------------------
3818 | Returns the binary exponential of the single-precision floating-point value
3819 | `a'. The operation is performed according to the IEC/IEEE Standard for
3820 | Binary Floating-Point Arithmetic.
3822 | Uses the following identities:
3824 | 1. -------------------------------------------------------------------------
3828 | 2. -------------------------------------------------------------------------
3831 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3833 *----------------------------------------------------------------------------*/
3835 static const float64 float32_exp2_coefficients
[15] =
3837 const_float64( 0x3ff0000000000000ll
), /* 1 */
3838 const_float64( 0x3fe0000000000000ll
), /* 2 */
3839 const_float64( 0x3fc5555555555555ll
), /* 3 */
3840 const_float64( 0x3fa5555555555555ll
), /* 4 */
3841 const_float64( 0x3f81111111111111ll
), /* 5 */
3842 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
3843 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
3844 const_float64( 0x3efa01a01a01a01all
), /* 8 */
3845 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
3846 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
3847 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
3848 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
3849 const_float64( 0x3de6124613a86d09ll
), /* 13 */
3850 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
3851 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
3854 float32
float32_exp2(float32 a
, float_status
*status
)
3861 a
= float32_squash_input_denormal(a
, status
);
3863 aSig
= extractFloat32Frac( a
);
3864 aExp
= extractFloat32Exp( a
);
3865 aSign
= extractFloat32Sign( a
);
3867 if ( aExp
== 0xFF) {
3869 return propagateFloat32NaN(a
, float32_zero
, status
);
3871 return (aSign
) ? float32_zero
: a
;
3874 if (aSig
== 0) return float32_one
;
3877 float_raise(float_flag_inexact
, status
);
3879 /* ******************************* */
3880 /* using float64 for approximation */
3881 /* ******************************* */
3882 x
= float32_to_float64(a
, status
);
3883 x
= float64_mul(x
, float64_ln2
, status
);
3887 for (i
= 0 ; i
< 15 ; i
++) {
3890 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
3891 r
= float64_add(r
, f
, status
);
3893 xn
= float64_mul(xn
, x
, status
);
3896 return float64_to_float32(r
, status
);
3899 /*----------------------------------------------------------------------------
3900 | Returns the binary log of the single-precision floating-point value `a'.
3901 | The operation is performed according to the IEC/IEEE Standard for Binary
3902 | Floating-Point Arithmetic.
3903 *----------------------------------------------------------------------------*/
3904 float32
float32_log2(float32 a
, float_status
*status
)
3908 uint32_t aSig
, zSig
, i
;
3910 a
= float32_squash_input_denormal(a
, status
);
3911 aSig
= extractFloat32Frac( a
);
3912 aExp
= extractFloat32Exp( a
);
3913 aSign
= extractFloat32Sign( a
);
3916 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
3917 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3920 float_raise(float_flag_invalid
, status
);
3921 return float32_default_nan(status
);
3923 if ( aExp
== 0xFF ) {
3925 return propagateFloat32NaN(a
, float32_zero
, status
);
3935 for (i
= 1 << 22; i
> 0; i
>>= 1) {
3936 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
3937 if ( aSig
& 0x01000000 ) {
3946 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
3949 /*----------------------------------------------------------------------------
3950 | Returns 1 if the single-precision floating-point value `a' is equal to
3951 | the corresponding value `b', and 0 otherwise. The invalid exception is
3952 | raised if either operand is a NaN. Otherwise, the comparison is performed
3953 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3954 *----------------------------------------------------------------------------*/
3956 int float32_eq(float32 a
, float32 b
, float_status
*status
)
3959 a
= float32_squash_input_denormal(a
, status
);
3960 b
= float32_squash_input_denormal(b
, status
);
3962 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3963 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3965 float_raise(float_flag_invalid
, status
);
3968 av
= float32_val(a
);
3969 bv
= float32_val(b
);
3970 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3973 /*----------------------------------------------------------------------------
3974 | Returns 1 if the single-precision floating-point value `a' is less than
3975 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3976 | exception is raised if either operand is a NaN. The comparison is performed
3977 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3978 *----------------------------------------------------------------------------*/
3980 int float32_le(float32 a
, float32 b
, float_status
*status
)
3984 a
= float32_squash_input_denormal(a
, status
);
3985 b
= float32_squash_input_denormal(b
, status
);
3987 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3988 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3990 float_raise(float_flag_invalid
, status
);
3993 aSign
= extractFloat32Sign( a
);
3994 bSign
= extractFloat32Sign( b
);
3995 av
= float32_val(a
);
3996 bv
= float32_val(b
);
3997 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3998 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4002 /*----------------------------------------------------------------------------
4003 | Returns 1 if the single-precision floating-point value `a' is less than
4004 | the corresponding value `b', and 0 otherwise. The invalid exception is
4005 | raised if either operand is a NaN. The comparison is performed according
4006 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4007 *----------------------------------------------------------------------------*/
4009 int float32_lt(float32 a
, float32 b
, float_status
*status
)
4013 a
= float32_squash_input_denormal(a
, status
);
4014 b
= float32_squash_input_denormal(b
, status
);
4016 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4017 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4019 float_raise(float_flag_invalid
, status
);
4022 aSign
= extractFloat32Sign( a
);
4023 bSign
= extractFloat32Sign( b
);
4024 av
= float32_val(a
);
4025 bv
= float32_val(b
);
4026 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
4027 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4031 /*----------------------------------------------------------------------------
4032 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4033 | be compared, and 0 otherwise. The invalid exception is raised if either
4034 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4035 | Standard for Binary Floating-Point Arithmetic.
4036 *----------------------------------------------------------------------------*/
4038 int float32_unordered(float32 a
, float32 b
, float_status
*status
)
4040 a
= float32_squash_input_denormal(a
, status
);
4041 b
= float32_squash_input_denormal(b
, status
);
4043 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4044 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4046 float_raise(float_flag_invalid
, status
);
4052 /*----------------------------------------------------------------------------
4053 | Returns 1 if the single-precision floating-point value `a' is equal to
4054 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4055 | exception. The comparison is performed according to the IEC/IEEE Standard
4056 | for Binary Floating-Point Arithmetic.
4057 *----------------------------------------------------------------------------*/
4059 int float32_eq_quiet(float32 a
, float32 b
, float_status
*status
)
4061 a
= float32_squash_input_denormal(a
, status
);
4062 b
= float32_squash_input_denormal(b
, status
);
4064 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4065 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4067 if (float32_is_signaling_nan(a
, status
)
4068 || float32_is_signaling_nan(b
, status
)) {
4069 float_raise(float_flag_invalid
, status
);
4073 return ( float32_val(a
) == float32_val(b
) ) ||
4074 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
4077 /*----------------------------------------------------------------------------
4078 | Returns 1 if the single-precision floating-point value `a' is less than or
4079 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4080 | cause an exception. Otherwise, the comparison is performed according to the
4081 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4082 *----------------------------------------------------------------------------*/
4084 int float32_le_quiet(float32 a
, float32 b
, float_status
*status
)
4088 a
= float32_squash_input_denormal(a
, status
);
4089 b
= float32_squash_input_denormal(b
, status
);
4091 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4092 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4094 if (float32_is_signaling_nan(a
, status
)
4095 || float32_is_signaling_nan(b
, status
)) {
4096 float_raise(float_flag_invalid
, status
);
4100 aSign
= extractFloat32Sign( a
);
4101 bSign
= extractFloat32Sign( b
);
4102 av
= float32_val(a
);
4103 bv
= float32_val(b
);
4104 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
4105 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4109 /*----------------------------------------------------------------------------
4110 | Returns 1 if the single-precision floating-point value `a' is less than
4111 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4112 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4113 | Standard for Binary Floating-Point Arithmetic.
4114 *----------------------------------------------------------------------------*/
4116 int float32_lt_quiet(float32 a
, float32 b
, float_status
*status
)
4120 a
= float32_squash_input_denormal(a
, status
);
4121 b
= float32_squash_input_denormal(b
, status
);
4123 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4124 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4126 if (float32_is_signaling_nan(a
, status
)
4127 || float32_is_signaling_nan(b
, status
)) {
4128 float_raise(float_flag_invalid
, status
);
4132 aSign
= extractFloat32Sign( a
);
4133 bSign
= extractFloat32Sign( b
);
4134 av
= float32_val(a
);
4135 bv
= float32_val(b
);
4136 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
4137 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4141 /*----------------------------------------------------------------------------
4142 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4143 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4144 | comparison is performed according to the IEC/IEEE Standard for Binary
4145 | Floating-Point Arithmetic.
4146 *----------------------------------------------------------------------------*/
4148 int float32_unordered_quiet(float32 a
, float32 b
, float_status
*status
)
4150 a
= float32_squash_input_denormal(a
, status
);
4151 b
= float32_squash_input_denormal(b
, status
);
4153 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4154 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4156 if (float32_is_signaling_nan(a
, status
)
4157 || float32_is_signaling_nan(b
, status
)) {
4158 float_raise(float_flag_invalid
, status
);
4165 /*----------------------------------------------------------------------------
4166 | If `a' is denormal and we are in flush-to-zero mode then set the
4167 | input-denormal exception and return zero. Otherwise just return the value.
4168 *----------------------------------------------------------------------------*/
4169 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
4171 if (status
->flush_inputs_to_zero
) {
4172 if (extractFloat16Exp(a
) == 0 && extractFloat16Frac(a
) != 0) {
4173 float_raise(float_flag_input_denormal
, status
);
4174 return make_float16(float16_val(a
) & 0x8000);
4180 /*----------------------------------------------------------------------------
4181 | Returns the result of converting the double-precision floating-point value
4182 | `a' to the extended double-precision floating-point format. The conversion
4183 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4185 *----------------------------------------------------------------------------*/
4187 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
4193 a
= float64_squash_input_denormal(a
, status
);
4194 aSig
= extractFloat64Frac( a
);
4195 aExp
= extractFloat64Exp( a
);
4196 aSign
= extractFloat64Sign( a
);
4197 if ( aExp
== 0x7FF ) {
4199 return commonNaNToFloatx80(float64ToCommonNaN(a
, status
), status
);
4201 return packFloatx80(aSign
,
4202 floatx80_infinity_high
,
4203 floatx80_infinity_low
);
4206 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4207 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4211 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
4215 /*----------------------------------------------------------------------------
4216 | Returns the result of converting the double-precision floating-point value
4217 | `a' to the quadruple-precision floating-point format. The conversion is
4218 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4220 *----------------------------------------------------------------------------*/
4222 float128
float64_to_float128(float64 a
, float_status
*status
)
4226 uint64_t aSig
, zSig0
, zSig1
;
4228 a
= float64_squash_input_denormal(a
, status
);
4229 aSig
= extractFloat64Frac( a
);
4230 aExp
= extractFloat64Exp( a
);
4231 aSign
= extractFloat64Sign( a
);
4232 if ( aExp
== 0x7FF ) {
4234 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
4236 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4239 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4240 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4243 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
4244 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
4249 /*----------------------------------------------------------------------------
4250 | Returns the remainder of the double-precision floating-point value `a'
4251 | with respect to the corresponding value `b'. The operation is performed
4252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4253 *----------------------------------------------------------------------------*/
4255 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
4258 int aExp
, bExp
, expDiff
;
4259 uint64_t aSig
, bSig
;
4260 uint64_t q
, alternateASig
;
4263 a
= float64_squash_input_denormal(a
, status
);
4264 b
= float64_squash_input_denormal(b
, status
);
4265 aSig
= extractFloat64Frac( a
);
4266 aExp
= extractFloat64Exp( a
);
4267 aSign
= extractFloat64Sign( a
);
4268 bSig
= extractFloat64Frac( b
);
4269 bExp
= extractFloat64Exp( b
);
4270 if ( aExp
== 0x7FF ) {
4271 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4272 return propagateFloat64NaN(a
, b
, status
);
4274 float_raise(float_flag_invalid
, status
);
4275 return float64_default_nan(status
);
4277 if ( bExp
== 0x7FF ) {
4279 return propagateFloat64NaN(a
, b
, status
);
4285 float_raise(float_flag_invalid
, status
);
4286 return float64_default_nan(status
);
4288 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4291 if ( aSig
== 0 ) return a
;
4292 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4294 expDiff
= aExp
- bExp
;
4295 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
4296 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4297 if ( expDiff
< 0 ) {
4298 if ( expDiff
< -1 ) return a
;
4301 q
= ( bSig
<= aSig
);
4302 if ( q
) aSig
-= bSig
;
4304 while ( 0 < expDiff
) {
4305 q
= estimateDiv128To64( aSig
, 0, bSig
);
4306 q
= ( 2 < q
) ? q
- 2 : 0;
4307 aSig
= - ( ( bSig
>>2 ) * q
);
4311 if ( 0 < expDiff
) {
4312 q
= estimateDiv128To64( aSig
, 0, bSig
);
4313 q
= ( 2 < q
) ? q
- 2 : 0;
4316 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4323 alternateASig
= aSig
;
4326 } while ( 0 <= (int64_t) aSig
);
4327 sigMean
= aSig
+ alternateASig
;
4328 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4329 aSig
= alternateASig
;
4331 zSign
= ( (int64_t) aSig
< 0 );
4332 if ( zSign
) aSig
= - aSig
;
4333 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
4337 /*----------------------------------------------------------------------------
4338 | Returns the binary log of the double-precision floating-point value `a'.
4339 | The operation is performed according to the IEC/IEEE Standard for Binary
4340 | Floating-Point Arithmetic.
4341 *----------------------------------------------------------------------------*/
4342 float64
float64_log2(float64 a
, float_status
*status
)
4346 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
4347 a
= float64_squash_input_denormal(a
, status
);
4349 aSig
= extractFloat64Frac( a
);
4350 aExp
= extractFloat64Exp( a
);
4351 aSign
= extractFloat64Sign( a
);
4354 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
4355 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4358 float_raise(float_flag_invalid
, status
);
4359 return float64_default_nan(status
);
4361 if ( aExp
== 0x7FF ) {
4363 return propagateFloat64NaN(a
, float64_zero
, status
);
4369 aSig
|= LIT64( 0x0010000000000000 );
4371 zSig
= (uint64_t)aExp
<< 52;
4372 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
4373 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
4374 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
4375 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
4383 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
4386 /*----------------------------------------------------------------------------
4387 | Returns 1 if the double-precision floating-point value `a' is equal to the
4388 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4389 | if either operand is a NaN. Otherwise, the comparison is performed
4390 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4391 *----------------------------------------------------------------------------*/
4393 int float64_eq(float64 a
, float64 b
, float_status
*status
)
4396 a
= float64_squash_input_denormal(a
, status
);
4397 b
= float64_squash_input_denormal(b
, status
);
4399 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4400 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4402 float_raise(float_flag_invalid
, status
);
4405 av
= float64_val(a
);
4406 bv
= float64_val(b
);
4407 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4411 /*----------------------------------------------------------------------------
4412 | Returns 1 if the double-precision floating-point value `a' is less than or
4413 | equal to the corresponding value `b', and 0 otherwise. The invalid
4414 | exception is raised if either operand is a NaN. The comparison is performed
4415 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4416 *----------------------------------------------------------------------------*/
4418 int float64_le(float64 a
, float64 b
, float_status
*status
)
4422 a
= float64_squash_input_denormal(a
, status
);
4423 b
= float64_squash_input_denormal(b
, status
);
4425 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4426 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4428 float_raise(float_flag_invalid
, status
);
4431 aSign
= extractFloat64Sign( a
);
4432 bSign
= extractFloat64Sign( b
);
4433 av
= float64_val(a
);
4434 bv
= float64_val(b
);
4435 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4436 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4440 /*----------------------------------------------------------------------------
4441 | Returns 1 if the double-precision floating-point value `a' is less than
4442 | the corresponding value `b', and 0 otherwise. The invalid exception is
4443 | raised if either operand is a NaN. The comparison is performed according
4444 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4445 *----------------------------------------------------------------------------*/
4447 int float64_lt(float64 a
, float64 b
, float_status
*status
)
4452 a
= float64_squash_input_denormal(a
, status
);
4453 b
= float64_squash_input_denormal(b
, status
);
4454 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4455 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4457 float_raise(float_flag_invalid
, status
);
4460 aSign
= extractFloat64Sign( a
);
4461 bSign
= extractFloat64Sign( b
);
4462 av
= float64_val(a
);
4463 bv
= float64_val(b
);
4464 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4465 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4469 /*----------------------------------------------------------------------------
4470 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4471 | be compared, and 0 otherwise. The invalid exception is raised if either
4472 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4473 | Standard for Binary Floating-Point Arithmetic.
4474 *----------------------------------------------------------------------------*/
4476 int float64_unordered(float64 a
, float64 b
, float_status
*status
)
4478 a
= float64_squash_input_denormal(a
, status
);
4479 b
= float64_squash_input_denormal(b
, status
);
4481 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4482 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4484 float_raise(float_flag_invalid
, status
);
4490 /*----------------------------------------------------------------------------
4491 | Returns 1 if the double-precision floating-point value `a' is equal to the
4492 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4493 | exception.The comparison is performed according to the IEC/IEEE Standard
4494 | for Binary Floating-Point Arithmetic.
4495 *----------------------------------------------------------------------------*/
4497 int float64_eq_quiet(float64 a
, float64 b
, float_status
*status
)
4500 a
= float64_squash_input_denormal(a
, status
);
4501 b
= float64_squash_input_denormal(b
, status
);
4503 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4504 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4506 if (float64_is_signaling_nan(a
, status
)
4507 || float64_is_signaling_nan(b
, status
)) {
4508 float_raise(float_flag_invalid
, status
);
4512 av
= float64_val(a
);
4513 bv
= float64_val(b
);
4514 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4518 /*----------------------------------------------------------------------------
4519 | Returns 1 if the double-precision floating-point value `a' is less than or
4520 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4521 | cause an exception. Otherwise, the comparison is performed according to the
4522 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4523 *----------------------------------------------------------------------------*/
4525 int float64_le_quiet(float64 a
, float64 b
, float_status
*status
)
4529 a
= float64_squash_input_denormal(a
, status
);
4530 b
= float64_squash_input_denormal(b
, status
);
4532 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4533 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4535 if (float64_is_signaling_nan(a
, status
)
4536 || float64_is_signaling_nan(b
, status
)) {
4537 float_raise(float_flag_invalid
, status
);
4541 aSign
= extractFloat64Sign( a
);
4542 bSign
= extractFloat64Sign( b
);
4543 av
= float64_val(a
);
4544 bv
= float64_val(b
);
4545 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4546 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4550 /*----------------------------------------------------------------------------
4551 | Returns 1 if the double-precision floating-point value `a' is less than
4552 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4553 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4554 | Standard for Binary Floating-Point Arithmetic.
4555 *----------------------------------------------------------------------------*/
4557 int float64_lt_quiet(float64 a
, float64 b
, float_status
*status
)
4561 a
= float64_squash_input_denormal(a
, status
);
4562 b
= float64_squash_input_denormal(b
, status
);
4564 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4565 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4567 if (float64_is_signaling_nan(a
, status
)
4568 || float64_is_signaling_nan(b
, status
)) {
4569 float_raise(float_flag_invalid
, status
);
4573 aSign
= extractFloat64Sign( a
);
4574 bSign
= extractFloat64Sign( b
);
4575 av
= float64_val(a
);
4576 bv
= float64_val(b
);
4577 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4578 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4582 /*----------------------------------------------------------------------------
4583 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4584 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4585 | comparison is performed according to the IEC/IEEE Standard for Binary
4586 | Floating-Point Arithmetic.
4587 *----------------------------------------------------------------------------*/
4589 int float64_unordered_quiet(float64 a
, float64 b
, float_status
*status
)
4591 a
= float64_squash_input_denormal(a
, status
);
4592 b
= float64_squash_input_denormal(b
, status
);
4594 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4595 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4597 if (float64_is_signaling_nan(a
, status
)
4598 || float64_is_signaling_nan(b
, status
)) {
4599 float_raise(float_flag_invalid
, status
);
4606 /*----------------------------------------------------------------------------
4607 | Returns the result of converting the extended double-precision floating-
4608 | point value `a' to the 32-bit two's complement integer format. The
4609 | conversion is performed according to the IEC/IEEE Standard for Binary
4610 | Floating-Point Arithmetic---which means in particular that the conversion
4611 | is rounded according to the current rounding mode. If `a' is a NaN, the
4612 | largest positive integer is returned. Otherwise, if the conversion
4613 | overflows, the largest integer with the same sign as `a' is returned.
4614 *----------------------------------------------------------------------------*/
4616 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
4619 int32_t aExp
, shiftCount
;
4622 if (floatx80_invalid_encoding(a
)) {
4623 float_raise(float_flag_invalid
, status
);
4626 aSig
= extractFloatx80Frac( a
);
4627 aExp
= extractFloatx80Exp( a
);
4628 aSign
= extractFloatx80Sign( a
);
4629 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4630 shiftCount
= 0x4037 - aExp
;
4631 if ( shiftCount
<= 0 ) shiftCount
= 1;
4632 shift64RightJamming( aSig
, shiftCount
, &aSig
);
4633 return roundAndPackInt32(aSign
, aSig
, status
);
4637 /*----------------------------------------------------------------------------
4638 | Returns the result of converting the extended double-precision floating-
4639 | point value `a' to the 32-bit two's complement integer format. The
4640 | conversion is performed according to the IEC/IEEE Standard for Binary
4641 | Floating-Point Arithmetic, except that the conversion is always rounded
4642 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4643 | Otherwise, if the conversion overflows, the largest integer with the same
4644 | sign as `a' is returned.
4645 *----------------------------------------------------------------------------*/
4647 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
4650 int32_t aExp
, shiftCount
;
4651 uint64_t aSig
, savedASig
;
4654 if (floatx80_invalid_encoding(a
)) {
4655 float_raise(float_flag_invalid
, status
);
4658 aSig
= extractFloatx80Frac( a
);
4659 aExp
= extractFloatx80Exp( a
);
4660 aSign
= extractFloatx80Sign( a
);
4661 if ( 0x401E < aExp
) {
4662 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4665 else if ( aExp
< 0x3FFF ) {
4667 status
->float_exception_flags
|= float_flag_inexact
;
4671 shiftCount
= 0x403E - aExp
;
4673 aSig
>>= shiftCount
;
4675 if ( aSign
) z
= - z
;
4676 if ( ( z
< 0 ) ^ aSign
) {
4678 float_raise(float_flag_invalid
, status
);
4679 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4681 if ( ( aSig
<<shiftCount
) != savedASig
) {
4682 status
->float_exception_flags
|= float_flag_inexact
;
4688 /*----------------------------------------------------------------------------
4689 | Returns the result of converting the extended double-precision floating-
4690 | point value `a' to the 64-bit two's complement integer format. The
4691 | conversion is performed according to the IEC/IEEE Standard for Binary
4692 | Floating-Point Arithmetic---which means in particular that the conversion
4693 | is rounded according to the current rounding mode. If `a' is a NaN,
4694 | the largest positive integer is returned. Otherwise, if the conversion
4695 | overflows, the largest integer with the same sign as `a' is returned.
4696 *----------------------------------------------------------------------------*/
4698 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
4701 int32_t aExp
, shiftCount
;
4702 uint64_t aSig
, aSigExtra
;
4704 if (floatx80_invalid_encoding(a
)) {
4705 float_raise(float_flag_invalid
, status
);
4708 aSig
= extractFloatx80Frac( a
);
4709 aExp
= extractFloatx80Exp( a
);
4710 aSign
= extractFloatx80Sign( a
);
4711 shiftCount
= 0x403E - aExp
;
4712 if ( shiftCount
<= 0 ) {
4714 float_raise(float_flag_invalid
, status
);
4715 if (!aSign
|| floatx80_is_any_nan(a
)) {
4716 return LIT64( 0x7FFFFFFFFFFFFFFF );
4718 return (int64_t) LIT64( 0x8000000000000000 );
4723 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
4725 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
4729 /*----------------------------------------------------------------------------
4730 | Returns the result of converting the extended double-precision floating-
4731 | point value `a' to the 64-bit two's complement integer format. The
4732 | conversion is performed according to the IEC/IEEE Standard for Binary
4733 | Floating-Point Arithmetic, except that the conversion is always rounded
4734 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4735 | Otherwise, if the conversion overflows, the largest integer with the same
4736 | sign as `a' is returned.
4737 *----------------------------------------------------------------------------*/
4739 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
4742 int32_t aExp
, shiftCount
;
4746 if (floatx80_invalid_encoding(a
)) {
4747 float_raise(float_flag_invalid
, status
);
4750 aSig
= extractFloatx80Frac( a
);
4751 aExp
= extractFloatx80Exp( a
);
4752 aSign
= extractFloatx80Sign( a
);
4753 shiftCount
= aExp
- 0x403E;
4754 if ( 0 <= shiftCount
) {
4755 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
4756 if ( ( a
.high
!= 0xC03E ) || aSig
) {
4757 float_raise(float_flag_invalid
, status
);
4758 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
4759 return LIT64( 0x7FFFFFFFFFFFFFFF );
4762 return (int64_t) LIT64( 0x8000000000000000 );
4764 else if ( aExp
< 0x3FFF ) {
4766 status
->float_exception_flags
|= float_flag_inexact
;
4770 z
= aSig
>>( - shiftCount
);
4771 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
4772 status
->float_exception_flags
|= float_flag_inexact
;
4774 if ( aSign
) z
= - z
;
4779 /*----------------------------------------------------------------------------
4780 | Returns the result of converting the extended double-precision floating-
4781 | point value `a' to the single-precision floating-point format. The
4782 | conversion is performed according to the IEC/IEEE Standard for Binary
4783 | Floating-Point Arithmetic.
4784 *----------------------------------------------------------------------------*/
4786 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
4792 if (floatx80_invalid_encoding(a
)) {
4793 float_raise(float_flag_invalid
, status
);
4794 return float32_default_nan(status
);
4796 aSig
= extractFloatx80Frac( a
);
4797 aExp
= extractFloatx80Exp( a
);
4798 aSign
= extractFloatx80Sign( a
);
4799 if ( aExp
== 0x7FFF ) {
4800 if ( (uint64_t) ( aSig
<<1 ) ) {
4801 return commonNaNToFloat32(floatx80ToCommonNaN(a
, status
), status
);
4803 return packFloat32( aSign
, 0xFF, 0 );
4805 shift64RightJamming( aSig
, 33, &aSig
);
4806 if ( aExp
|| aSig
) aExp
-= 0x3F81;
4807 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
4811 /*----------------------------------------------------------------------------
4812 | Returns the result of converting the extended double-precision floating-
4813 | point value `a' to the double-precision floating-point format. The
4814 | conversion is performed according to the IEC/IEEE Standard for Binary
4815 | Floating-Point Arithmetic.
4816 *----------------------------------------------------------------------------*/
4818 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
4822 uint64_t aSig
, zSig
;
4824 if (floatx80_invalid_encoding(a
)) {
4825 float_raise(float_flag_invalid
, status
);
4826 return float64_default_nan(status
);
4828 aSig
= extractFloatx80Frac( a
);
4829 aExp
= extractFloatx80Exp( a
);
4830 aSign
= extractFloatx80Sign( a
);
4831 if ( aExp
== 0x7FFF ) {
4832 if ( (uint64_t) ( aSig
<<1 ) ) {
4833 return commonNaNToFloat64(floatx80ToCommonNaN(a
, status
), status
);
4835 return packFloat64( aSign
, 0x7FF, 0 );
4837 shift64RightJamming( aSig
, 1, &zSig
);
4838 if ( aExp
|| aSig
) aExp
-= 0x3C01;
4839 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
4843 /*----------------------------------------------------------------------------
4844 | Returns the result of converting the extended double-precision floating-
4845 | point value `a' to the quadruple-precision floating-point format. The
4846 | conversion is performed according to the IEC/IEEE Standard for Binary
4847 | Floating-Point Arithmetic.
4848 *----------------------------------------------------------------------------*/
4850 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
4854 uint64_t aSig
, zSig0
, zSig1
;
4856 if (floatx80_invalid_encoding(a
)) {
4857 float_raise(float_flag_invalid
, status
);
4858 return float128_default_nan(status
);
4860 aSig
= extractFloatx80Frac( a
);
4861 aExp
= extractFloatx80Exp( a
);
4862 aSign
= extractFloatx80Sign( a
);
4863 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
4864 return commonNaNToFloat128(floatx80ToCommonNaN(a
, status
), status
);
4866 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
4867 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
4871 /*----------------------------------------------------------------------------
4872 | Rounds the extended double-precision floating-point value `a'
4873 | to the precision provided by floatx80_rounding_precision and returns the
4874 | result as an extended double-precision floating-point value.
4875 | The operation is performed according to the IEC/IEEE Standard for Binary
4876 | Floating-Point Arithmetic.
4877 *----------------------------------------------------------------------------*/
4879 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
4881 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
4882 extractFloatx80Sign(a
),
4883 extractFloatx80Exp(a
),
4884 extractFloatx80Frac(a
), 0, status
);
4887 /*----------------------------------------------------------------------------
4888 | Rounds the extended double-precision floating-point value `a' to an integer,
4889 | and returns the result as an extended quadruple-precision floating-point
4890 | value. The operation is performed according to the IEC/IEEE Standard for
4891 | Binary Floating-Point Arithmetic.
4892 *----------------------------------------------------------------------------*/
4894 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
4898 uint64_t lastBitMask
, roundBitsMask
;
4901 if (floatx80_invalid_encoding(a
)) {
4902 float_raise(float_flag_invalid
, status
);
4903 return floatx80_default_nan(status
);
4905 aExp
= extractFloatx80Exp( a
);
4906 if ( 0x403E <= aExp
) {
4907 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
4908 return propagateFloatx80NaN(a
, a
, status
);
4912 if ( aExp
< 0x3FFF ) {
4914 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
4917 status
->float_exception_flags
|= float_flag_inexact
;
4918 aSign
= extractFloatx80Sign( a
);
4919 switch (status
->float_rounding_mode
) {
4920 case float_round_nearest_even
:
4921 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
4924 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
4927 case float_round_ties_away
:
4928 if (aExp
== 0x3FFE) {
4929 return packFloatx80(aSign
, 0x3FFF, LIT64(0x8000000000000000));
4932 case float_round_down
:
4935 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4936 : packFloatx80( 0, 0, 0 );
4937 case float_round_up
:
4939 aSign
? packFloatx80( 1, 0, 0 )
4940 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4942 return packFloatx80( aSign
, 0, 0 );
4945 lastBitMask
<<= 0x403E - aExp
;
4946 roundBitsMask
= lastBitMask
- 1;
4948 switch (status
->float_rounding_mode
) {
4949 case float_round_nearest_even
:
4950 z
.low
+= lastBitMask
>>1;
4951 if ((z
.low
& roundBitsMask
) == 0) {
4952 z
.low
&= ~lastBitMask
;
4955 case float_round_ties_away
:
4956 z
.low
+= lastBitMask
>> 1;
4958 case float_round_to_zero
:
4960 case float_round_up
:
4961 if (!extractFloatx80Sign(z
)) {
4962 z
.low
+= roundBitsMask
;
4965 case float_round_down
:
4966 if (extractFloatx80Sign(z
)) {
4967 z
.low
+= roundBitsMask
;
4973 z
.low
&= ~ roundBitsMask
;
4976 z
.low
= LIT64( 0x8000000000000000 );
4978 if (z
.low
!= a
.low
) {
4979 status
->float_exception_flags
|= float_flag_inexact
;
4985 /*----------------------------------------------------------------------------
4986 | Returns the result of adding the absolute values of the extended double-
4987 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4988 | negated before being returned. `zSign' is ignored if the result is a NaN.
4989 | The addition is performed according to the IEC/IEEE Standard for Binary
4990 | Floating-Point Arithmetic.
4991 *----------------------------------------------------------------------------*/
4993 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
4994 float_status
*status
)
4996 int32_t aExp
, bExp
, zExp
;
4997 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5000 aSig
= extractFloatx80Frac( a
);
5001 aExp
= extractFloatx80Exp( a
);
5002 bSig
= extractFloatx80Frac( b
);
5003 bExp
= extractFloatx80Exp( b
);
5004 expDiff
= aExp
- bExp
;
5005 if ( 0 < expDiff
) {
5006 if ( aExp
== 0x7FFF ) {
5007 if ((uint64_t)(aSig
<< 1)) {
5008 return propagateFloatx80NaN(a
, b
, status
);
5012 if ( bExp
== 0 ) --expDiff
;
5013 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5016 else if ( expDiff
< 0 ) {
5017 if ( bExp
== 0x7FFF ) {
5018 if ((uint64_t)(bSig
<< 1)) {
5019 return propagateFloatx80NaN(a
, b
, status
);
5021 return packFloatx80(zSign
,
5022 floatx80_infinity_high
,
5023 floatx80_infinity_low
);
5025 if ( aExp
== 0 ) ++expDiff
;
5026 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5030 if ( aExp
== 0x7FFF ) {
5031 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5032 return propagateFloatx80NaN(a
, b
, status
);
5037 zSig0
= aSig
+ bSig
;
5039 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5045 zSig0
= aSig
+ bSig
;
5046 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5048 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5049 zSig0
|= LIT64( 0x8000000000000000 );
5052 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5053 zSign
, zExp
, zSig0
, zSig1
, status
);
5056 /*----------------------------------------------------------------------------
5057 | Returns the result of subtracting the absolute values of the extended
5058 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5059 | difference is negated before being returned. `zSign' is ignored if the
5060 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5061 | Standard for Binary Floating-Point Arithmetic.
5062 *----------------------------------------------------------------------------*/
5064 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5065 float_status
*status
)
5067 int32_t aExp
, bExp
, zExp
;
5068 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5071 aSig
= extractFloatx80Frac( a
);
5072 aExp
= extractFloatx80Exp( a
);
5073 bSig
= extractFloatx80Frac( b
);
5074 bExp
= extractFloatx80Exp( b
);
5075 expDiff
= aExp
- bExp
;
5076 if ( 0 < expDiff
) goto aExpBigger
;
5077 if ( expDiff
< 0 ) goto bExpBigger
;
5078 if ( aExp
== 0x7FFF ) {
5079 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5080 return propagateFloatx80NaN(a
, b
, status
);
5082 float_raise(float_flag_invalid
, status
);
5083 return floatx80_default_nan(status
);
5090 if ( bSig
< aSig
) goto aBigger
;
5091 if ( aSig
< bSig
) goto bBigger
;
5092 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5094 if ( bExp
== 0x7FFF ) {
5095 if ((uint64_t)(bSig
<< 1)) {
5096 return propagateFloatx80NaN(a
, b
, status
);
5098 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
5099 floatx80_infinity_low
);
5101 if ( aExp
== 0 ) ++expDiff
;
5102 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5104 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5107 goto normalizeRoundAndPack
;
5109 if ( aExp
== 0x7FFF ) {
5110 if ((uint64_t)(aSig
<< 1)) {
5111 return propagateFloatx80NaN(a
, b
, status
);
5115 if ( bExp
== 0 ) --expDiff
;
5116 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5118 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5120 normalizeRoundAndPack
:
5121 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5122 zSign
, zExp
, zSig0
, zSig1
, status
);
5125 /*----------------------------------------------------------------------------
5126 | Returns the result of adding the extended double-precision floating-point
5127 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5128 | Standard for Binary Floating-Point Arithmetic.
5129 *----------------------------------------------------------------------------*/
5131 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5135 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5136 float_raise(float_flag_invalid
, status
);
5137 return floatx80_default_nan(status
);
5139 aSign
= extractFloatx80Sign( a
);
5140 bSign
= extractFloatx80Sign( b
);
5141 if ( aSign
== bSign
) {
5142 return addFloatx80Sigs(a
, b
, aSign
, status
);
5145 return subFloatx80Sigs(a
, b
, aSign
, status
);
5150 /*----------------------------------------------------------------------------
5151 | Returns the result of subtracting the extended double-precision floating-
5152 | point values `a' and `b'. The operation is performed according to the
5153 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5154 *----------------------------------------------------------------------------*/
5156 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5160 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5161 float_raise(float_flag_invalid
, status
);
5162 return floatx80_default_nan(status
);
5164 aSign
= extractFloatx80Sign( a
);
5165 bSign
= extractFloatx80Sign( b
);
5166 if ( aSign
== bSign
) {
5167 return subFloatx80Sigs(a
, b
, aSign
, status
);
5170 return addFloatx80Sigs(a
, b
, aSign
, status
);
5175 /*----------------------------------------------------------------------------
5176 | Returns the result of multiplying the extended double-precision floating-
5177 | point values `a' and `b'. The operation is performed according to the
5178 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5179 *----------------------------------------------------------------------------*/
5181 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5183 flag aSign
, bSign
, zSign
;
5184 int32_t aExp
, bExp
, zExp
;
5185 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5187 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5188 float_raise(float_flag_invalid
, status
);
5189 return floatx80_default_nan(status
);
5191 aSig
= extractFloatx80Frac( a
);
5192 aExp
= extractFloatx80Exp( a
);
5193 aSign
= extractFloatx80Sign( a
);
5194 bSig
= extractFloatx80Frac( b
);
5195 bExp
= extractFloatx80Exp( b
);
5196 bSign
= extractFloatx80Sign( b
);
5197 zSign
= aSign
^ bSign
;
5198 if ( aExp
== 0x7FFF ) {
5199 if ( (uint64_t) ( aSig
<<1 )
5200 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5201 return propagateFloatx80NaN(a
, b
, status
);
5203 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5204 return packFloatx80(zSign
, floatx80_infinity_high
,
5205 floatx80_infinity_low
);
5207 if ( bExp
== 0x7FFF ) {
5208 if ((uint64_t)(bSig
<< 1)) {
5209 return propagateFloatx80NaN(a
, b
, status
);
5211 if ( ( aExp
| aSig
) == 0 ) {
5213 float_raise(float_flag_invalid
, status
);
5214 return floatx80_default_nan(status
);
5216 return packFloatx80(zSign
, floatx80_infinity_high
,
5217 floatx80_infinity_low
);
5220 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5221 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5224 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5225 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5227 zExp
= aExp
+ bExp
- 0x3FFE;
5228 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5229 if ( 0 < (int64_t) zSig0
) {
5230 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5233 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5234 zSign
, zExp
, zSig0
, zSig1
, status
);
5237 /*----------------------------------------------------------------------------
5238 | Returns the result of dividing the extended double-precision floating-point
5239 | value `a' by the corresponding value `b'. The operation is performed
5240 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5241 *----------------------------------------------------------------------------*/
5243 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
5245 flag aSign
, bSign
, zSign
;
5246 int32_t aExp
, bExp
, zExp
;
5247 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5248 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
5250 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5251 float_raise(float_flag_invalid
, status
);
5252 return floatx80_default_nan(status
);
5254 aSig
= extractFloatx80Frac( a
);
5255 aExp
= extractFloatx80Exp( a
);
5256 aSign
= extractFloatx80Sign( a
);
5257 bSig
= extractFloatx80Frac( b
);
5258 bExp
= extractFloatx80Exp( b
);
5259 bSign
= extractFloatx80Sign( b
);
5260 zSign
= aSign
^ bSign
;
5261 if ( aExp
== 0x7FFF ) {
5262 if ((uint64_t)(aSig
<< 1)) {
5263 return propagateFloatx80NaN(a
, b
, status
);
5265 if ( bExp
== 0x7FFF ) {
5266 if ((uint64_t)(bSig
<< 1)) {
5267 return propagateFloatx80NaN(a
, b
, status
);
5271 return packFloatx80(zSign
, floatx80_infinity_high
,
5272 floatx80_infinity_low
);
5274 if ( bExp
== 0x7FFF ) {
5275 if ((uint64_t)(bSig
<< 1)) {
5276 return propagateFloatx80NaN(a
, b
, status
);
5278 return packFloatx80( zSign
, 0, 0 );
5282 if ( ( aExp
| aSig
) == 0 ) {
5284 float_raise(float_flag_invalid
, status
);
5285 return floatx80_default_nan(status
);
5287 float_raise(float_flag_divbyzero
, status
);
5288 return packFloatx80(zSign
, floatx80_infinity_high
,
5289 floatx80_infinity_low
);
5291 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5294 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5295 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5297 zExp
= aExp
- bExp
+ 0x3FFE;
5299 if ( bSig
<= aSig
) {
5300 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
5303 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
5304 mul64To128( bSig
, zSig0
, &term0
, &term1
);
5305 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
5306 while ( (int64_t) rem0
< 0 ) {
5308 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
5310 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
5311 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
5312 mul64To128( bSig
, zSig1
, &term1
, &term2
);
5313 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5314 while ( (int64_t) rem1
< 0 ) {
5316 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
5318 zSig1
|= ( ( rem1
| rem2
) != 0 );
5320 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5321 zSign
, zExp
, zSig0
, zSig1
, status
);
5324 /*----------------------------------------------------------------------------
5325 | Returns the remainder of the extended double-precision floating-point value
5326 | `a' with respect to the corresponding value `b'. The operation is performed
5327 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5328 *----------------------------------------------------------------------------*/
5330 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
5333 int32_t aExp
, bExp
, expDiff
;
5334 uint64_t aSig0
, aSig1
, bSig
;
5335 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
5337 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5338 float_raise(float_flag_invalid
, status
);
5339 return floatx80_default_nan(status
);
5341 aSig0
= extractFloatx80Frac( a
);
5342 aExp
= extractFloatx80Exp( a
);
5343 aSign
= extractFloatx80Sign( a
);
5344 bSig
= extractFloatx80Frac( b
);
5345 bExp
= extractFloatx80Exp( b
);
5346 if ( aExp
== 0x7FFF ) {
5347 if ( (uint64_t) ( aSig0
<<1 )
5348 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5349 return propagateFloatx80NaN(a
, b
, status
);
5353 if ( bExp
== 0x7FFF ) {
5354 if ((uint64_t)(bSig
<< 1)) {
5355 return propagateFloatx80NaN(a
, b
, status
);
5362 float_raise(float_flag_invalid
, status
);
5363 return floatx80_default_nan(status
);
5365 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5368 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
5369 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5371 bSig
|= LIT64( 0x8000000000000000 );
5373 expDiff
= aExp
- bExp
;
5375 if ( expDiff
< 0 ) {
5376 if ( expDiff
< -1 ) return a
;
5377 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
5380 q
= ( bSig
<= aSig0
);
5381 if ( q
) aSig0
-= bSig
;
5383 while ( 0 < expDiff
) {
5384 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5385 q
= ( 2 < q
) ? q
- 2 : 0;
5386 mul64To128( bSig
, q
, &term0
, &term1
);
5387 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5388 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
5392 if ( 0 < expDiff
) {
5393 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5394 q
= ( 2 < q
) ? q
- 2 : 0;
5396 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
5397 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5398 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
5399 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
5401 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5408 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
5409 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5410 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5413 aSig0
= alternateASig0
;
5414 aSig1
= alternateASig1
;
5418 normalizeRoundAndPackFloatx80(
5419 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
5423 /*----------------------------------------------------------------------------
5424 | Returns the square root of the extended double-precision floating-point
5425 | value `a'. The operation is performed according to the IEC/IEEE Standard
5426 | for Binary Floating-Point Arithmetic.
5427 *----------------------------------------------------------------------------*/
5429 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
5433 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
5434 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5436 if (floatx80_invalid_encoding(a
)) {
5437 float_raise(float_flag_invalid
, status
);
5438 return floatx80_default_nan(status
);
5440 aSig0
= extractFloatx80Frac( a
);
5441 aExp
= extractFloatx80Exp( a
);
5442 aSign
= extractFloatx80Sign( a
);
5443 if ( aExp
== 0x7FFF ) {
5444 if ((uint64_t)(aSig0
<< 1)) {
5445 return propagateFloatx80NaN(a
, a
, status
);
5447 if ( ! aSign
) return a
;
5451 if ( ( aExp
| aSig0
) == 0 ) return a
;
5453 float_raise(float_flag_invalid
, status
);
5454 return floatx80_default_nan(status
);
5457 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
5458 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5460 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
5461 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
5462 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
5463 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5464 doubleZSig0
= zSig0
<<1;
5465 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5466 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5467 while ( (int64_t) rem0
< 0 ) {
5470 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5472 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5473 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5474 if ( zSig1
== 0 ) zSig1
= 1;
5475 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5476 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5477 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5478 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5479 while ( (int64_t) rem1
< 0 ) {
5481 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5483 term2
|= doubleZSig0
;
5484 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5486 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5488 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
5489 zSig0
|= doubleZSig0
;
5490 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5491 0, zExp
, zSig0
, zSig1
, status
);
5494 /*----------------------------------------------------------------------------
5495 | Returns 1 if the extended double-precision floating-point value `a' is equal
5496 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5497 | raised if either operand is a NaN. Otherwise, the comparison is performed
5498 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5499 *----------------------------------------------------------------------------*/
5501 int floatx80_eq(floatx80 a
, floatx80 b
, float_status
*status
)
5504 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5505 || (extractFloatx80Exp(a
) == 0x7FFF
5506 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5507 || (extractFloatx80Exp(b
) == 0x7FFF
5508 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5510 float_raise(float_flag_invalid
, status
);
5515 && ( ( a
.high
== b
.high
)
5517 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5522 /*----------------------------------------------------------------------------
5523 | Returns 1 if the extended double-precision floating-point value `a' is
5524 | less than or equal to the corresponding value `b', and 0 otherwise. The
5525 | invalid exception is raised if either operand is a NaN. The comparison is
5526 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5528 *----------------------------------------------------------------------------*/
5530 int floatx80_le(floatx80 a
, floatx80 b
, float_status
*status
)
5534 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5535 || (extractFloatx80Exp(a
) == 0x7FFF
5536 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5537 || (extractFloatx80Exp(b
) == 0x7FFF
5538 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5540 float_raise(float_flag_invalid
, status
);
5543 aSign
= extractFloatx80Sign( a
);
5544 bSign
= extractFloatx80Sign( b
);
5545 if ( aSign
!= bSign
) {
5548 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5552 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5553 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5557 /*----------------------------------------------------------------------------
5558 | Returns 1 if the extended double-precision floating-point value `a' is
5559 | less than the corresponding value `b', and 0 otherwise. The invalid
5560 | exception is raised if either operand is a NaN. The comparison is performed
5561 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5562 *----------------------------------------------------------------------------*/
5564 int floatx80_lt(floatx80 a
, floatx80 b
, float_status
*status
)
5568 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5569 || (extractFloatx80Exp(a
) == 0x7FFF
5570 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5571 || (extractFloatx80Exp(b
) == 0x7FFF
5572 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5574 float_raise(float_flag_invalid
, status
);
5577 aSign
= extractFloatx80Sign( a
);
5578 bSign
= extractFloatx80Sign( b
);
5579 if ( aSign
!= bSign
) {
5582 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5586 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5587 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5591 /*----------------------------------------------------------------------------
5592 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5593 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5594 | either operand is a NaN. The comparison is performed according to the
5595 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5596 *----------------------------------------------------------------------------*/
5597 int floatx80_unordered(floatx80 a
, floatx80 b
, float_status
*status
)
5599 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5600 || (extractFloatx80Exp(a
) == 0x7FFF
5601 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5602 || (extractFloatx80Exp(b
) == 0x7FFF
5603 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5605 float_raise(float_flag_invalid
, status
);
5611 /*----------------------------------------------------------------------------
5612 | Returns 1 if the extended double-precision floating-point value `a' is
5613 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5614 | cause an exception. The comparison is performed according to the IEC/IEEE
5615 | Standard for Binary Floating-Point Arithmetic.
5616 *----------------------------------------------------------------------------*/
5618 int floatx80_eq_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5621 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5622 float_raise(float_flag_invalid
, status
);
5625 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5626 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5627 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5628 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5630 if (floatx80_is_signaling_nan(a
, status
)
5631 || floatx80_is_signaling_nan(b
, status
)) {
5632 float_raise(float_flag_invalid
, status
);
5638 && ( ( a
.high
== b
.high
)
5640 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5645 /*----------------------------------------------------------------------------
5646 | Returns 1 if the extended double-precision floating-point value `a' is less
5647 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5648 | do not cause an exception. Otherwise, the comparison is performed according
5649 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5650 *----------------------------------------------------------------------------*/
5652 int floatx80_le_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5656 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5657 float_raise(float_flag_invalid
, status
);
5660 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5661 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5662 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5663 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5665 if (floatx80_is_signaling_nan(a
, status
)
5666 || floatx80_is_signaling_nan(b
, status
)) {
5667 float_raise(float_flag_invalid
, status
);
5671 aSign
= extractFloatx80Sign( a
);
5672 bSign
= extractFloatx80Sign( b
);
5673 if ( aSign
!= bSign
) {
5676 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5680 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5681 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5685 /*----------------------------------------------------------------------------
5686 | Returns 1 if the extended double-precision floating-point value `a' is less
5687 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5688 | an exception. Otherwise, the comparison is performed according to the
5689 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5690 *----------------------------------------------------------------------------*/
5692 int floatx80_lt_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5696 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5697 float_raise(float_flag_invalid
, status
);
5700 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5701 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5702 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5703 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5705 if (floatx80_is_signaling_nan(a
, status
)
5706 || floatx80_is_signaling_nan(b
, status
)) {
5707 float_raise(float_flag_invalid
, status
);
5711 aSign
= extractFloatx80Sign( a
);
5712 bSign
= extractFloatx80Sign( b
);
5713 if ( aSign
!= bSign
) {
5716 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5720 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5721 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5725 /*----------------------------------------------------------------------------
5726 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5727 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5728 | The comparison is performed according to the IEC/IEEE Standard for Binary
5729 | Floating-Point Arithmetic.
5730 *----------------------------------------------------------------------------*/
5731 int floatx80_unordered_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5733 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5734 float_raise(float_flag_invalid
, status
);
5737 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5738 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5739 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5740 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5742 if (floatx80_is_signaling_nan(a
, status
)
5743 || floatx80_is_signaling_nan(b
, status
)) {
5744 float_raise(float_flag_invalid
, status
);
5751 /*----------------------------------------------------------------------------
5752 | Returns the result of converting the quadruple-precision floating-point
5753 | value `a' to the 32-bit two's complement integer format. The conversion
5754 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5755 | Arithmetic---which means in particular that the conversion is rounded
5756 | according to the current rounding mode. If `a' is a NaN, the largest
5757 | positive integer is returned. Otherwise, if the conversion overflows, the
5758 | largest integer with the same sign as `a' is returned.
5759 *----------------------------------------------------------------------------*/
5761 int32_t float128_to_int32(float128 a
, float_status
*status
)
5764 int32_t aExp
, shiftCount
;
5765 uint64_t aSig0
, aSig1
;
5767 aSig1
= extractFloat128Frac1( a
);
5768 aSig0
= extractFloat128Frac0( a
);
5769 aExp
= extractFloat128Exp( a
);
5770 aSign
= extractFloat128Sign( a
);
5771 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
5772 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5773 aSig0
|= ( aSig1
!= 0 );
5774 shiftCount
= 0x4028 - aExp
;
5775 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
5776 return roundAndPackInt32(aSign
, aSig0
, status
);
5780 /*----------------------------------------------------------------------------
5781 | Returns the result of converting the quadruple-precision floating-point
5782 | value `a' to the 32-bit two's complement integer format. The conversion
5783 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5784 | Arithmetic, except that the conversion is always rounded toward zero. If
5785 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5786 | conversion overflows, the largest integer with the same sign as `a' is
5788 *----------------------------------------------------------------------------*/
5790 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
5793 int32_t aExp
, shiftCount
;
5794 uint64_t aSig0
, aSig1
, savedASig
;
5797 aSig1
= extractFloat128Frac1( a
);
5798 aSig0
= extractFloat128Frac0( a
);
5799 aExp
= extractFloat128Exp( a
);
5800 aSign
= extractFloat128Sign( a
);
5801 aSig0
|= ( aSig1
!= 0 );
5802 if ( 0x401E < aExp
) {
5803 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
5806 else if ( aExp
< 0x3FFF ) {
5807 if (aExp
|| aSig0
) {
5808 status
->float_exception_flags
|= float_flag_inexact
;
5812 aSig0
|= LIT64( 0x0001000000000000 );
5813 shiftCount
= 0x402F - aExp
;
5815 aSig0
>>= shiftCount
;
5817 if ( aSign
) z
= - z
;
5818 if ( ( z
< 0 ) ^ aSign
) {
5820 float_raise(float_flag_invalid
, status
);
5821 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5823 if ( ( aSig0
<<shiftCount
) != savedASig
) {
5824 status
->float_exception_flags
|= float_flag_inexact
;
5830 /*----------------------------------------------------------------------------
5831 | Returns the result of converting the quadruple-precision floating-point
5832 | value `a' to the 64-bit two's complement integer format. The conversion
5833 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5834 | Arithmetic---which means in particular that the conversion is rounded
5835 | according to the current rounding mode. If `a' is a NaN, the largest
5836 | positive integer is returned. Otherwise, if the conversion overflows, the
5837 | largest integer with the same sign as `a' is returned.
5838 *----------------------------------------------------------------------------*/
5840 int64_t float128_to_int64(float128 a
, float_status
*status
)
5843 int32_t aExp
, shiftCount
;
5844 uint64_t aSig0
, aSig1
;
5846 aSig1
= extractFloat128Frac1( a
);
5847 aSig0
= extractFloat128Frac0( a
);
5848 aExp
= extractFloat128Exp( a
);
5849 aSign
= extractFloat128Sign( a
);
5850 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5851 shiftCount
= 0x402F - aExp
;
5852 if ( shiftCount
<= 0 ) {
5853 if ( 0x403E < aExp
) {
5854 float_raise(float_flag_invalid
, status
);
5856 || ( ( aExp
== 0x7FFF )
5857 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
5860 return LIT64( 0x7FFFFFFFFFFFFFFF );
5862 return (int64_t) LIT64( 0x8000000000000000 );
5864 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
5867 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5869 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
5873 /*----------------------------------------------------------------------------
5874 | Returns the result of converting the quadruple-precision floating-point
5875 | value `a' to the 64-bit two's complement integer format. The conversion
5876 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5877 | Arithmetic, except that the conversion is always rounded toward zero.
5878 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5879 | the conversion overflows, the largest integer with the same sign as `a' is
5881 *----------------------------------------------------------------------------*/
5883 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
5886 int32_t aExp
, shiftCount
;
5887 uint64_t aSig0
, aSig1
;
5890 aSig1
= extractFloat128Frac1( a
);
5891 aSig0
= extractFloat128Frac0( a
);
5892 aExp
= extractFloat128Exp( a
);
5893 aSign
= extractFloat128Sign( a
);
5894 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5895 shiftCount
= aExp
- 0x402F;
5896 if ( 0 < shiftCount
) {
5897 if ( 0x403E <= aExp
) {
5898 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
5899 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
5900 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
5902 status
->float_exception_flags
|= float_flag_inexact
;
5906 float_raise(float_flag_invalid
, status
);
5907 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
5908 return LIT64( 0x7FFFFFFFFFFFFFFF );
5911 return (int64_t) LIT64( 0x8000000000000000 );
5913 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
5914 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
5915 status
->float_exception_flags
|= float_flag_inexact
;
5919 if ( aExp
< 0x3FFF ) {
5920 if ( aExp
| aSig0
| aSig1
) {
5921 status
->float_exception_flags
|= float_flag_inexact
;
5925 z
= aSig0
>>( - shiftCount
);
5927 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
5928 status
->float_exception_flags
|= float_flag_inexact
;
5931 if ( aSign
) z
= - z
;
5936 /*----------------------------------------------------------------------------
5937 | Returns the result of converting the quadruple-precision floating-point value
5938 | `a' to the 64-bit unsigned integer format. The conversion is
5939 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5940 | Arithmetic---which means in particular that the conversion is rounded
5941 | according to the current rounding mode. If `a' is a NaN, the largest
5942 | positive integer is returned. If the conversion overflows, the
5943 | largest unsigned integer is returned. If 'a' is negative, the value is
5944 | rounded and zero is returned; negative values that do not round to zero
5945 | will raise the inexact exception.
5946 *----------------------------------------------------------------------------*/
5948 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
5953 uint64_t aSig0
, aSig1
;
5955 aSig0
= extractFloat128Frac0(a
);
5956 aSig1
= extractFloat128Frac1(a
);
5957 aExp
= extractFloat128Exp(a
);
5958 aSign
= extractFloat128Sign(a
);
5959 if (aSign
&& (aExp
> 0x3FFE)) {
5960 float_raise(float_flag_invalid
, status
);
5961 if (float128_is_any_nan(a
)) {
5962 return LIT64(0xFFFFFFFFFFFFFFFF);
5968 aSig0
|= LIT64(0x0001000000000000);
5970 shiftCount
= 0x402F - aExp
;
5971 if (shiftCount
<= 0) {
5972 if (0x403E < aExp
) {
5973 float_raise(float_flag_invalid
, status
);
5974 return LIT64(0xFFFFFFFFFFFFFFFF);
5976 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
5978 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5980 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
5983 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
5986 signed char current_rounding_mode
= status
->float_rounding_mode
;
5988 set_float_rounding_mode(float_round_to_zero
, status
);
5989 v
= float128_to_uint64(a
, status
);
5990 set_float_rounding_mode(current_rounding_mode
, status
);
5995 /*----------------------------------------------------------------------------
5996 | Returns the result of converting the quadruple-precision floating-point
5997 | value `a' to the 32-bit unsigned integer format. The conversion
5998 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5999 | Arithmetic except that the conversion is always rounded toward zero.
6000 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6001 | if the conversion overflows, the largest unsigned integer is returned.
6002 | If 'a' is negative, the value is rounded and zero is returned; negative
6003 | values that do not round to zero will raise the inexact exception.
6004 *----------------------------------------------------------------------------*/
6006 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6010 int old_exc_flags
= get_float_exception_flags(status
);
6012 v
= float128_to_uint64_round_to_zero(a
, status
);
6013 if (v
> 0xffffffff) {
6018 set_float_exception_flags(old_exc_flags
, status
);
6019 float_raise(float_flag_invalid
, status
);
6023 /*----------------------------------------------------------------------------
6024 | Returns the result of converting the quadruple-precision floating-point
6025 | value `a' to the single-precision floating-point format. The conversion
6026 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6028 *----------------------------------------------------------------------------*/
6030 float32
float128_to_float32(float128 a
, float_status
*status
)
6034 uint64_t aSig0
, aSig1
;
6037 aSig1
= extractFloat128Frac1( a
);
6038 aSig0
= extractFloat128Frac0( a
);
6039 aExp
= extractFloat128Exp( a
);
6040 aSign
= extractFloat128Sign( a
);
6041 if ( aExp
== 0x7FFF ) {
6042 if ( aSig0
| aSig1
) {
6043 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6045 return packFloat32( aSign
, 0xFF, 0 );
6047 aSig0
|= ( aSig1
!= 0 );
6048 shift64RightJamming( aSig0
, 18, &aSig0
);
6050 if ( aExp
|| zSig
) {
6054 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6058 /*----------------------------------------------------------------------------
6059 | Returns the result of converting the quadruple-precision floating-point
6060 | value `a' to the double-precision floating-point format. The conversion
6061 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6063 *----------------------------------------------------------------------------*/
6065 float64
float128_to_float64(float128 a
, float_status
*status
)
6069 uint64_t aSig0
, aSig1
;
6071 aSig1
= extractFloat128Frac1( a
);
6072 aSig0
= extractFloat128Frac0( a
);
6073 aExp
= extractFloat128Exp( a
);
6074 aSign
= extractFloat128Sign( a
);
6075 if ( aExp
== 0x7FFF ) {
6076 if ( aSig0
| aSig1
) {
6077 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6079 return packFloat64( aSign
, 0x7FF, 0 );
6081 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6082 aSig0
|= ( aSig1
!= 0 );
6083 if ( aExp
|| aSig0
) {
6084 aSig0
|= LIT64( 0x4000000000000000 );
6087 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6091 /*----------------------------------------------------------------------------
6092 | Returns the result of converting the quadruple-precision floating-point
6093 | value `a' to the extended double-precision floating-point format. The
6094 | conversion is performed according to the IEC/IEEE Standard for Binary
6095 | Floating-Point Arithmetic.
6096 *----------------------------------------------------------------------------*/
6098 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6102 uint64_t aSig0
, aSig1
;
6104 aSig1
= extractFloat128Frac1( a
);
6105 aSig0
= extractFloat128Frac0( a
);
6106 aExp
= extractFloat128Exp( a
);
6107 aSign
= extractFloat128Sign( a
);
6108 if ( aExp
== 0x7FFF ) {
6109 if ( aSig0
| aSig1
) {
6110 return commonNaNToFloatx80(float128ToCommonNaN(a
, status
), status
);
6112 return packFloatx80(aSign
, floatx80_infinity_high
,
6113 floatx80_infinity_low
);
6116 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6117 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6120 aSig0
|= LIT64( 0x0001000000000000 );
6122 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6123 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6127 /*----------------------------------------------------------------------------
6128 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6129 | returns the result as a quadruple-precision floating-point value. The
6130 | operation is performed according to the IEC/IEEE Standard for Binary
6131 | Floating-Point Arithmetic.
6132 *----------------------------------------------------------------------------*/
6134 float128
float128_round_to_int(float128 a
, float_status
*status
)
6138 uint64_t lastBitMask
, roundBitsMask
;
6141 aExp
= extractFloat128Exp( a
);
6142 if ( 0x402F <= aExp
) {
6143 if ( 0x406F <= aExp
) {
6144 if ( ( aExp
== 0x7FFF )
6145 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6147 return propagateFloat128NaN(a
, a
, status
);
6152 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6153 roundBitsMask
= lastBitMask
- 1;
6155 switch (status
->float_rounding_mode
) {
6156 case float_round_nearest_even
:
6157 if ( lastBitMask
) {
6158 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6159 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6162 if ( (int64_t) z
.low
< 0 ) {
6164 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6168 case float_round_ties_away
:
6170 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6172 if ((int64_t) z
.low
< 0) {
6177 case float_round_to_zero
:
6179 case float_round_up
:
6180 if (!extractFloat128Sign(z
)) {
6181 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6184 case float_round_down
:
6185 if (extractFloat128Sign(z
)) {
6186 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6192 z
.low
&= ~ roundBitsMask
;
6195 if ( aExp
< 0x3FFF ) {
6196 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6197 status
->float_exception_flags
|= float_flag_inexact
;
6198 aSign
= extractFloat128Sign( a
);
6199 switch (status
->float_rounding_mode
) {
6200 case float_round_nearest_even
:
6201 if ( ( aExp
== 0x3FFE )
6202 && ( extractFloat128Frac0( a
)
6203 | extractFloat128Frac1( a
) )
6205 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6208 case float_round_ties_away
:
6209 if (aExp
== 0x3FFE) {
6210 return packFloat128(aSign
, 0x3FFF, 0, 0);
6213 case float_round_down
:
6215 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6216 : packFloat128( 0, 0, 0, 0 );
6217 case float_round_up
:
6219 aSign
? packFloat128( 1, 0, 0, 0 )
6220 : packFloat128( 0, 0x3FFF, 0, 0 );
6222 return packFloat128( aSign
, 0, 0, 0 );
6225 lastBitMask
<<= 0x402F - aExp
;
6226 roundBitsMask
= lastBitMask
- 1;
6229 switch (status
->float_rounding_mode
) {
6230 case float_round_nearest_even
:
6231 z
.high
+= lastBitMask
>>1;
6232 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6233 z
.high
&= ~ lastBitMask
;
6236 case float_round_ties_away
:
6237 z
.high
+= lastBitMask
>>1;
6239 case float_round_to_zero
:
6241 case float_round_up
:
6242 if (!extractFloat128Sign(z
)) {
6243 z
.high
|= ( a
.low
!= 0 );
6244 z
.high
+= roundBitsMask
;
6247 case float_round_down
:
6248 if (extractFloat128Sign(z
)) {
6249 z
.high
|= (a
.low
!= 0);
6250 z
.high
+= roundBitsMask
;
6256 z
.high
&= ~ roundBitsMask
;
6258 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6259 status
->float_exception_flags
|= float_flag_inexact
;
6265 /*----------------------------------------------------------------------------
6266 | Returns the result of adding the absolute values of the quadruple-precision
6267 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6268 | before being returned. `zSign' is ignored if the result is a NaN.
6269 | The addition is performed according to the IEC/IEEE Standard for Binary
6270 | Floating-Point Arithmetic.
6271 *----------------------------------------------------------------------------*/
6273 static float128
addFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6274 float_status
*status
)
6276 int32_t aExp
, bExp
, zExp
;
6277 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6280 aSig1
= extractFloat128Frac1( a
);
6281 aSig0
= extractFloat128Frac0( a
);
6282 aExp
= extractFloat128Exp( a
);
6283 bSig1
= extractFloat128Frac1( b
);
6284 bSig0
= extractFloat128Frac0( b
);
6285 bExp
= extractFloat128Exp( b
);
6286 expDiff
= aExp
- bExp
;
6287 if ( 0 < expDiff
) {
6288 if ( aExp
== 0x7FFF ) {
6289 if (aSig0
| aSig1
) {
6290 return propagateFloat128NaN(a
, b
, status
);
6298 bSig0
|= LIT64( 0x0001000000000000 );
6300 shift128ExtraRightJamming(
6301 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6304 else if ( expDiff
< 0 ) {
6305 if ( bExp
== 0x7FFF ) {
6306 if (bSig0
| bSig1
) {
6307 return propagateFloat128NaN(a
, b
, status
);
6309 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6315 aSig0
|= LIT64( 0x0001000000000000 );
6317 shift128ExtraRightJamming(
6318 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6322 if ( aExp
== 0x7FFF ) {
6323 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6324 return propagateFloat128NaN(a
, b
, status
);
6328 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6330 if (status
->flush_to_zero
) {
6331 if (zSig0
| zSig1
) {
6332 float_raise(float_flag_output_denormal
, status
);
6334 return packFloat128(zSign
, 0, 0, 0);
6336 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6339 zSig0
|= LIT64( 0x0002000000000000 );
6343 aSig0
|= LIT64( 0x0001000000000000 );
6344 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6346 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
6349 shift128ExtraRightJamming(
6350 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6352 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6356 /*----------------------------------------------------------------------------
6357 | Returns the result of subtracting the absolute values of the quadruple-
6358 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6359 | difference is negated before being returned. `zSign' is ignored if the
6360 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6361 | Standard for Binary Floating-Point Arithmetic.
6362 *----------------------------------------------------------------------------*/
6364 static float128
subFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6365 float_status
*status
)
6367 int32_t aExp
, bExp
, zExp
;
6368 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6371 aSig1
= extractFloat128Frac1( a
);
6372 aSig0
= extractFloat128Frac0( a
);
6373 aExp
= extractFloat128Exp( a
);
6374 bSig1
= extractFloat128Frac1( b
);
6375 bSig0
= extractFloat128Frac0( b
);
6376 bExp
= extractFloat128Exp( b
);
6377 expDiff
= aExp
- bExp
;
6378 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6379 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6380 if ( 0 < expDiff
) goto aExpBigger
;
6381 if ( expDiff
< 0 ) goto bExpBigger
;
6382 if ( aExp
== 0x7FFF ) {
6383 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6384 return propagateFloat128NaN(a
, b
, status
);
6386 float_raise(float_flag_invalid
, status
);
6387 return float128_default_nan(status
);
6393 if ( bSig0
< aSig0
) goto aBigger
;
6394 if ( aSig0
< bSig0
) goto bBigger
;
6395 if ( bSig1
< aSig1
) goto aBigger
;
6396 if ( aSig1
< bSig1
) goto bBigger
;
6397 return packFloat128(status
->float_rounding_mode
== float_round_down
,
6400 if ( bExp
== 0x7FFF ) {
6401 if (bSig0
| bSig1
) {
6402 return propagateFloat128NaN(a
, b
, status
);
6404 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
6410 aSig0
|= LIT64( 0x4000000000000000 );
6412 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6413 bSig0
|= LIT64( 0x4000000000000000 );
6415 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6418 goto normalizeRoundAndPack
;
6420 if ( aExp
== 0x7FFF ) {
6421 if (aSig0
| aSig1
) {
6422 return propagateFloat128NaN(a
, b
, status
);
6430 bSig0
|= LIT64( 0x4000000000000000 );
6432 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
6433 aSig0
|= LIT64( 0x4000000000000000 );
6435 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6437 normalizeRoundAndPack
:
6439 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
6444 /*----------------------------------------------------------------------------
6445 | Returns the result of adding the quadruple-precision floating-point values
6446 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6447 | for Binary Floating-Point Arithmetic.
6448 *----------------------------------------------------------------------------*/
6450 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
6454 aSign
= extractFloat128Sign( a
);
6455 bSign
= extractFloat128Sign( b
);
6456 if ( aSign
== bSign
) {
6457 return addFloat128Sigs(a
, b
, aSign
, status
);
6460 return subFloat128Sigs(a
, b
, aSign
, status
);
6465 /*----------------------------------------------------------------------------
6466 | Returns the result of subtracting the quadruple-precision floating-point
6467 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6468 | Standard for Binary Floating-Point Arithmetic.
6469 *----------------------------------------------------------------------------*/
6471 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
6475 aSign
= extractFloat128Sign( a
);
6476 bSign
= extractFloat128Sign( b
);
6477 if ( aSign
== bSign
) {
6478 return subFloat128Sigs(a
, b
, aSign
, status
);
6481 return addFloat128Sigs(a
, b
, aSign
, status
);
6486 /*----------------------------------------------------------------------------
6487 | Returns the result of multiplying the quadruple-precision floating-point
6488 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6489 | Standard for Binary Floating-Point Arithmetic.
6490 *----------------------------------------------------------------------------*/
6492 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
6494 flag aSign
, bSign
, zSign
;
6495 int32_t aExp
, bExp
, zExp
;
6496 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
6498 aSig1
= extractFloat128Frac1( a
);
6499 aSig0
= extractFloat128Frac0( a
);
6500 aExp
= extractFloat128Exp( a
);
6501 aSign
= extractFloat128Sign( a
);
6502 bSig1
= extractFloat128Frac1( b
);
6503 bSig0
= extractFloat128Frac0( b
);
6504 bExp
= extractFloat128Exp( b
);
6505 bSign
= extractFloat128Sign( b
);
6506 zSign
= aSign
^ bSign
;
6507 if ( aExp
== 0x7FFF ) {
6508 if ( ( aSig0
| aSig1
)
6509 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6510 return propagateFloat128NaN(a
, b
, status
);
6512 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
6513 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6515 if ( bExp
== 0x7FFF ) {
6516 if (bSig0
| bSig1
) {
6517 return propagateFloat128NaN(a
, b
, status
);
6519 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6521 float_raise(float_flag_invalid
, status
);
6522 return float128_default_nan(status
);
6524 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6527 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6528 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6531 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6532 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6534 zExp
= aExp
+ bExp
- 0x4000;
6535 aSig0
|= LIT64( 0x0001000000000000 );
6536 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
6537 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
6538 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6539 zSig2
|= ( zSig3
!= 0 );
6540 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
6541 shift128ExtraRightJamming(
6542 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6545 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6549 /*----------------------------------------------------------------------------
6550 | Returns the result of dividing the quadruple-precision floating-point value
6551 | `a' by the corresponding value `b'. The operation is performed according to
6552 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6553 *----------------------------------------------------------------------------*/
6555 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
6557 flag aSign
, bSign
, zSign
;
6558 int32_t aExp
, bExp
, zExp
;
6559 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6560 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6562 aSig1
= extractFloat128Frac1( a
);
6563 aSig0
= extractFloat128Frac0( a
);
6564 aExp
= extractFloat128Exp( a
);
6565 aSign
= extractFloat128Sign( a
);
6566 bSig1
= extractFloat128Frac1( b
);
6567 bSig0
= extractFloat128Frac0( b
);
6568 bExp
= extractFloat128Exp( b
);
6569 bSign
= extractFloat128Sign( b
);
6570 zSign
= aSign
^ bSign
;
6571 if ( aExp
== 0x7FFF ) {
6572 if (aSig0
| aSig1
) {
6573 return propagateFloat128NaN(a
, b
, status
);
6575 if ( bExp
== 0x7FFF ) {
6576 if (bSig0
| bSig1
) {
6577 return propagateFloat128NaN(a
, b
, status
);
6581 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6583 if ( bExp
== 0x7FFF ) {
6584 if (bSig0
| bSig1
) {
6585 return propagateFloat128NaN(a
, b
, status
);
6587 return packFloat128( zSign
, 0, 0, 0 );
6590 if ( ( bSig0
| bSig1
) == 0 ) {
6591 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6593 float_raise(float_flag_invalid
, status
);
6594 return float128_default_nan(status
);
6596 float_raise(float_flag_divbyzero
, status
);
6597 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6599 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6602 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6603 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6605 zExp
= aExp
- bExp
+ 0x3FFD;
6607 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
6609 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6610 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
6611 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
6614 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6615 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
6616 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
6617 while ( (int64_t) rem0
< 0 ) {
6619 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
6621 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
6622 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
6623 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
6624 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
6625 while ( (int64_t) rem1
< 0 ) {
6627 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
6629 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6631 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
6632 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6636 /*----------------------------------------------------------------------------
6637 | Returns the remainder of the quadruple-precision floating-point value `a'
6638 | with respect to the corresponding value `b'. The operation is performed
6639 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6640 *----------------------------------------------------------------------------*/
6642 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
6645 int32_t aExp
, bExp
, expDiff
;
6646 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6647 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6650 aSig1
= extractFloat128Frac1( a
);
6651 aSig0
= extractFloat128Frac0( a
);
6652 aExp
= extractFloat128Exp( a
);
6653 aSign
= extractFloat128Sign( a
);
6654 bSig1
= extractFloat128Frac1( b
);
6655 bSig0
= extractFloat128Frac0( b
);
6656 bExp
= extractFloat128Exp( b
);
6657 if ( aExp
== 0x7FFF ) {
6658 if ( ( aSig0
| aSig1
)
6659 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6660 return propagateFloat128NaN(a
, b
, status
);
6664 if ( bExp
== 0x7FFF ) {
6665 if (bSig0
| bSig1
) {
6666 return propagateFloat128NaN(a
, b
, status
);
6671 if ( ( bSig0
| bSig1
) == 0 ) {
6673 float_raise(float_flag_invalid
, status
);
6674 return float128_default_nan(status
);
6676 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6679 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6680 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6682 expDiff
= aExp
- bExp
;
6683 if ( expDiff
< -1 ) return a
;
6685 aSig0
| LIT64( 0x0001000000000000 ),
6687 15 - ( expDiff
< 0 ),
6692 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6693 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6694 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6696 while ( 0 < expDiff
) {
6697 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6698 q
= ( 4 < q
) ? q
- 4 : 0;
6699 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6700 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6701 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6702 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6705 if ( -64 < expDiff
) {
6706 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6707 q
= ( 4 < q
) ? q
- 4 : 0;
6709 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6711 if ( expDiff
< 0 ) {
6712 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6715 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6717 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6718 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6721 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6722 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6725 alternateASig0
= aSig0
;
6726 alternateASig1
= aSig1
;
6728 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6729 } while ( 0 <= (int64_t) aSig0
);
6731 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6732 if ( ( sigMean0
< 0 )
6733 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6734 aSig0
= alternateASig0
;
6735 aSig1
= alternateASig1
;
6737 zSign
= ( (int64_t) aSig0
< 0 );
6738 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6739 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
6743 /*----------------------------------------------------------------------------
6744 | Returns the square root of the quadruple-precision floating-point value `a'.
6745 | The operation is performed according to the IEC/IEEE Standard for Binary
6746 | Floating-Point Arithmetic.
6747 *----------------------------------------------------------------------------*/
6749 float128
float128_sqrt(float128 a
, float_status
*status
)
6753 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6754 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6756 aSig1
= extractFloat128Frac1( a
);
6757 aSig0
= extractFloat128Frac0( a
);
6758 aExp
= extractFloat128Exp( a
);
6759 aSign
= extractFloat128Sign( a
);
6760 if ( aExp
== 0x7FFF ) {
6761 if (aSig0
| aSig1
) {
6762 return propagateFloat128NaN(a
, a
, status
);
6764 if ( ! aSign
) return a
;
6768 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6770 float_raise(float_flag_invalid
, status
);
6771 return float128_default_nan(status
);
6774 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6775 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6777 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6778 aSig0
|= LIT64( 0x0001000000000000 );
6779 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6780 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6781 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6782 doubleZSig0
= zSig0
<<1;
6783 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6784 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6785 while ( (int64_t) rem0
< 0 ) {
6788 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6790 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6791 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
6792 if ( zSig1
== 0 ) zSig1
= 1;
6793 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6794 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6795 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6796 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6797 while ( (int64_t) rem1
< 0 ) {
6799 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6801 term2
|= doubleZSig0
;
6802 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6804 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6806 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
6807 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
6811 /*----------------------------------------------------------------------------
6812 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6813 | the corresponding value `b', and 0 otherwise. The invalid exception is
6814 | raised if either operand is a NaN. Otherwise, the comparison is performed
6815 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6816 *----------------------------------------------------------------------------*/
6818 int float128_eq(float128 a
, float128 b
, float_status
*status
)
6821 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6822 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6823 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6824 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6826 float_raise(float_flag_invalid
, status
);
6831 && ( ( a
.high
== b
.high
)
6833 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6838 /*----------------------------------------------------------------------------
6839 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6840 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6841 | exception is raised if either operand is a NaN. The comparison is performed
6842 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6843 *----------------------------------------------------------------------------*/
6845 int float128_le(float128 a
, float128 b
, float_status
*status
)
6849 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6850 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6851 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6852 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6854 float_raise(float_flag_invalid
, status
);
6857 aSign
= extractFloat128Sign( a
);
6858 bSign
= extractFloat128Sign( b
);
6859 if ( aSign
!= bSign
) {
6862 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6866 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6867 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6871 /*----------------------------------------------------------------------------
6872 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6873 | the corresponding value `b', and 0 otherwise. The invalid exception is
6874 | raised if either operand is a NaN. The comparison is performed according
6875 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6876 *----------------------------------------------------------------------------*/
6878 int float128_lt(float128 a
, float128 b
, float_status
*status
)
6882 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6883 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6884 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6885 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6887 float_raise(float_flag_invalid
, status
);
6890 aSign
= extractFloat128Sign( a
);
6891 bSign
= extractFloat128Sign( b
);
6892 if ( aSign
!= bSign
) {
6895 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6899 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6900 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6904 /*----------------------------------------------------------------------------
6905 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6906 | be compared, and 0 otherwise. The invalid exception is raised if either
6907 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6908 | Standard for Binary Floating-Point Arithmetic.
6909 *----------------------------------------------------------------------------*/
6911 int float128_unordered(float128 a
, float128 b
, float_status
*status
)
6913 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6914 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6915 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6916 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6918 float_raise(float_flag_invalid
, status
);
6924 /*----------------------------------------------------------------------------
6925 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6926 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6927 | exception. The comparison is performed according to the IEC/IEEE Standard
6928 | for Binary Floating-Point Arithmetic.
6929 *----------------------------------------------------------------------------*/
6931 int float128_eq_quiet(float128 a
, float128 b
, float_status
*status
)
6934 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6935 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6936 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6937 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6939 if (float128_is_signaling_nan(a
, status
)
6940 || float128_is_signaling_nan(b
, status
)) {
6941 float_raise(float_flag_invalid
, status
);
6947 && ( ( a
.high
== b
.high
)
6949 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6954 /*----------------------------------------------------------------------------
6955 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6956 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6957 | cause an exception. Otherwise, the comparison is performed according to the
6958 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6959 *----------------------------------------------------------------------------*/
6961 int float128_le_quiet(float128 a
, float128 b
, float_status
*status
)
6965 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6966 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6967 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6968 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6970 if (float128_is_signaling_nan(a
, status
)
6971 || float128_is_signaling_nan(b
, status
)) {
6972 float_raise(float_flag_invalid
, status
);
6976 aSign
= extractFloat128Sign( a
);
6977 bSign
= extractFloat128Sign( b
);
6978 if ( aSign
!= bSign
) {
6981 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6985 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6986 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6990 /*----------------------------------------------------------------------------
6991 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6992 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6993 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6994 | Standard for Binary Floating-Point Arithmetic.
6995 *----------------------------------------------------------------------------*/
6997 int float128_lt_quiet(float128 a
, float128 b
, float_status
*status
)
7001 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7002 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7003 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7004 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7006 if (float128_is_signaling_nan(a
, status
)
7007 || float128_is_signaling_nan(b
, status
)) {
7008 float_raise(float_flag_invalid
, status
);
7012 aSign
= extractFloat128Sign( a
);
7013 bSign
= extractFloat128Sign( b
);
7014 if ( aSign
!= bSign
) {
7017 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7021 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
7022 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
7026 /*----------------------------------------------------------------------------
7027 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7028 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7029 | comparison is performed according to the IEC/IEEE Standard for Binary
7030 | Floating-Point Arithmetic.
7031 *----------------------------------------------------------------------------*/
7033 int float128_unordered_quiet(float128 a
, float128 b
, float_status
*status
)
7035 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7036 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7037 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7038 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7040 if (float128_is_signaling_nan(a
, status
)
7041 || float128_is_signaling_nan(b
, status
)) {
7042 float_raise(float_flag_invalid
, status
);
7049 static inline int floatx80_compare_internal(floatx80 a
, floatx80 b
,
7050 int is_quiet
, float_status
*status
)
7054 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7055 float_raise(float_flag_invalid
, status
);
7056 return float_relation_unordered
;
7058 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7059 ( extractFloatx80Frac( a
)<<1 ) ) ||
7060 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7061 ( extractFloatx80Frac( b
)<<1 ) )) {
7063 floatx80_is_signaling_nan(a
, status
) ||
7064 floatx80_is_signaling_nan(b
, status
)) {
7065 float_raise(float_flag_invalid
, status
);
7067 return float_relation_unordered
;
7069 aSign
= extractFloatx80Sign( a
);
7070 bSign
= extractFloatx80Sign( b
);
7071 if ( aSign
!= bSign
) {
7073 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7074 ( ( a
.low
| b
.low
) == 0 ) ) {
7076 return float_relation_equal
;
7078 return 1 - (2 * aSign
);
7081 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7082 return float_relation_equal
;
7084 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7089 int floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7091 return floatx80_compare_internal(a
, b
, 0, status
);
7094 int floatx80_compare_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
7096 return floatx80_compare_internal(a
, b
, 1, status
);
7099 static inline int float128_compare_internal(float128 a
, float128 b
,
7100 int is_quiet
, float_status
*status
)
7104 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7105 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7106 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7107 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7109 float128_is_signaling_nan(a
, status
) ||
7110 float128_is_signaling_nan(b
, status
)) {
7111 float_raise(float_flag_invalid
, status
);
7113 return float_relation_unordered
;
7115 aSign
= extractFloat128Sign( a
);
7116 bSign
= extractFloat128Sign( b
);
7117 if ( aSign
!= bSign
) {
7118 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7120 return float_relation_equal
;
7122 return 1 - (2 * aSign
);
7125 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7126 return float_relation_equal
;
7128 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7133 int float128_compare(float128 a
, float128 b
, float_status
*status
)
7135 return float128_compare_internal(a
, b
, 0, status
);
7138 int float128_compare_quiet(float128 a
, float128 b
, float_status
*status
)
7140 return float128_compare_internal(a
, b
, 1, status
);
7143 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7149 if (floatx80_invalid_encoding(a
)) {
7150 float_raise(float_flag_invalid
, status
);
7151 return floatx80_default_nan(status
);
7153 aSig
= extractFloatx80Frac( a
);
7154 aExp
= extractFloatx80Exp( a
);
7155 aSign
= extractFloatx80Sign( a
);
7157 if ( aExp
== 0x7FFF ) {
7159 return propagateFloatx80NaN(a
, a
, status
);
7173 } else if (n
< -0x10000) {
7178 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7179 aSign
, aExp
, aSig
, 0, status
);
7182 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7186 uint64_t aSig0
, aSig1
;
7188 aSig1
= extractFloat128Frac1( a
);
7189 aSig0
= extractFloat128Frac0( a
);
7190 aExp
= extractFloat128Exp( a
);
7191 aSign
= extractFloat128Sign( a
);
7192 if ( aExp
== 0x7FFF ) {
7193 if ( aSig0
| aSig1
) {
7194 return propagateFloat128NaN(a
, a
, status
);
7199 aSig0
|= LIT64( 0x0001000000000000 );
7200 } else if (aSig0
== 0 && aSig1
== 0) {
7208 } else if (n
< -0x10000) {
7213 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1