4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Returns the fraction bits of the half-precision floating-point value `a'.
100 *----------------------------------------------------------------------------*/
102 static inline uint32_t extractFloat16Frac(float16 a
)
104 return float16_val(a
) & 0x3ff;
107 /*----------------------------------------------------------------------------
108 | Returns the exponent bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
111 static inline int extractFloat16Exp(float16 a
)
113 return (float16_val(a
) >> 10) & 0x1f;
116 /*----------------------------------------------------------------------------
117 | Returns the fraction bits of the single-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
120 static inline uint32_t extractFloat32Frac(float32 a
)
122 return float32_val(a
) & 0x007FFFFF;
125 /*----------------------------------------------------------------------------
126 | Returns the exponent bits of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
129 static inline int extractFloat32Exp(float32 a
)
131 return (float32_val(a
) >> 23) & 0xFF;
134 /*----------------------------------------------------------------------------
135 | Returns the sign bit of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
138 static inline flag
extractFloat32Sign(float32 a
)
140 return float32_val(a
) >> 31;
143 /*----------------------------------------------------------------------------
144 | Returns the fraction bits of the double-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
147 static inline uint64_t extractFloat64Frac(float64 a
)
149 return float64_val(a
) & LIT64(0x000FFFFFFFFFFFFF);
152 /*----------------------------------------------------------------------------
153 | Returns the exponent bits of the double-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
156 static inline int extractFloat64Exp(float64 a
)
158 return (float64_val(a
) >> 52) & 0x7FF;
161 /*----------------------------------------------------------------------------
162 | Returns the sign bit of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
165 static inline flag
extractFloat64Sign(float64 a
)
167 return float64_val(a
) >> 63;
171 * Classify a floating point number. Everything above float_class_qnan
172 * is a NaN so cls >= float_class_qnan is any NaN.
175 typedef enum __attribute__ ((__packed__
)) {
176 float_class_unclassified
,
180 float_class_qnan
, /* all NaNs from here */
184 /* Simple helpers for checking if, or what kind of, NaN we have */
185 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
187 return unlikely(c
>= float_class_qnan
);
190 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
192 return c
== float_class_snan
;
195 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
197 return c
== float_class_qnan
;
201 * Structure holding all of the decomposed parts of a float. The
202 * exponent is unbiased and the fraction is normalized. All
203 * calculations are done with a 64 bit fraction and then rounded as
204 * appropriate for the final format.
206 * Thanks to the packed FloatClass a decent compiler should be able to
207 * fit the whole structure into registers and avoid using the stack
208 * for parameter passing.
218 #define DECOMPOSED_BINARY_POINT (64 - 2)
219 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
220 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
222 /* Structure holding all of the relevant parameters for a format.
223 * exp_size: the size of the exponent field
224 * exp_bias: the offset applied to the exponent field
225 * exp_max: the maximum normalised exponent
226 * frac_size: the size of the fraction field
227 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
228 * The following are computed based the size of fraction
229 * frac_lsb: least significant bit of fraction
230 * frac_lsbm1: the bit below the least significant bit (for rounding)
231 * round_mask/roundeven_mask: masks used for rounding
232 * The following optional modifiers are available:
233 * arm_althp: handle ARM Alternative Half Precision
244 uint64_t roundeven_mask
;
248 /* Expand fields based on the size of exponent and fraction */
249 #define FLOAT_PARAMS(E, F) \
251 .exp_bias = ((1 << E) - 1) >> 1, \
252 .exp_max = (1 << E) - 1, \
254 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
255 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
256 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
257 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
258 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
260 static const FloatFmt float16_params
= {
264 static const FloatFmt float16_params_ahp
= {
269 static const FloatFmt float32_params
= {
273 static const FloatFmt float64_params
= {
277 /* Unpack a float to parts, but do not canonicalize. */
278 static inline FloatParts
unpack_raw(FloatFmt fmt
, uint64_t raw
)
280 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
282 return (FloatParts
) {
283 .cls
= float_class_unclassified
,
284 .sign
= extract64(raw
, sign_pos
, 1),
285 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
286 .frac
= extract64(raw
, 0, fmt
.frac_size
),
290 static inline FloatParts
float16_unpack_raw(float16 f
)
292 return unpack_raw(float16_params
, f
);
295 static inline FloatParts
float32_unpack_raw(float32 f
)
297 return unpack_raw(float32_params
, f
);
300 static inline FloatParts
float64_unpack_raw(float64 f
)
302 return unpack_raw(float64_params
, f
);
305 /* Pack a float from parts, but do not canonicalize. */
306 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts p
)
308 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
309 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
310 return deposit64(ret
, sign_pos
, 1, p
.sign
);
313 static inline float16
float16_pack_raw(FloatParts p
)
315 return make_float16(pack_raw(float16_params
, p
));
318 static inline float32
float32_pack_raw(FloatParts p
)
320 return make_float32(pack_raw(float32_params
, p
));
323 static inline float64
float64_pack_raw(FloatParts p
)
325 return make_float64(pack_raw(float64_params
, p
));
328 /*----------------------------------------------------------------------------
329 | Functions and definitions to determine: (1) whether tininess for underflow
330 | is detected before or after rounding by default, (2) what (if anything)
331 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
332 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
333 | are propagated from function inputs to output. These details are target-
335 *----------------------------------------------------------------------------*/
336 #include "softfloat-specialize.h"
338 /* Canonicalize EXP and FRAC, setting CLS. */
339 static FloatParts
canonicalize(FloatParts part
, const FloatFmt
*parm
,
340 float_status
*status
)
342 if (part
.exp
== parm
->exp_max
&& !parm
->arm_althp
) {
343 if (part
.frac
== 0) {
344 part
.cls
= float_class_inf
;
346 part
.frac
<<= parm
->frac_shift
;
347 part
.cls
= (parts_is_snan_frac(part
.frac
, status
)
348 ? float_class_snan
: float_class_qnan
);
350 } else if (part
.exp
== 0) {
351 if (likely(part
.frac
== 0)) {
352 part
.cls
= float_class_zero
;
353 } else if (status
->flush_inputs_to_zero
) {
354 float_raise(float_flag_input_denormal
, status
);
355 part
.cls
= float_class_zero
;
358 int shift
= clz64(part
.frac
) - 1;
359 part
.cls
= float_class_normal
;
360 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
364 part
.cls
= float_class_normal
;
365 part
.exp
-= parm
->exp_bias
;
366 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
371 /* Round and uncanonicalize a floating-point number by parts. There
372 * are FRAC_SHIFT bits that may require rounding at the bottom of the
373 * fraction; these bits will be removed. The exponent will be biased
374 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
377 static FloatParts
round_canonical(FloatParts p
, float_status
*s
,
378 const FloatFmt
*parm
)
380 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
381 const uint64_t round_mask
= parm
->round_mask
;
382 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
383 const int exp_max
= parm
->exp_max
;
384 const int frac_shift
= parm
->frac_shift
;
393 case float_class_normal
:
394 switch (s
->float_rounding_mode
) {
395 case float_round_nearest_even
:
396 overflow_norm
= false;
397 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
399 case float_round_ties_away
:
400 overflow_norm
= false;
403 case float_round_to_zero
:
404 overflow_norm
= true;
408 inc
= p
.sign
? 0 : round_mask
;
409 overflow_norm
= p
.sign
;
411 case float_round_down
:
412 inc
= p
.sign
? round_mask
: 0;
413 overflow_norm
= !p
.sign
;
416 g_assert_not_reached();
419 exp
+= parm
->exp_bias
;
420 if (likely(exp
> 0)) {
421 if (frac
& round_mask
) {
422 flags
|= float_flag_inexact
;
424 if (frac
& DECOMPOSED_OVERFLOW_BIT
) {
431 if (parm
->arm_althp
) {
432 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
433 if (unlikely(exp
> exp_max
)) {
434 /* Overflow. Return the maximum normal. */
435 flags
= float_flag_invalid
;
439 } else if (unlikely(exp
>= exp_max
)) {
440 flags
|= float_flag_overflow
| float_flag_inexact
;
445 p
.cls
= float_class_inf
;
449 } else if (s
->flush_to_zero
) {
450 flags
|= float_flag_output_denormal
;
451 p
.cls
= float_class_zero
;
454 bool is_tiny
= (s
->float_detect_tininess
455 == float_tininess_before_rounding
)
457 || !((frac
+ inc
) & DECOMPOSED_OVERFLOW_BIT
);
459 shift64RightJamming(frac
, 1 - exp
, &frac
);
460 if (frac
& round_mask
) {
461 /* Need to recompute round-to-even. */
462 if (s
->float_rounding_mode
== float_round_nearest_even
) {
463 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
466 flags
|= float_flag_inexact
;
470 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
473 if (is_tiny
&& (flags
& float_flag_inexact
)) {
474 flags
|= float_flag_underflow
;
476 if (exp
== 0 && frac
== 0) {
477 p
.cls
= float_class_zero
;
482 case float_class_zero
:
488 case float_class_inf
:
490 assert(!parm
->arm_althp
);
495 case float_class_qnan
:
496 case float_class_snan
:
497 assert(!parm
->arm_althp
);
499 frac
>>= parm
->frac_shift
;
503 g_assert_not_reached();
506 float_raise(flags
, s
);
512 /* Explicit FloatFmt version */
513 static FloatParts
float16a_unpack_canonical(float16 f
, float_status
*s
,
514 const FloatFmt
*params
)
516 return canonicalize(float16_unpack_raw(f
), params
, s
);
519 static FloatParts
float16_unpack_canonical(float16 f
, float_status
*s
)
521 return float16a_unpack_canonical(f
, s
, &float16_params
);
524 static float16
float16a_round_pack_canonical(FloatParts p
, float_status
*s
,
525 const FloatFmt
*params
)
527 return float16_pack_raw(round_canonical(p
, s
, params
));
530 static float16
float16_round_pack_canonical(FloatParts p
, float_status
*s
)
532 return float16a_round_pack_canonical(p
, s
, &float16_params
);
535 static FloatParts
float32_unpack_canonical(float32 f
, float_status
*s
)
537 return canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
540 static float32
float32_round_pack_canonical(FloatParts p
, float_status
*s
)
542 return float32_pack_raw(round_canonical(p
, s
, &float32_params
));
545 static FloatParts
float64_unpack_canonical(float64 f
, float_status
*s
)
547 return canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
550 static float64
float64_round_pack_canonical(FloatParts p
, float_status
*s
)
552 return float64_pack_raw(round_canonical(p
, s
, &float64_params
));
555 static FloatParts
return_nan(FloatParts a
, float_status
*s
)
558 case float_class_snan
:
559 s
->float_exception_flags
|= float_flag_invalid
;
560 a
= parts_silence_nan(a
, s
);
562 case float_class_qnan
:
563 if (s
->default_nan_mode
) {
564 return parts_default_nan(s
);
569 g_assert_not_reached();
574 static FloatParts
pick_nan(FloatParts a
, FloatParts b
, float_status
*s
)
576 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
577 s
->float_exception_flags
|= float_flag_invalid
;
580 if (s
->default_nan_mode
) {
581 return parts_default_nan(s
);
583 if (pickNaN(a
.cls
, b
.cls
,
585 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
))) {
588 if (is_snan(a
.cls
)) {
589 return parts_silence_nan(a
, s
);
595 static FloatParts
pick_nan_muladd(FloatParts a
, FloatParts b
, FloatParts c
,
596 bool inf_zero
, float_status
*s
)
600 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
601 s
->float_exception_flags
|= float_flag_invalid
;
604 which
= pickNaNMulAdd(a
.cls
, b
.cls
, c
.cls
, inf_zero
, s
);
606 if (s
->default_nan_mode
) {
607 /* Note that this check is after pickNaNMulAdd so that function
608 * has an opportunity to set the Invalid flag.
623 return parts_default_nan(s
);
625 g_assert_not_reached();
628 if (is_snan(a
.cls
)) {
629 return parts_silence_nan(a
, s
);
635 * Returns the result of adding or subtracting the values of the
636 * floating-point values `a' and `b'. The operation is performed
637 * according to the IEC/IEEE Standard for Binary Floating-Point
641 static FloatParts
addsub_floats(FloatParts a
, FloatParts b
, bool subtract
,
644 bool a_sign
= a
.sign
;
645 bool b_sign
= b
.sign
^ subtract
;
647 if (a_sign
!= b_sign
) {
650 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
651 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
652 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
653 a
.frac
= a
.frac
- b
.frac
;
655 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
656 a
.frac
= b
.frac
- a
.frac
;
662 a
.cls
= float_class_zero
;
663 a
.sign
= s
->float_rounding_mode
== float_round_down
;
665 int shift
= clz64(a
.frac
) - 1;
666 a
.frac
= a
.frac
<< shift
;
667 a
.exp
= a
.exp
- shift
;
672 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
673 return pick_nan(a
, b
, s
);
675 if (a
.cls
== float_class_inf
) {
676 if (b
.cls
== float_class_inf
) {
677 float_raise(float_flag_invalid
, s
);
678 return parts_default_nan(s
);
682 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
683 a
.sign
= s
->float_rounding_mode
== float_round_down
;
686 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
690 if (b
.cls
== float_class_zero
) {
695 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
697 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
698 } else if (a
.exp
< b
.exp
) {
699 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
703 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
704 shift64RightJamming(a
.frac
, 1, &a
.frac
);
709 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
710 return pick_nan(a
, b
, s
);
712 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
715 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
720 g_assert_not_reached();
724 * Returns the result of adding or subtracting the floating-point
725 * values `a' and `b'. The operation is performed according to the
726 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
729 float16
__attribute__((flatten
)) float16_add(float16 a
, float16 b
,
730 float_status
*status
)
732 FloatParts pa
= float16_unpack_canonical(a
, status
);
733 FloatParts pb
= float16_unpack_canonical(b
, status
);
734 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
736 return float16_round_pack_canonical(pr
, status
);
739 float32
__attribute__((flatten
)) float32_add(float32 a
, float32 b
,
740 float_status
*status
)
742 FloatParts pa
= float32_unpack_canonical(a
, status
);
743 FloatParts pb
= float32_unpack_canonical(b
, status
);
744 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
746 return float32_round_pack_canonical(pr
, status
);
749 float64
__attribute__((flatten
)) float64_add(float64 a
, float64 b
,
750 float_status
*status
)
752 FloatParts pa
= float64_unpack_canonical(a
, status
);
753 FloatParts pb
= float64_unpack_canonical(b
, status
);
754 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
756 return float64_round_pack_canonical(pr
, status
);
759 float16
__attribute__((flatten
)) float16_sub(float16 a
, float16 b
,
760 float_status
*status
)
762 FloatParts pa
= float16_unpack_canonical(a
, status
);
763 FloatParts pb
= float16_unpack_canonical(b
, status
);
764 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
766 return float16_round_pack_canonical(pr
, status
);
769 float32
__attribute__((flatten
)) float32_sub(float32 a
, float32 b
,
770 float_status
*status
)
772 FloatParts pa
= float32_unpack_canonical(a
, status
);
773 FloatParts pb
= float32_unpack_canonical(b
, status
);
774 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
776 return float32_round_pack_canonical(pr
, status
);
779 float64
__attribute__((flatten
)) float64_sub(float64 a
, float64 b
,
780 float_status
*status
)
782 FloatParts pa
= float64_unpack_canonical(a
, status
);
783 FloatParts pb
= float64_unpack_canonical(b
, status
);
784 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
786 return float64_round_pack_canonical(pr
, status
);
790 * Returns the result of multiplying the floating-point values `a' and
791 * `b'. The operation is performed according to the IEC/IEEE Standard
792 * for Binary Floating-Point Arithmetic.
795 static FloatParts
mul_floats(FloatParts a
, FloatParts b
, float_status
*s
)
797 bool sign
= a
.sign
^ b
.sign
;
799 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
801 int exp
= a
.exp
+ b
.exp
;
803 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
804 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
805 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
806 shift64RightJamming(lo
, 1, &lo
);
816 /* handle all the NaN cases */
817 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
818 return pick_nan(a
, b
, s
);
820 /* Inf * Zero == NaN */
821 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
822 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
823 s
->float_exception_flags
|= float_flag_invalid
;
824 return parts_default_nan(s
);
826 /* Multiply by 0 or Inf */
827 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
831 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
835 g_assert_not_reached();
838 float16
__attribute__((flatten
)) float16_mul(float16 a
, float16 b
,
839 float_status
*status
)
841 FloatParts pa
= float16_unpack_canonical(a
, status
);
842 FloatParts pb
= float16_unpack_canonical(b
, status
);
843 FloatParts pr
= mul_floats(pa
, pb
, status
);
845 return float16_round_pack_canonical(pr
, status
);
848 float32
__attribute__((flatten
)) float32_mul(float32 a
, float32 b
,
849 float_status
*status
)
851 FloatParts pa
= float32_unpack_canonical(a
, status
);
852 FloatParts pb
= float32_unpack_canonical(b
, status
);
853 FloatParts pr
= mul_floats(pa
, pb
, status
);
855 return float32_round_pack_canonical(pr
, status
);
858 float64
__attribute__((flatten
)) float64_mul(float64 a
, float64 b
,
859 float_status
*status
)
861 FloatParts pa
= float64_unpack_canonical(a
, status
);
862 FloatParts pb
= float64_unpack_canonical(b
, status
);
863 FloatParts pr
= mul_floats(pa
, pb
, status
);
865 return float64_round_pack_canonical(pr
, status
);
869 * Returns the result of multiplying the floating-point values `a' and
870 * `b' then adding 'c', with no intermediate rounding step after the
871 * multiplication. The operation is performed according to the
872 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
873 * The flags argument allows the caller to select negation of the
874 * addend, the intermediate product, or the final result. (The
875 * difference between this and having the caller do a separate
876 * negation is that negating externally will flip the sign bit on
880 static FloatParts
muladd_floats(FloatParts a
, FloatParts b
, FloatParts c
,
881 int flags
, float_status
*s
)
883 bool inf_zero
= ((1 << a
.cls
) | (1 << b
.cls
)) ==
884 ((1 << float_class_inf
) | (1 << float_class_zero
));
886 bool sign_flip
= flags
& float_muladd_negate_result
;
891 /* It is implementation-defined whether the cases of (0,inf,qnan)
892 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
893 * they return if they do), so we have to hand this information
894 * off to the target-specific pick-a-NaN routine.
896 if (is_nan(a
.cls
) || is_nan(b
.cls
) || is_nan(c
.cls
)) {
897 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
901 s
->float_exception_flags
|= float_flag_invalid
;
902 return parts_default_nan(s
);
905 if (flags
& float_muladd_negate_c
) {
909 p_sign
= a
.sign
^ b
.sign
;
911 if (flags
& float_muladd_negate_product
) {
915 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_inf
) {
916 p_class
= float_class_inf
;
917 } else if (a
.cls
== float_class_zero
|| b
.cls
== float_class_zero
) {
918 p_class
= float_class_zero
;
920 p_class
= float_class_normal
;
923 if (c
.cls
== float_class_inf
) {
924 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
925 s
->float_exception_flags
|= float_flag_invalid
;
926 return parts_default_nan(s
);
928 a
.cls
= float_class_inf
;
929 a
.sign
= c
.sign
^ sign_flip
;
934 if (p_class
== float_class_inf
) {
935 a
.cls
= float_class_inf
;
936 a
.sign
= p_sign
^ sign_flip
;
940 if (p_class
== float_class_zero
) {
941 if (c
.cls
== float_class_zero
) {
942 if (p_sign
!= c
.sign
) {
943 p_sign
= s
->float_rounding_mode
== float_round_down
;
946 } else if (flags
& float_muladd_halve_result
) {
953 /* a & b should be normals now... */
954 assert(a
.cls
== float_class_normal
&&
955 b
.cls
== float_class_normal
);
957 p_exp
= a
.exp
+ b
.exp
;
959 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
962 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
963 /* binary point now at bit 124 */
965 /* check for overflow */
966 if (hi
& (1ULL << (DECOMPOSED_BINARY_POINT
* 2 + 1 - 64))) {
967 shift128RightJamming(hi
, lo
, 1, &hi
, &lo
);
972 if (c
.cls
== float_class_zero
) {
973 /* move binary point back to 62 */
974 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
976 int exp_diff
= p_exp
- c
.exp
;
977 if (p_sign
== c
.sign
) {
980 shift128RightJamming(hi
, lo
,
981 DECOMPOSED_BINARY_POINT
- exp_diff
,
987 /* shift c to the same binary point as the product (124) */
990 shift128RightJamming(c_hi
, c_lo
,
993 add128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
994 /* move binary point back to 62 */
995 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
998 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
999 shift64RightJamming(lo
, 1, &lo
);
1005 uint64_t c_hi
, c_lo
;
1006 /* make C binary point match product at bit 124 */
1010 if (exp_diff
<= 0) {
1011 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1014 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1015 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1017 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1022 shift128RightJamming(c_hi
, c_lo
,
1025 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1028 if (hi
== 0 && lo
== 0) {
1029 a
.cls
= float_class_zero
;
1030 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1031 a
.sign
^= sign_flip
;
1038 shift
= clz64(lo
) + 64;
1040 /* Normalizing to a binary point of 124 is the
1041 correct adjust for the exponent. However since we're
1042 shifting, we might as well put the binary point back
1043 at 62 where we really want it. Therefore shift as
1044 if we're leaving 1 bit at the top of the word, but
1045 adjust the exponent as if we're leaving 3 bits. */
1048 lo
= lo
<< (shift
- 64);
1050 hi
= (hi
<< shift
) | (lo
>> (64 - shift
));
1051 lo
= hi
| ((lo
<< shift
) != 0);
1058 if (flags
& float_muladd_halve_result
) {
1062 /* finally prepare our result */
1063 a
.cls
= float_class_normal
;
1064 a
.sign
= p_sign
^ sign_flip
;
1071 float16
__attribute__((flatten
)) float16_muladd(float16 a
, float16 b
, float16 c
,
1072 int flags
, float_status
*status
)
1074 FloatParts pa
= float16_unpack_canonical(a
, status
);
1075 FloatParts pb
= float16_unpack_canonical(b
, status
);
1076 FloatParts pc
= float16_unpack_canonical(c
, status
);
1077 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1079 return float16_round_pack_canonical(pr
, status
);
1082 float32
__attribute__((flatten
)) float32_muladd(float32 a
, float32 b
, float32 c
,
1083 int flags
, float_status
*status
)
1085 FloatParts pa
= float32_unpack_canonical(a
, status
);
1086 FloatParts pb
= float32_unpack_canonical(b
, status
);
1087 FloatParts pc
= float32_unpack_canonical(c
, status
);
1088 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1090 return float32_round_pack_canonical(pr
, status
);
1093 float64
__attribute__((flatten
)) float64_muladd(float64 a
, float64 b
, float64 c
,
1094 int flags
, float_status
*status
)
1096 FloatParts pa
= float64_unpack_canonical(a
, status
);
1097 FloatParts pb
= float64_unpack_canonical(b
, status
);
1098 FloatParts pc
= float64_unpack_canonical(c
, status
);
1099 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1101 return float64_round_pack_canonical(pr
, status
);
1105 * Returns the result of dividing the floating-point value `a' by the
1106 * corresponding value `b'. The operation is performed according to
1107 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 static FloatParts
div_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1112 bool sign
= a
.sign
^ b
.sign
;
1114 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1115 uint64_t temp_lo
, temp_hi
;
1116 int exp
= a
.exp
- b
.exp
;
1117 if (a
.frac
< b
.frac
) {
1119 shortShift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1,
1120 &temp_hi
, &temp_lo
);
1122 shortShift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
,
1123 &temp_hi
, &temp_lo
);
1125 /* LSB of quot is set if inexact which roundandpack will use
1126 * to set flags. Yet again we re-use a for the result */
1127 a
.frac
= div128To64(temp_lo
, temp_hi
, b
.frac
);
1132 /* handle all the NaN cases */
1133 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1134 return pick_nan(a
, b
, s
);
1136 /* 0/0 or Inf/Inf */
1139 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1140 s
->float_exception_flags
|= float_flag_invalid
;
1141 return parts_default_nan(s
);
1143 /* Inf / x or 0 / x */
1144 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1149 if (b
.cls
== float_class_zero
) {
1150 s
->float_exception_flags
|= float_flag_divbyzero
;
1151 a
.cls
= float_class_inf
;
1156 if (b
.cls
== float_class_inf
) {
1157 a
.cls
= float_class_zero
;
1161 g_assert_not_reached();
1164 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1166 FloatParts pa
= float16_unpack_canonical(a
, status
);
1167 FloatParts pb
= float16_unpack_canonical(b
, status
);
1168 FloatParts pr
= div_floats(pa
, pb
, status
);
1170 return float16_round_pack_canonical(pr
, status
);
1173 float32
float32_div(float32 a
, float32 b
, float_status
*status
)
1175 FloatParts pa
= float32_unpack_canonical(a
, status
);
1176 FloatParts pb
= float32_unpack_canonical(b
, status
);
1177 FloatParts pr
= div_floats(pa
, pb
, status
);
1179 return float32_round_pack_canonical(pr
, status
);
1182 float64
float64_div(float64 a
, float64 b
, float_status
*status
)
1184 FloatParts pa
= float64_unpack_canonical(a
, status
);
1185 FloatParts pb
= float64_unpack_canonical(b
, status
);
1186 FloatParts pr
= div_floats(pa
, pb
, status
);
1188 return float64_round_pack_canonical(pr
, status
);
1192 * Float to Float conversions
1194 * Returns the result of converting one float format to another. The
1195 * conversion is performed according to the IEC/IEEE Standard for
1196 * Binary Floating-Point Arithmetic.
1198 * The float_to_float helper only needs to take care of raising
1199 * invalid exceptions and handling the conversion on NaNs.
1202 static FloatParts
float_to_float(FloatParts a
, const FloatFmt
*dstf
,
1205 if (dstf
->arm_althp
) {
1207 case float_class_qnan
:
1208 case float_class_snan
:
1209 /* There is no NaN in the destination format. Raise Invalid
1210 * and return a zero with the sign of the input NaN.
1212 s
->float_exception_flags
|= float_flag_invalid
;
1213 a
.cls
= float_class_zero
;
1218 case float_class_inf
:
1219 /* There is no Inf in the destination format. Raise Invalid
1220 * and return the maximum normal with the correct sign.
1222 s
->float_exception_flags
|= float_flag_invalid
;
1223 a
.cls
= float_class_normal
;
1224 a
.exp
= dstf
->exp_max
;
1225 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1231 } else if (is_nan(a
.cls
)) {
1232 if (is_snan(a
.cls
)) {
1233 s
->float_exception_flags
|= float_flag_invalid
;
1234 a
= parts_silence_nan(a
, s
);
1236 if (s
->default_nan_mode
) {
1237 return parts_default_nan(s
);
1243 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1245 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1246 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1247 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1248 return float32_round_pack_canonical(pr
, s
);
1251 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
1253 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1254 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1255 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1256 return float64_round_pack_canonical(pr
, s
);
1259 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
1261 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1262 FloatParts p
= float32_unpack_canonical(a
, s
);
1263 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1264 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1267 float64
float32_to_float64(float32 a
, float_status
*s
)
1269 FloatParts p
= float32_unpack_canonical(a
, s
);
1270 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1271 return float64_round_pack_canonical(pr
, s
);
1274 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
1276 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1277 FloatParts p
= float64_unpack_canonical(a
, s
);
1278 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1279 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1282 float32
float64_to_float32(float64 a
, float_status
*s
)
1284 FloatParts p
= float64_unpack_canonical(a
, s
);
1285 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1286 return float32_round_pack_canonical(pr
, s
);
1290 * Rounds the floating-point value `a' to an integer, and returns the
1291 * result as a floating-point value. The operation is performed
1292 * according to the IEC/IEEE Standard for Binary Floating-Point
1296 static FloatParts
round_to_int(FloatParts a
, int rmode
,
1297 int scale
, float_status
*s
)
1300 case float_class_qnan
:
1301 case float_class_snan
:
1302 return return_nan(a
, s
);
1304 case float_class_zero
:
1305 case float_class_inf
:
1306 /* already "integral" */
1309 case float_class_normal
:
1310 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
1313 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
1314 /* already integral */
1319 /* all fractional */
1320 s
->float_exception_flags
|= float_flag_inexact
;
1322 case float_round_nearest_even
:
1323 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
1325 case float_round_ties_away
:
1326 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
1328 case float_round_to_zero
:
1331 case float_round_up
:
1334 case float_round_down
:
1338 g_assert_not_reached();
1342 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
1345 a
.cls
= float_class_zero
;
1348 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
1349 uint64_t frac_lsbm1
= frac_lsb
>> 1;
1350 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
1351 uint64_t rnd_mask
= rnd_even_mask
>> 1;
1355 case float_round_nearest_even
:
1356 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
1358 case float_round_ties_away
:
1361 case float_round_to_zero
:
1364 case float_round_up
:
1365 inc
= a
.sign
? 0 : rnd_mask
;
1367 case float_round_down
:
1368 inc
= a
.sign
? rnd_mask
: 0;
1371 g_assert_not_reached();
1374 if (a
.frac
& rnd_mask
) {
1375 s
->float_exception_flags
|= float_flag_inexact
;
1377 a
.frac
&= ~rnd_mask
;
1378 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
1386 g_assert_not_reached();
1391 float16
float16_round_to_int(float16 a
, float_status
*s
)
1393 FloatParts pa
= float16_unpack_canonical(a
, s
);
1394 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
1395 return float16_round_pack_canonical(pr
, s
);
1398 float32
float32_round_to_int(float32 a
, float_status
*s
)
1400 FloatParts pa
= float32_unpack_canonical(a
, s
);
1401 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
1402 return float32_round_pack_canonical(pr
, s
);
1405 float64
float64_round_to_int(float64 a
, float_status
*s
)
1407 FloatParts pa
= float64_unpack_canonical(a
, s
);
1408 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
1409 return float64_round_pack_canonical(pr
, s
);
1412 float64
float64_trunc_to_int(float64 a
, float_status
*s
)
1414 FloatParts pa
= float64_unpack_canonical(a
, s
);
1415 FloatParts pr
= round_to_int(pa
, float_round_to_zero
, 0, s
);
1416 return float64_round_pack_canonical(pr
, s
);
1420 * Returns the result of converting the floating-point value `a' to
1421 * the two's complement integer format. The conversion is performed
1422 * according to the IEC/IEEE Standard for Binary Floating-Point
1423 * Arithmetic---which means in particular that the conversion is
1424 * rounded according to the current rounding mode. If `a' is a NaN,
1425 * the largest positive integer is returned. Otherwise, if the
1426 * conversion overflows, the largest integer with the same sign as `a'
1430 static int64_t round_to_int_and_pack(FloatParts in
, int rmode
, int scale
,
1431 int64_t min
, int64_t max
,
1435 int orig_flags
= get_float_exception_flags(s
);
1436 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
1439 case float_class_snan
:
1440 case float_class_qnan
:
1441 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1443 case float_class_inf
:
1444 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1445 return p
.sign
? min
: max
;
1446 case float_class_zero
:
1448 case float_class_normal
:
1449 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
1450 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
1451 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
1452 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
1457 if (r
<= -(uint64_t) min
) {
1460 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1467 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1472 g_assert_not_reached();
1476 int16_t float16_to_int16_scalbn(float16 a
, int rmode
, int scale
,
1479 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
1480 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
1483 int32_t float16_to_int32_scalbn(float16 a
, int rmode
, int scale
,
1486 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
1487 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
1490 int64_t float16_to_int64_scalbn(float16 a
, int rmode
, int scale
,
1493 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
1494 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
1497 int16_t float32_to_int16_scalbn(float32 a
, int rmode
, int scale
,
1500 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
1501 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
1504 int32_t float32_to_int32_scalbn(float32 a
, int rmode
, int scale
,
1507 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
1508 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
1511 int64_t float32_to_int64_scalbn(float32 a
, int rmode
, int scale
,
1514 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
1515 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
1518 int16_t float64_to_int16_scalbn(float64 a
, int rmode
, int scale
,
1521 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
1522 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
1525 int32_t float64_to_int32_scalbn(float64 a
, int rmode
, int scale
,
1528 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
1529 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
1532 int64_t float64_to_int64_scalbn(float64 a
, int rmode
, int scale
,
1535 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
1536 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
1539 int16_t float16_to_int16(float16 a
, float_status
*s
)
1541 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1544 int32_t float16_to_int32(float16 a
, float_status
*s
)
1546 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1549 int64_t float16_to_int64(float16 a
, float_status
*s
)
1551 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1554 int16_t float32_to_int16(float32 a
, float_status
*s
)
1556 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1559 int32_t float32_to_int32(float32 a
, float_status
*s
)
1561 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1564 int64_t float32_to_int64(float32 a
, float_status
*s
)
1566 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1569 int16_t float64_to_int16(float64 a
, float_status
*s
)
1571 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1574 int32_t float64_to_int32(float64 a
, float_status
*s
)
1576 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1579 int64_t float64_to_int64(float64 a
, float_status
*s
)
1581 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1584 int16_t float16_to_int16_round_to_zero(float16 a
, float_status
*s
)
1586 return float16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
1589 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
1591 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
1594 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
1596 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
1599 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
1601 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
1604 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
1606 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
1609 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
1611 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
1614 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
1616 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
1619 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
1621 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
1624 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
1626 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
1630 * Returns the result of converting the floating-point value `a' to
1631 * the unsigned integer format. The conversion is performed according
1632 * to the IEC/IEEE Standard for Binary Floating-Point
1633 * Arithmetic---which means in particular that the conversion is
1634 * rounded according to the current rounding mode. If `a' is a NaN,
1635 * the largest unsigned integer is returned. Otherwise, if the
1636 * conversion overflows, the largest unsigned integer is returned. If
1637 * the 'a' is negative, the result is rounded and zero is returned;
1638 * values that do not round to zero will raise the inexact exception
1642 static uint64_t round_to_uint_and_pack(FloatParts in
, int rmode
, int scale
,
1643 uint64_t max
, float_status
*s
)
1645 int orig_flags
= get_float_exception_flags(s
);
1646 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
1650 case float_class_snan
:
1651 case float_class_qnan
:
1652 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1654 case float_class_inf
:
1655 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1656 return p
.sign
? 0 : max
;
1657 case float_class_zero
:
1659 case float_class_normal
:
1661 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1665 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
1666 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
1667 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
1668 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
1670 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1674 /* For uint64 this will never trip, but if p.exp is too large
1675 * to shift a decomposed fraction we shall have exited via the
1679 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1684 g_assert_not_reached();
1688 uint16_t float16_to_uint16_scalbn(float16 a
, int rmode
, int scale
,
1691 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
1692 rmode
, scale
, UINT16_MAX
, s
);
1695 uint32_t float16_to_uint32_scalbn(float16 a
, int rmode
, int scale
,
1698 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
1699 rmode
, scale
, UINT32_MAX
, s
);
1702 uint64_t float16_to_uint64_scalbn(float16 a
, int rmode
, int scale
,
1705 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
1706 rmode
, scale
, UINT64_MAX
, s
);
1709 uint16_t float32_to_uint16_scalbn(float32 a
, int rmode
, int scale
,
1712 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
1713 rmode
, scale
, UINT16_MAX
, s
);
1716 uint32_t float32_to_uint32_scalbn(float32 a
, int rmode
, int scale
,
1719 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
1720 rmode
, scale
, UINT32_MAX
, s
);
1723 uint64_t float32_to_uint64_scalbn(float32 a
, int rmode
, int scale
,
1726 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
1727 rmode
, scale
, UINT64_MAX
, s
);
1730 uint16_t float64_to_uint16_scalbn(float64 a
, int rmode
, int scale
,
1733 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
1734 rmode
, scale
, UINT16_MAX
, s
);
1737 uint32_t float64_to_uint32_scalbn(float64 a
, int rmode
, int scale
,
1740 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
1741 rmode
, scale
, UINT32_MAX
, s
);
1744 uint64_t float64_to_uint64_scalbn(float64 a
, int rmode
, int scale
,
1747 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
1748 rmode
, scale
, UINT64_MAX
, s
);
1751 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
1753 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1756 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
1758 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1761 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
1763 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1766 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
1768 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1771 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
1773 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1776 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
1778 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1781 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
1783 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1786 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
1788 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1791 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
1793 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
1796 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
1798 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
1801 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
1803 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
1806 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
1808 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
1811 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
1813 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
1816 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
1818 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
1821 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
1823 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
1826 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
1828 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
1831 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
1833 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
1836 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
1838 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
1842 * Integer to float conversions
1844 * Returns the result of converting the two's complement integer `a'
1845 * to the floating-point format. The conversion is performed according
1846 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1849 static FloatParts
int_to_float(int64_t a
, int scale
, float_status
*status
)
1851 FloatParts r
= { .sign
= false };
1854 r
.cls
= float_class_zero
;
1859 r
.cls
= float_class_normal
;
1864 shift
= clz64(f
) - 1;
1865 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
1867 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
1868 r
.frac
= (shift
< 0 ? DECOMPOSED_IMPLICIT_BIT
: f
<< shift
);
1874 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
1876 FloatParts pa
= int_to_float(a
, scale
, status
);
1877 return float16_round_pack_canonical(pa
, status
);
1880 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
1882 return int64_to_float16_scalbn(a
, scale
, status
);
1885 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
1887 return int64_to_float16_scalbn(a
, scale
, status
);
1890 float16
int64_to_float16(int64_t a
, float_status
*status
)
1892 return int64_to_float16_scalbn(a
, 0, status
);
1895 float16
int32_to_float16(int32_t a
, float_status
*status
)
1897 return int64_to_float16_scalbn(a
, 0, status
);
1900 float16
int16_to_float16(int16_t a
, float_status
*status
)
1902 return int64_to_float16_scalbn(a
, 0, status
);
1905 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
1907 FloatParts pa
= int_to_float(a
, scale
, status
);
1908 return float32_round_pack_canonical(pa
, status
);
1911 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
1913 return int64_to_float32_scalbn(a
, scale
, status
);
1916 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
1918 return int64_to_float32_scalbn(a
, scale
, status
);
1921 float32
int64_to_float32(int64_t a
, float_status
*status
)
1923 return int64_to_float32_scalbn(a
, 0, status
);
1926 float32
int32_to_float32(int32_t a
, float_status
*status
)
1928 return int64_to_float32_scalbn(a
, 0, status
);
1931 float32
int16_to_float32(int16_t a
, float_status
*status
)
1933 return int64_to_float32_scalbn(a
, 0, status
);
1936 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
1938 FloatParts pa
= int_to_float(a
, scale
, status
);
1939 return float64_round_pack_canonical(pa
, status
);
1942 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
1944 return int64_to_float64_scalbn(a
, scale
, status
);
1947 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
1949 return int64_to_float64_scalbn(a
, scale
, status
);
1952 float64
int64_to_float64(int64_t a
, float_status
*status
)
1954 return int64_to_float64_scalbn(a
, 0, status
);
1957 float64
int32_to_float64(int32_t a
, float_status
*status
)
1959 return int64_to_float64_scalbn(a
, 0, status
);
1962 float64
int16_to_float64(int16_t a
, float_status
*status
)
1964 return int64_to_float64_scalbn(a
, 0, status
);
1969 * Unsigned Integer to float conversions
1971 * Returns the result of converting the unsigned integer `a' to the
1972 * floating-point format. The conversion is performed according to the
1973 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1976 static FloatParts
uint_to_float(uint64_t a
, int scale
, float_status
*status
)
1978 FloatParts r
= { .sign
= false };
1981 r
.cls
= float_class_zero
;
1983 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
1984 r
.cls
= float_class_normal
;
1985 if ((int64_t)a
< 0) {
1986 r
.exp
= DECOMPOSED_BINARY_POINT
+ 1 + scale
;
1987 shift64RightJamming(a
, 1, &a
);
1990 int shift
= clz64(a
) - 1;
1991 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
1992 r
.frac
= a
<< shift
;
1999 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
2001 FloatParts pa
= uint_to_float(a
, scale
, status
);
2002 return float16_round_pack_canonical(pa
, status
);
2005 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
2007 return uint64_to_float16_scalbn(a
, scale
, status
);
2010 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
2012 return uint64_to_float16_scalbn(a
, scale
, status
);
2015 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
2017 return uint64_to_float16_scalbn(a
, 0, status
);
2020 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
2022 return uint64_to_float16_scalbn(a
, 0, status
);
2025 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
2027 return uint64_to_float16_scalbn(a
, 0, status
);
2030 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
2032 FloatParts pa
= uint_to_float(a
, scale
, status
);
2033 return float32_round_pack_canonical(pa
, status
);
2036 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
2038 return uint64_to_float32_scalbn(a
, scale
, status
);
2041 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
2043 return uint64_to_float32_scalbn(a
, scale
, status
);
2046 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
2048 return uint64_to_float32_scalbn(a
, 0, status
);
2051 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
2053 return uint64_to_float32_scalbn(a
, 0, status
);
2056 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
2058 return uint64_to_float32_scalbn(a
, 0, status
);
2061 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
2063 FloatParts pa
= uint_to_float(a
, scale
, status
);
2064 return float64_round_pack_canonical(pa
, status
);
2067 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
2069 return uint64_to_float64_scalbn(a
, scale
, status
);
2072 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
2074 return uint64_to_float64_scalbn(a
, scale
, status
);
2077 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
2079 return uint64_to_float64_scalbn(a
, 0, status
);
2082 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
2084 return uint64_to_float64_scalbn(a
, 0, status
);
2087 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
2089 return uint64_to_float64_scalbn(a
, 0, status
);
2093 /* min() and max() functions. These can't be implemented as
2094 * 'compare and pick one input' because that would mishandle
2095 * NaNs and +0 vs -0.
2097 * minnum() and maxnum() functions. These are similar to the min()
2098 * and max() functions but if one of the arguments is a QNaN and
2099 * the other is numerical then the numerical argument is returned.
2100 * SNaNs will get quietened before being returned.
2101 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2102 * and maxNum() operations. min() and max() are the typical min/max
2103 * semantics provided by many CPUs which predate that specification.
2105 * minnummag() and maxnummag() functions correspond to minNumMag()
2106 * and minNumMag() from the IEEE-754 2008.
2108 static FloatParts
minmax_floats(FloatParts a
, FloatParts b
, bool ismin
,
2109 bool ieee
, bool ismag
, float_status
*s
)
2111 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
2113 /* Takes two floating-point values `a' and `b', one of
2114 * which is a NaN, and returns the appropriate NaN
2115 * result. If either `a' or `b' is a signaling NaN,
2116 * the invalid exception is raised.
2118 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
2119 return pick_nan(a
, b
, s
);
2120 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
2122 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
2126 return pick_nan(a
, b
, s
);
2131 case float_class_normal
:
2134 case float_class_inf
:
2137 case float_class_zero
:
2141 g_assert_not_reached();
2145 case float_class_normal
:
2148 case float_class_inf
:
2151 case float_class_zero
:
2155 g_assert_not_reached();
2159 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
2160 bool a_less
= a_exp
< b_exp
;
2161 if (a_exp
== b_exp
) {
2162 a_less
= a
.frac
< b
.frac
;
2164 return a_less
^ ismin
? b
: a
;
2167 if (a
.sign
== b
.sign
) {
2168 bool a_less
= a_exp
< b_exp
;
2169 if (a_exp
== b_exp
) {
2170 a_less
= a
.frac
< b
.frac
;
2172 return a
.sign
^ a_less
^ ismin
? b
: a
;
2174 return a
.sign
^ ismin
? b
: a
;
2179 #define MINMAX(sz, name, ismin, isiee, ismag) \
2180 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
2183 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2184 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2185 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
2187 return float ## sz ## _round_pack_canonical(pr, s); \
2190 MINMAX(16, min
, true, false, false)
2191 MINMAX(16, minnum
, true, true, false)
2192 MINMAX(16, minnummag
, true, true, true)
2193 MINMAX(16, max
, false, false, false)
2194 MINMAX(16, maxnum
, false, true, false)
2195 MINMAX(16, maxnummag
, false, true, true)
2197 MINMAX(32, min
, true, false, false)
2198 MINMAX(32, minnum
, true, true, false)
2199 MINMAX(32, minnummag
, true, true, true)
2200 MINMAX(32, max
, false, false, false)
2201 MINMAX(32, maxnum
, false, true, false)
2202 MINMAX(32, maxnummag
, false, true, true)
2204 MINMAX(64, min
, true, false, false)
2205 MINMAX(64, minnum
, true, true, false)
2206 MINMAX(64, minnummag
, true, true, true)
2207 MINMAX(64, max
, false, false, false)
2208 MINMAX(64, maxnum
, false, true, false)
2209 MINMAX(64, maxnummag
, false, true, true)
2213 /* Floating point compare */
2214 static int compare_floats(FloatParts a
, FloatParts b
, bool is_quiet
,
2217 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
2219 a
.cls
== float_class_snan
||
2220 b
.cls
== float_class_snan
) {
2221 s
->float_exception_flags
|= float_flag_invalid
;
2223 return float_relation_unordered
;
2226 if (a
.cls
== float_class_zero
) {
2227 if (b
.cls
== float_class_zero
) {
2228 return float_relation_equal
;
2230 return b
.sign
? float_relation_greater
: float_relation_less
;
2231 } else if (b
.cls
== float_class_zero
) {
2232 return a
.sign
? float_relation_less
: float_relation_greater
;
2235 /* The only really important thing about infinity is its sign. If
2236 * both are infinities the sign marks the smallest of the two.
2238 if (a
.cls
== float_class_inf
) {
2239 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
2240 return float_relation_equal
;
2242 return a
.sign
? float_relation_less
: float_relation_greater
;
2243 } else if (b
.cls
== float_class_inf
) {
2244 return b
.sign
? float_relation_greater
: float_relation_less
;
2247 if (a
.sign
!= b
.sign
) {
2248 return a
.sign
? float_relation_less
: float_relation_greater
;
2251 if (a
.exp
== b
.exp
) {
2252 if (a
.frac
== b
.frac
) {
2253 return float_relation_equal
;
2256 return a
.frac
> b
.frac
?
2257 float_relation_less
: float_relation_greater
;
2259 return a
.frac
> b
.frac
?
2260 float_relation_greater
: float_relation_less
;
2264 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
2266 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
2271 #define COMPARE(sz) \
2272 int float ## sz ## _compare(float ## sz a, float ## sz b, \
2275 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2276 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2277 return compare_floats(pa, pb, false, s); \
2279 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
2282 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2283 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2284 return compare_floats(pa, pb, true, s); \
2293 /* Multiply A by 2 raised to the power N. */
2294 static FloatParts
scalbn_decomposed(FloatParts a
, int n
, float_status
*s
)
2296 if (unlikely(is_nan(a
.cls
))) {
2297 return return_nan(a
, s
);
2299 if (a
.cls
== float_class_normal
) {
2300 /* The largest float type (even though not supported by FloatParts)
2301 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
2302 * still allows rounding to infinity, without allowing overflow
2303 * within the int32_t that backs FloatParts.exp.
2305 n
= MIN(MAX(n
, -0x10000), 0x10000);
2311 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
2313 FloatParts pa
= float16_unpack_canonical(a
, status
);
2314 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
2315 return float16_round_pack_canonical(pr
, status
);
2318 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
2320 FloatParts pa
= float32_unpack_canonical(a
, status
);
2321 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
2322 return float32_round_pack_canonical(pr
, status
);
2325 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
2327 FloatParts pa
= float64_unpack_canonical(a
, status
);
2328 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
2329 return float64_round_pack_canonical(pr
, status
);
2335 * The old softfloat code did an approximation step before zeroing in
2336 * on the final result. However for simpleness we just compute the
2337 * square root by iterating down from the implicit bit to enough extra
2338 * bits to ensure we get a correctly rounded result.
2340 * This does mean however the calculation is slower than before,
2341 * especially for 64 bit floats.
2344 static FloatParts
sqrt_float(FloatParts a
, float_status
*s
, const FloatFmt
*p
)
2346 uint64_t a_frac
, r_frac
, s_frac
;
2349 if (is_nan(a
.cls
)) {
2350 return return_nan(a
, s
);
2352 if (a
.cls
== float_class_zero
) {
2353 return a
; /* sqrt(+-0) = +-0 */
2356 s
->float_exception_flags
|= float_flag_invalid
;
2357 return parts_default_nan(s
);
2359 if (a
.cls
== float_class_inf
) {
2360 return a
; /* sqrt(+inf) = +inf */
2363 assert(a
.cls
== float_class_normal
);
2365 /* We need two overflow bits at the top. Adding room for that is a
2366 * right shift. If the exponent is odd, we can discard the low bit
2367 * by multiplying the fraction by 2; that's a left shift. Combine
2368 * those and we shift right if the exponent is even.
2376 /* Bit-by-bit computation of sqrt. */
2380 /* Iterate from implicit bit down to the 3 extra bits to compute a
2381 * properly rounded result. Remember we've inserted one more bit
2382 * at the top, so these positions are one less.
2384 bit
= DECOMPOSED_BINARY_POINT
- 1;
2385 last_bit
= MAX(p
->frac_shift
- 4, 0);
2387 uint64_t q
= 1ULL << bit
;
2388 uint64_t t_frac
= s_frac
+ q
;
2389 if (t_frac
<= a_frac
) {
2390 s_frac
= t_frac
+ q
;
2395 } while (--bit
>= last_bit
);
2397 /* Undo the right shift done above. If there is any remaining
2398 * fraction, the result is inexact. Set the sticky bit.
2400 a
.frac
= (r_frac
<< 1) + (a_frac
!= 0);
2405 float16
__attribute__((flatten
)) float16_sqrt(float16 a
, float_status
*status
)
2407 FloatParts pa
= float16_unpack_canonical(a
, status
);
2408 FloatParts pr
= sqrt_float(pa
, status
, &float16_params
);
2409 return float16_round_pack_canonical(pr
, status
);
2412 float32
__attribute__((flatten
)) float32_sqrt(float32 a
, float_status
*status
)
2414 FloatParts pa
= float32_unpack_canonical(a
, status
);
2415 FloatParts pr
= sqrt_float(pa
, status
, &float32_params
);
2416 return float32_round_pack_canonical(pr
, status
);
2419 float64
__attribute__((flatten
)) float64_sqrt(float64 a
, float_status
*status
)
2421 FloatParts pa
= float64_unpack_canonical(a
, status
);
2422 FloatParts pr
= sqrt_float(pa
, status
, &float64_params
);
2423 return float64_round_pack_canonical(pr
, status
);
2426 /*----------------------------------------------------------------------------
2427 | The pattern for a default generated NaN.
2428 *----------------------------------------------------------------------------*/
2430 float16
float16_default_nan(float_status
*status
)
2432 FloatParts p
= parts_default_nan(status
);
2433 p
.frac
>>= float16_params
.frac_shift
;
2434 return float16_pack_raw(p
);
2437 float32
float32_default_nan(float_status
*status
)
2439 FloatParts p
= parts_default_nan(status
);
2440 p
.frac
>>= float32_params
.frac_shift
;
2441 return float32_pack_raw(p
);
2444 float64
float64_default_nan(float_status
*status
)
2446 FloatParts p
= parts_default_nan(status
);
2447 p
.frac
>>= float64_params
.frac_shift
;
2448 return float64_pack_raw(p
);
2451 float128
float128_default_nan(float_status
*status
)
2453 FloatParts p
= parts_default_nan(status
);
2456 /* Extrapolate from the choices made by parts_default_nan to fill
2457 * in the quad-floating format. If the low bit is set, assume we
2458 * want to set all non-snan bits.
2460 r
.low
= -(p
.frac
& 1);
2461 r
.high
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- 48);
2462 r
.high
|= LIT64(0x7FFF000000000000);
2463 r
.high
|= (uint64_t)p
.sign
<< 63;
2468 /*----------------------------------------------------------------------------
2469 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2470 *----------------------------------------------------------------------------*/
2472 float16
float16_silence_nan(float16 a
, float_status
*status
)
2474 FloatParts p
= float16_unpack_raw(a
);
2475 p
.frac
<<= float16_params
.frac_shift
;
2476 p
= parts_silence_nan(p
, status
);
2477 p
.frac
>>= float16_params
.frac_shift
;
2478 return float16_pack_raw(p
);
2481 float32
float32_silence_nan(float32 a
, float_status
*status
)
2483 FloatParts p
= float32_unpack_raw(a
);
2484 p
.frac
<<= float32_params
.frac_shift
;
2485 p
= parts_silence_nan(p
, status
);
2486 p
.frac
>>= float32_params
.frac_shift
;
2487 return float32_pack_raw(p
);
2490 float64
float64_silence_nan(float64 a
, float_status
*status
)
2492 FloatParts p
= float64_unpack_raw(a
);
2493 p
.frac
<<= float64_params
.frac_shift
;
2494 p
= parts_silence_nan(p
, status
);
2495 p
.frac
>>= float64_params
.frac_shift
;
2496 return float64_pack_raw(p
);
2499 /*----------------------------------------------------------------------------
2500 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2501 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2502 | input. If `zSign' is 1, the input is negated before being converted to an
2503 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2504 | is simply rounded to an integer, with the inexact exception raised if the
2505 | input cannot be represented exactly as an integer. However, if the fixed-
2506 | point input is too large, the invalid exception is raised and the largest
2507 | positive or negative integer is returned.
2508 *----------------------------------------------------------------------------*/
2510 static int32_t roundAndPackInt32(flag zSign
, uint64_t absZ
, float_status
*status
)
2512 int8_t roundingMode
;
2513 flag roundNearestEven
;
2514 int8_t roundIncrement
, roundBits
;
2517 roundingMode
= status
->float_rounding_mode
;
2518 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2519 switch (roundingMode
) {
2520 case float_round_nearest_even
:
2521 case float_round_ties_away
:
2522 roundIncrement
= 0x40;
2524 case float_round_to_zero
:
2527 case float_round_up
:
2528 roundIncrement
= zSign
? 0 : 0x7f;
2530 case float_round_down
:
2531 roundIncrement
= zSign
? 0x7f : 0;
2536 roundBits
= absZ
& 0x7F;
2537 absZ
= ( absZ
+ roundIncrement
)>>7;
2538 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
2540 if ( zSign
) z
= - z
;
2541 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
2542 float_raise(float_flag_invalid
, status
);
2543 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
2546 status
->float_exception_flags
|= float_flag_inexact
;
2552 /*----------------------------------------------------------------------------
2553 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2554 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2555 | and returns the properly rounded 64-bit integer corresponding to the input.
2556 | If `zSign' is 1, the input is negated before being converted to an integer.
2557 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2558 | the inexact exception raised if the input cannot be represented exactly as
2559 | an integer. However, if the fixed-point input is too large, the invalid
2560 | exception is raised and the largest positive or negative integer is
2562 *----------------------------------------------------------------------------*/
2564 static int64_t roundAndPackInt64(flag zSign
, uint64_t absZ0
, uint64_t absZ1
,
2565 float_status
*status
)
2567 int8_t roundingMode
;
2568 flag roundNearestEven
, increment
;
2571 roundingMode
= status
->float_rounding_mode
;
2572 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2573 switch (roundingMode
) {
2574 case float_round_nearest_even
:
2575 case float_round_ties_away
:
2576 increment
= ((int64_t) absZ1
< 0);
2578 case float_round_to_zero
:
2581 case float_round_up
:
2582 increment
= !zSign
&& absZ1
;
2584 case float_round_down
:
2585 increment
= zSign
&& absZ1
;
2592 if ( absZ0
== 0 ) goto overflow
;
2593 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
2596 if ( zSign
) z
= - z
;
2597 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
2599 float_raise(float_flag_invalid
, status
);
2601 zSign
? (int64_t) LIT64( 0x8000000000000000 )
2602 : LIT64( 0x7FFFFFFFFFFFFFFF );
2605 status
->float_exception_flags
|= float_flag_inexact
;
2611 /*----------------------------------------------------------------------------
2612 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2613 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2614 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2615 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2616 | with the inexact exception raised if the input cannot be represented exactly
2617 | as an integer. However, if the fixed-point input is too large, the invalid
2618 | exception is raised and the largest unsigned integer is returned.
2619 *----------------------------------------------------------------------------*/
2621 static int64_t roundAndPackUint64(flag zSign
, uint64_t absZ0
,
2622 uint64_t absZ1
, float_status
*status
)
2624 int8_t roundingMode
;
2625 flag roundNearestEven
, increment
;
2627 roundingMode
= status
->float_rounding_mode
;
2628 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
2629 switch (roundingMode
) {
2630 case float_round_nearest_even
:
2631 case float_round_ties_away
:
2632 increment
= ((int64_t)absZ1
< 0);
2634 case float_round_to_zero
:
2637 case float_round_up
:
2638 increment
= !zSign
&& absZ1
;
2640 case float_round_down
:
2641 increment
= zSign
&& absZ1
;
2649 float_raise(float_flag_invalid
, status
);
2650 return LIT64(0xFFFFFFFFFFFFFFFF);
2652 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
2655 if (zSign
&& absZ0
) {
2656 float_raise(float_flag_invalid
, status
);
2661 status
->float_exception_flags
|= float_flag_inexact
;
2666 /*----------------------------------------------------------------------------
2667 | If `a' is denormal and we are in flush-to-zero mode then set the
2668 | input-denormal exception and return zero. Otherwise just return the value.
2669 *----------------------------------------------------------------------------*/
2670 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
2672 if (status
->flush_inputs_to_zero
) {
2673 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
2674 float_raise(float_flag_input_denormal
, status
);
2675 return make_float32(float32_val(a
) & 0x80000000);
2681 /*----------------------------------------------------------------------------
2682 | Normalizes the subnormal single-precision floating-point value represented
2683 | by the denormalized significand `aSig'. The normalized exponent and
2684 | significand are stored at the locations pointed to by `zExpPtr' and
2685 | `zSigPtr', respectively.
2686 *----------------------------------------------------------------------------*/
2689 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
2693 shiftCount
= countLeadingZeros32( aSig
) - 8;
2694 *zSigPtr
= aSig
<<shiftCount
;
2695 *zExpPtr
= 1 - shiftCount
;
2699 /*----------------------------------------------------------------------------
2700 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2701 | and significand `zSig', and returns the proper single-precision floating-
2702 | point value corresponding to the abstract input. Ordinarily, the abstract
2703 | value is simply rounded and packed into the single-precision format, with
2704 | the inexact exception raised if the abstract input cannot be represented
2705 | exactly. However, if the abstract value is too large, the overflow and
2706 | inexact exceptions are raised and an infinity or maximal finite value is
2707 | returned. If the abstract value is too small, the input value is rounded to
2708 | a subnormal number, and the underflow and inexact exceptions are raised if
2709 | the abstract input cannot be represented exactly as a subnormal single-
2710 | precision floating-point number.
2711 | The input significand `zSig' has its binary point between bits 30
2712 | and 29, which is 7 bits to the left of the usual location. This shifted
2713 | significand must be normalized or smaller. If `zSig' is not normalized,
2714 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2715 | and it must not require rounding. In the usual case that `zSig' is
2716 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2717 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2718 | Binary Floating-Point Arithmetic.
2719 *----------------------------------------------------------------------------*/
2721 static float32
roundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
2722 float_status
*status
)
2724 int8_t roundingMode
;
2725 flag roundNearestEven
;
2726 int8_t roundIncrement
, roundBits
;
2729 roundingMode
= status
->float_rounding_mode
;
2730 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2731 switch (roundingMode
) {
2732 case float_round_nearest_even
:
2733 case float_round_ties_away
:
2734 roundIncrement
= 0x40;
2736 case float_round_to_zero
:
2739 case float_round_up
:
2740 roundIncrement
= zSign
? 0 : 0x7f;
2742 case float_round_down
:
2743 roundIncrement
= zSign
? 0x7f : 0;
2749 roundBits
= zSig
& 0x7F;
2750 if ( 0xFD <= (uint16_t) zExp
) {
2751 if ( ( 0xFD < zExp
)
2752 || ( ( zExp
== 0xFD )
2753 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
2755 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2756 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
2759 if (status
->flush_to_zero
) {
2760 float_raise(float_flag_output_denormal
, status
);
2761 return packFloat32(zSign
, 0, 0);
2764 (status
->float_detect_tininess
2765 == float_tininess_before_rounding
)
2767 || ( zSig
+ roundIncrement
< 0x80000000 );
2768 shift32RightJamming( zSig
, - zExp
, &zSig
);
2770 roundBits
= zSig
& 0x7F;
2771 if (isTiny
&& roundBits
) {
2772 float_raise(float_flag_underflow
, status
);
2777 status
->float_exception_flags
|= float_flag_inexact
;
2779 zSig
= ( zSig
+ roundIncrement
)>>7;
2780 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
2781 if ( zSig
== 0 ) zExp
= 0;
2782 return packFloat32( zSign
, zExp
, zSig
);
2786 /*----------------------------------------------------------------------------
2787 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2788 | and significand `zSig', and returns the proper single-precision floating-
2789 | point value corresponding to the abstract input. This routine is just like
2790 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2791 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2792 | floating-point exponent.
2793 *----------------------------------------------------------------------------*/
2796 normalizeRoundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
2797 float_status
*status
)
2801 shiftCount
= countLeadingZeros32( zSig
) - 1;
2802 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
2807 /*----------------------------------------------------------------------------
2808 | If `a' is denormal and we are in flush-to-zero mode then set the
2809 | input-denormal exception and return zero. Otherwise just return the value.
2810 *----------------------------------------------------------------------------*/
2811 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
2813 if (status
->flush_inputs_to_zero
) {
2814 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
2815 float_raise(float_flag_input_denormal
, status
);
2816 return make_float64(float64_val(a
) & (1ULL << 63));
2822 /*----------------------------------------------------------------------------
2823 | Normalizes the subnormal double-precision floating-point value represented
2824 | by the denormalized significand `aSig'. The normalized exponent and
2825 | significand are stored at the locations pointed to by `zExpPtr' and
2826 | `zSigPtr', respectively.
2827 *----------------------------------------------------------------------------*/
2830 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
2834 shiftCount
= countLeadingZeros64( aSig
) - 11;
2835 *zSigPtr
= aSig
<<shiftCount
;
2836 *zExpPtr
= 1 - shiftCount
;
2840 /*----------------------------------------------------------------------------
2841 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2842 | double-precision floating-point value, returning the result. After being
2843 | shifted into the proper positions, the three fields are simply added
2844 | together to form the result. This means that any integer portion of `zSig'
2845 | will be added into the exponent. Since a properly normalized significand
2846 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2847 | than the desired result exponent whenever `zSig' is a complete, normalized
2849 *----------------------------------------------------------------------------*/
2851 static inline float64
packFloat64(flag zSign
, int zExp
, uint64_t zSig
)
2854 return make_float64(
2855 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
2859 /*----------------------------------------------------------------------------
2860 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2861 | and significand `zSig', and returns the proper double-precision floating-
2862 | point value corresponding to the abstract input. Ordinarily, the abstract
2863 | value is simply rounded and packed into the double-precision format, with
2864 | the inexact exception raised if the abstract input cannot be represented
2865 | exactly. However, if the abstract value is too large, the overflow and
2866 | inexact exceptions are raised and an infinity or maximal finite value is
2867 | returned. If the abstract value is too small, the input value is rounded to
2868 | a subnormal number, and the underflow and inexact exceptions are raised if
2869 | the abstract input cannot be represented exactly as a subnormal double-
2870 | precision floating-point number.
2871 | The input significand `zSig' has its binary point between bits 62
2872 | and 61, which is 10 bits to the left of the usual location. This shifted
2873 | significand must be normalized or smaller. If `zSig' is not normalized,
2874 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2875 | and it must not require rounding. In the usual case that `zSig' is
2876 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2877 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2878 | Binary Floating-Point Arithmetic.
2879 *----------------------------------------------------------------------------*/
2881 static float64
roundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
2882 float_status
*status
)
2884 int8_t roundingMode
;
2885 flag roundNearestEven
;
2886 int roundIncrement
, roundBits
;
2889 roundingMode
= status
->float_rounding_mode
;
2890 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2891 switch (roundingMode
) {
2892 case float_round_nearest_even
:
2893 case float_round_ties_away
:
2894 roundIncrement
= 0x200;
2896 case float_round_to_zero
:
2899 case float_round_up
:
2900 roundIncrement
= zSign
? 0 : 0x3ff;
2902 case float_round_down
:
2903 roundIncrement
= zSign
? 0x3ff : 0;
2905 case float_round_to_odd
:
2906 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
2911 roundBits
= zSig
& 0x3FF;
2912 if ( 0x7FD <= (uint16_t) zExp
) {
2913 if ( ( 0x7FD < zExp
)
2914 || ( ( zExp
== 0x7FD )
2915 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
2917 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
2918 roundIncrement
!= 0;
2919 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2920 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
2923 if (status
->flush_to_zero
) {
2924 float_raise(float_flag_output_denormal
, status
);
2925 return packFloat64(zSign
, 0, 0);
2928 (status
->float_detect_tininess
2929 == float_tininess_before_rounding
)
2931 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
2932 shift64RightJamming( zSig
, - zExp
, &zSig
);
2934 roundBits
= zSig
& 0x3FF;
2935 if (isTiny
&& roundBits
) {
2936 float_raise(float_flag_underflow
, status
);
2938 if (roundingMode
== float_round_to_odd
) {
2940 * For round-to-odd case, the roundIncrement depends on
2941 * zSig which just changed.
2943 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
2948 status
->float_exception_flags
|= float_flag_inexact
;
2950 zSig
= ( zSig
+ roundIncrement
)>>10;
2951 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
2952 if ( zSig
== 0 ) zExp
= 0;
2953 return packFloat64( zSign
, zExp
, zSig
);
2957 /*----------------------------------------------------------------------------
2958 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2959 | and significand `zSig', and returns the proper double-precision floating-
2960 | point value corresponding to the abstract input. This routine is just like
2961 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2962 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2963 | floating-point exponent.
2964 *----------------------------------------------------------------------------*/
2967 normalizeRoundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
2968 float_status
*status
)
2972 shiftCount
= countLeadingZeros64( zSig
) - 1;
2973 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
2978 /*----------------------------------------------------------------------------
2979 | Normalizes the subnormal extended double-precision floating-point value
2980 | represented by the denormalized significand `aSig'. The normalized exponent
2981 | and significand are stored at the locations pointed to by `zExpPtr' and
2982 | `zSigPtr', respectively.
2983 *----------------------------------------------------------------------------*/
2985 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
2990 shiftCount
= countLeadingZeros64( aSig
);
2991 *zSigPtr
= aSig
<<shiftCount
;
2992 *zExpPtr
= 1 - shiftCount
;
2995 /*----------------------------------------------------------------------------
2996 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2997 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2998 | and returns the proper extended double-precision floating-point value
2999 | corresponding to the abstract input. Ordinarily, the abstract value is
3000 | rounded and packed into the extended double-precision format, with the
3001 | inexact exception raised if the abstract input cannot be represented
3002 | exactly. However, if the abstract value is too large, the overflow and
3003 | inexact exceptions are raised and an infinity or maximal finite value is
3004 | returned. If the abstract value is too small, the input value is rounded to
3005 | a subnormal number, and the underflow and inexact exceptions are raised if
3006 | the abstract input cannot be represented exactly as a subnormal extended
3007 | double-precision floating-point number.
3008 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
3009 | number of bits as single or double precision, respectively. Otherwise, the
3010 | result is rounded to the full precision of the extended double-precision
3012 | The input significand must be normalized or smaller. If the input
3013 | significand is not normalized, `zExp' must be 0; in that case, the result
3014 | returned is a subnormal number, and it must not require rounding. The
3015 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3016 | Floating-Point Arithmetic.
3017 *----------------------------------------------------------------------------*/
3019 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, flag zSign
,
3020 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
3021 float_status
*status
)
3023 int8_t roundingMode
;
3024 flag roundNearestEven
, increment
, isTiny
;
3025 int64_t roundIncrement
, roundMask
, roundBits
;
3027 roundingMode
= status
->float_rounding_mode
;
3028 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3029 if ( roundingPrecision
== 80 ) goto precision80
;
3030 if ( roundingPrecision
== 64 ) {
3031 roundIncrement
= LIT64( 0x0000000000000400 );
3032 roundMask
= LIT64( 0x00000000000007FF );
3034 else if ( roundingPrecision
== 32 ) {
3035 roundIncrement
= LIT64( 0x0000008000000000 );
3036 roundMask
= LIT64( 0x000000FFFFFFFFFF );
3041 zSig0
|= ( zSig1
!= 0 );
3042 switch (roundingMode
) {
3043 case float_round_nearest_even
:
3044 case float_round_ties_away
:
3046 case float_round_to_zero
:
3049 case float_round_up
:
3050 roundIncrement
= zSign
? 0 : roundMask
;
3052 case float_round_down
:
3053 roundIncrement
= zSign
? roundMask
: 0;
3058 roundBits
= zSig0
& roundMask
;
3059 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
3060 if ( ( 0x7FFE < zExp
)
3061 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
3066 if (status
->flush_to_zero
) {
3067 float_raise(float_flag_output_denormal
, status
);
3068 return packFloatx80(zSign
, 0, 0);
3071 (status
->float_detect_tininess
3072 == float_tininess_before_rounding
)
3074 || ( zSig0
<= zSig0
+ roundIncrement
);
3075 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
3077 roundBits
= zSig0
& roundMask
;
3078 if (isTiny
&& roundBits
) {
3079 float_raise(float_flag_underflow
, status
);
3082 status
->float_exception_flags
|= float_flag_inexact
;
3084 zSig0
+= roundIncrement
;
3085 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
3086 roundIncrement
= roundMask
+ 1;
3087 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
3088 roundMask
|= roundIncrement
;
3090 zSig0
&= ~ roundMask
;
3091 return packFloatx80( zSign
, zExp
, zSig0
);
3095 status
->float_exception_flags
|= float_flag_inexact
;
3097 zSig0
+= roundIncrement
;
3098 if ( zSig0
< roundIncrement
) {
3100 zSig0
= LIT64( 0x8000000000000000 );
3102 roundIncrement
= roundMask
+ 1;
3103 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
3104 roundMask
|= roundIncrement
;
3106 zSig0
&= ~ roundMask
;
3107 if ( zSig0
== 0 ) zExp
= 0;
3108 return packFloatx80( zSign
, zExp
, zSig0
);
3110 switch (roundingMode
) {
3111 case float_round_nearest_even
:
3112 case float_round_ties_away
:
3113 increment
= ((int64_t)zSig1
< 0);
3115 case float_round_to_zero
:
3118 case float_round_up
:
3119 increment
= !zSign
&& zSig1
;
3121 case float_round_down
:
3122 increment
= zSign
&& zSig1
;
3127 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
3128 if ( ( 0x7FFE < zExp
)
3129 || ( ( zExp
== 0x7FFE )
3130 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
3136 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3137 if ( ( roundingMode
== float_round_to_zero
)
3138 || ( zSign
&& ( roundingMode
== float_round_up
) )
3139 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
3141 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
3143 return packFloatx80(zSign
,
3144 floatx80_infinity_high
,
3145 floatx80_infinity_low
);
3149 (status
->float_detect_tininess
3150 == float_tininess_before_rounding
)
3153 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
3154 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
3156 if (isTiny
&& zSig1
) {
3157 float_raise(float_flag_underflow
, status
);
3160 status
->float_exception_flags
|= float_flag_inexact
;
3162 switch (roundingMode
) {
3163 case float_round_nearest_even
:
3164 case float_round_ties_away
:
3165 increment
= ((int64_t)zSig1
< 0);
3167 case float_round_to_zero
:
3170 case float_round_up
:
3171 increment
= !zSign
&& zSig1
;
3173 case float_round_down
:
3174 increment
= zSign
&& zSig1
;
3182 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
3183 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
3185 return packFloatx80( zSign
, zExp
, zSig0
);
3189 status
->float_exception_flags
|= float_flag_inexact
;
3195 zSig0
= LIT64( 0x8000000000000000 );
3198 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
3202 if ( zSig0
== 0 ) zExp
= 0;
3204 return packFloatx80( zSign
, zExp
, zSig0
);
3208 /*----------------------------------------------------------------------------
3209 | Takes an abstract floating-point value having sign `zSign', exponent
3210 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3211 | and returns the proper extended double-precision floating-point value
3212 | corresponding to the abstract input. This routine is just like
3213 | `roundAndPackFloatx80' except that the input significand does not have to be
3215 *----------------------------------------------------------------------------*/
3217 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
3218 flag zSign
, int32_t zExp
,
3219 uint64_t zSig0
, uint64_t zSig1
,
3220 float_status
*status
)
3229 shiftCount
= countLeadingZeros64( zSig0
);
3230 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3232 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
3233 zSig0
, zSig1
, status
);
3237 /*----------------------------------------------------------------------------
3238 | Returns the least-significant 64 fraction bits of the quadruple-precision
3239 | floating-point value `a'.
3240 *----------------------------------------------------------------------------*/
3242 static inline uint64_t extractFloat128Frac1( float128 a
)
3249 /*----------------------------------------------------------------------------
3250 | Returns the most-significant 48 fraction bits of the quadruple-precision
3251 | floating-point value `a'.
3252 *----------------------------------------------------------------------------*/
3254 static inline uint64_t extractFloat128Frac0( float128 a
)
3257 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
3261 /*----------------------------------------------------------------------------
3262 | Returns the exponent bits of the quadruple-precision floating-point value
3264 *----------------------------------------------------------------------------*/
3266 static inline int32_t extractFloat128Exp( float128 a
)
3269 return ( a
.high
>>48 ) & 0x7FFF;
3273 /*----------------------------------------------------------------------------
3274 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3275 *----------------------------------------------------------------------------*/
3277 static inline flag
extractFloat128Sign( float128 a
)
3284 /*----------------------------------------------------------------------------
3285 | Normalizes the subnormal quadruple-precision floating-point value
3286 | represented by the denormalized significand formed by the concatenation of
3287 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
3288 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
3289 | significand are stored at the location pointed to by `zSig0Ptr', and the
3290 | least significant 64 bits of the normalized significand are stored at the
3291 | location pointed to by `zSig1Ptr'.
3292 *----------------------------------------------------------------------------*/
3295 normalizeFloat128Subnormal(
3306 shiftCount
= countLeadingZeros64( aSig1
) - 15;
3307 if ( shiftCount
< 0 ) {
3308 *zSig0Ptr
= aSig1
>>( - shiftCount
);
3309 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
3312 *zSig0Ptr
= aSig1
<<shiftCount
;
3315 *zExpPtr
= - shiftCount
- 63;
3318 shiftCount
= countLeadingZeros64( aSig0
) - 15;
3319 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
3320 *zExpPtr
= 1 - shiftCount
;
3325 /*----------------------------------------------------------------------------
3326 | Packs the sign `zSign', the exponent `zExp', and the significand formed
3327 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3328 | floating-point value, returning the result. After being shifted into the
3329 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3330 | added together to form the most significant 32 bits of the result. This
3331 | means that any integer portion of `zSig0' will be added into the exponent.
3332 | Since a properly normalized significand will have an integer portion equal
3333 | to 1, the `zExp' input should be 1 less than the desired result exponent
3334 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3336 *----------------------------------------------------------------------------*/
3338 static inline float128
3339 packFloat128( flag zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
3344 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
3349 /*----------------------------------------------------------------------------
3350 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3351 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3352 | and `zSig2', and returns the proper quadruple-precision floating-point value
3353 | corresponding to the abstract input. Ordinarily, the abstract value is
3354 | simply rounded and packed into the quadruple-precision format, with the
3355 | inexact exception raised if the abstract input cannot be represented
3356 | exactly. However, if the abstract value is too large, the overflow and
3357 | inexact exceptions are raised and an infinity or maximal finite value is
3358 | returned. If the abstract value is too small, the input value is rounded to
3359 | a subnormal number, and the underflow and inexact exceptions are raised if
3360 | the abstract input cannot be represented exactly as a subnormal quadruple-
3361 | precision floating-point number.
3362 | The input significand must be normalized or smaller. If the input
3363 | significand is not normalized, `zExp' must be 0; in that case, the result
3364 | returned is a subnormal number, and it must not require rounding. In the
3365 | usual case that the input significand is normalized, `zExp' must be 1 less
3366 | than the ``true'' floating-point exponent. The handling of underflow and
3367 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3368 *----------------------------------------------------------------------------*/
3370 static float128
roundAndPackFloat128(flag zSign
, int32_t zExp
,
3371 uint64_t zSig0
, uint64_t zSig1
,
3372 uint64_t zSig2
, float_status
*status
)
3374 int8_t roundingMode
;
3375 flag roundNearestEven
, increment
, isTiny
;
3377 roundingMode
= status
->float_rounding_mode
;
3378 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3379 switch (roundingMode
) {
3380 case float_round_nearest_even
:
3381 case float_round_ties_away
:
3382 increment
= ((int64_t)zSig2
< 0);
3384 case float_round_to_zero
:
3387 case float_round_up
:
3388 increment
= !zSign
&& zSig2
;
3390 case float_round_down
:
3391 increment
= zSign
&& zSig2
;
3393 case float_round_to_odd
:
3394 increment
= !(zSig1
& 0x1) && zSig2
;
3399 if ( 0x7FFD <= (uint32_t) zExp
) {
3400 if ( ( 0x7FFD < zExp
)
3401 || ( ( zExp
== 0x7FFD )
3403 LIT64( 0x0001FFFFFFFFFFFF ),
3404 LIT64( 0xFFFFFFFFFFFFFFFF ),
3411 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3412 if ( ( roundingMode
== float_round_to_zero
)
3413 || ( zSign
&& ( roundingMode
== float_round_up
) )
3414 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
3415 || (roundingMode
== float_round_to_odd
)
3421 LIT64( 0x0000FFFFFFFFFFFF ),
3422 LIT64( 0xFFFFFFFFFFFFFFFF )
3425 return packFloat128( zSign
, 0x7FFF, 0, 0 );
3428 if (status
->flush_to_zero
) {
3429 float_raise(float_flag_output_denormal
, status
);
3430 return packFloat128(zSign
, 0, 0, 0);
3433 (status
->float_detect_tininess
3434 == float_tininess_before_rounding
)
3440 LIT64( 0x0001FFFFFFFFFFFF ),
3441 LIT64( 0xFFFFFFFFFFFFFFFF )
3443 shift128ExtraRightJamming(
3444 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
3446 if (isTiny
&& zSig2
) {
3447 float_raise(float_flag_underflow
, status
);
3449 switch (roundingMode
) {
3450 case float_round_nearest_even
:
3451 case float_round_ties_away
:
3452 increment
= ((int64_t)zSig2
< 0);
3454 case float_round_to_zero
:
3457 case float_round_up
:
3458 increment
= !zSign
&& zSig2
;
3460 case float_round_down
:
3461 increment
= zSign
&& zSig2
;
3463 case float_round_to_odd
:
3464 increment
= !(zSig1
& 0x1) && zSig2
;
3472 status
->float_exception_flags
|= float_flag_inexact
;
3475 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
3476 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
3479 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
3481 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
3485 /*----------------------------------------------------------------------------
3486 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3487 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3488 | returns the proper quadruple-precision floating-point value corresponding
3489 | to the abstract input. This routine is just like `roundAndPackFloat128'
3490 | except that the input significand has fewer bits and does not have to be
3491 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3493 *----------------------------------------------------------------------------*/
3495 static float128
normalizeRoundAndPackFloat128(flag zSign
, int32_t zExp
,
3496 uint64_t zSig0
, uint64_t zSig1
,
3497 float_status
*status
)
3507 shiftCount
= countLeadingZeros64( zSig0
) - 15;
3508 if ( 0 <= shiftCount
) {
3510 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3513 shift128ExtraRightJamming(
3514 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
3517 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
3522 /*----------------------------------------------------------------------------
3523 | Returns the result of converting the 32-bit two's complement integer `a'
3524 | to the extended double-precision floating-point format. The conversion
3525 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3527 *----------------------------------------------------------------------------*/
3529 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
3536 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
3538 absA
= zSign
? - a
: a
;
3539 shiftCount
= countLeadingZeros32( absA
) + 32;
3541 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
3545 /*----------------------------------------------------------------------------
3546 | Returns the result of converting the 32-bit two's complement integer `a' to
3547 | the quadruple-precision floating-point format. The conversion is performed
3548 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3549 *----------------------------------------------------------------------------*/
3551 float128
int32_to_float128(int32_t a
, float_status
*status
)
3558 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
3560 absA
= zSign
? - a
: a
;
3561 shiftCount
= countLeadingZeros32( absA
) + 17;
3563 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
3567 /*----------------------------------------------------------------------------
3568 | Returns the result of converting the 64-bit two's complement integer `a'
3569 | to the extended double-precision floating-point format. The conversion
3570 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3572 *----------------------------------------------------------------------------*/
3574 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
3580 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
3582 absA
= zSign
? - a
: a
;
3583 shiftCount
= countLeadingZeros64( absA
);
3584 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
3588 /*----------------------------------------------------------------------------
3589 | Returns the result of converting the 64-bit two's complement integer `a' to
3590 | the quadruple-precision floating-point format. The conversion is performed
3591 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3592 *----------------------------------------------------------------------------*/
3594 float128
int64_to_float128(int64_t a
, float_status
*status
)
3600 uint64_t zSig0
, zSig1
;
3602 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
3604 absA
= zSign
? - a
: a
;
3605 shiftCount
= countLeadingZeros64( absA
) + 49;
3606 zExp
= 0x406E - shiftCount
;
3607 if ( 64 <= shiftCount
) {
3616 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3617 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
3621 /*----------------------------------------------------------------------------
3622 | Returns the result of converting the 64-bit unsigned integer `a'
3623 | to the quadruple-precision floating-point format. The conversion is performed
3624 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3625 *----------------------------------------------------------------------------*/
3627 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
3630 return float128_zero
;
3632 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
3635 /*----------------------------------------------------------------------------
3636 | Returns the result of converting the single-precision floating-point value
3637 | `a' to the extended double-precision floating-point format. The conversion
3638 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3640 *----------------------------------------------------------------------------*/
3642 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
3648 a
= float32_squash_input_denormal(a
, status
);
3649 aSig
= extractFloat32Frac( a
);
3650 aExp
= extractFloat32Exp( a
);
3651 aSign
= extractFloat32Sign( a
);
3652 if ( aExp
== 0xFF ) {
3654 return commonNaNToFloatx80(float32ToCommonNaN(a
, status
), status
);
3656 return packFloatx80(aSign
,
3657 floatx80_infinity_high
,
3658 floatx80_infinity_low
);
3661 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
3662 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3665 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
3669 /*----------------------------------------------------------------------------
3670 | Returns the result of converting the single-precision floating-point value
3671 | `a' to the double-precision floating-point format. The conversion is
3672 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3674 *----------------------------------------------------------------------------*/
3676 float128
float32_to_float128(float32 a
, float_status
*status
)
3682 a
= float32_squash_input_denormal(a
, status
);
3683 aSig
= extractFloat32Frac( a
);
3684 aExp
= extractFloat32Exp( a
);
3685 aSign
= extractFloat32Sign( a
);
3686 if ( aExp
== 0xFF ) {
3688 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
3690 return packFloat128( aSign
, 0x7FFF, 0, 0 );
3693 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
3694 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3697 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
3701 /*----------------------------------------------------------------------------
3702 | Returns the remainder of the single-precision floating-point value `a'
3703 | with respect to the corresponding value `b'. The operation is performed
3704 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3705 *----------------------------------------------------------------------------*/
3707 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
3710 int aExp
, bExp
, expDiff
;
3711 uint32_t aSig
, bSig
;
3713 uint64_t aSig64
, bSig64
, q64
;
3714 uint32_t alternateASig
;
3716 a
= float32_squash_input_denormal(a
, status
);
3717 b
= float32_squash_input_denormal(b
, status
);
3719 aSig
= extractFloat32Frac( a
);
3720 aExp
= extractFloat32Exp( a
);
3721 aSign
= extractFloat32Sign( a
);
3722 bSig
= extractFloat32Frac( b
);
3723 bExp
= extractFloat32Exp( b
);
3724 if ( aExp
== 0xFF ) {
3725 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
3726 return propagateFloat32NaN(a
, b
, status
);
3728 float_raise(float_flag_invalid
, status
);
3729 return float32_default_nan(status
);
3731 if ( bExp
== 0xFF ) {
3733 return propagateFloat32NaN(a
, b
, status
);
3739 float_raise(float_flag_invalid
, status
);
3740 return float32_default_nan(status
);
3742 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
3745 if ( aSig
== 0 ) return a
;
3746 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3748 expDiff
= aExp
- bExp
;
3751 if ( expDiff
< 32 ) {
3754 if ( expDiff
< 0 ) {
3755 if ( expDiff
< -1 ) return a
;
3758 q
= ( bSig
<= aSig
);
3759 if ( q
) aSig
-= bSig
;
3760 if ( 0 < expDiff
) {
3761 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
3764 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3772 if ( bSig
<= aSig
) aSig
-= bSig
;
3773 aSig64
= ( (uint64_t) aSig
)<<40;
3774 bSig64
= ( (uint64_t) bSig
)<<40;
3776 while ( 0 < expDiff
) {
3777 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
3778 q64
= ( 2 < q64
) ? q64
- 2 : 0;
3779 aSig64
= - ( ( bSig
* q64
)<<38 );
3783 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
3784 q64
= ( 2 < q64
) ? q64
- 2 : 0;
3785 q
= q64
>>( 64 - expDiff
);
3787 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
3790 alternateASig
= aSig
;
3793 } while ( 0 <= (int32_t) aSig
);
3794 sigMean
= aSig
+ alternateASig
;
3795 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3796 aSig
= alternateASig
;
3798 zSign
= ( (int32_t) aSig
< 0 );
3799 if ( zSign
) aSig
= - aSig
;
3800 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
3805 /*----------------------------------------------------------------------------
3806 | Returns the binary exponential of the single-precision floating-point value
3807 | `a'. The operation is performed according to the IEC/IEEE Standard for
3808 | Binary Floating-Point Arithmetic.
3810 | Uses the following identities:
3812 | 1. -------------------------------------------------------------------------
3816 | 2. -------------------------------------------------------------------------
3819 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3821 *----------------------------------------------------------------------------*/
3823 static const float64 float32_exp2_coefficients
[15] =
3825 const_float64( 0x3ff0000000000000ll
), /* 1 */
3826 const_float64( 0x3fe0000000000000ll
), /* 2 */
3827 const_float64( 0x3fc5555555555555ll
), /* 3 */
3828 const_float64( 0x3fa5555555555555ll
), /* 4 */
3829 const_float64( 0x3f81111111111111ll
), /* 5 */
3830 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
3831 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
3832 const_float64( 0x3efa01a01a01a01all
), /* 8 */
3833 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
3834 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
3835 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
3836 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
3837 const_float64( 0x3de6124613a86d09ll
), /* 13 */
3838 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
3839 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
3842 float32
float32_exp2(float32 a
, float_status
*status
)
3849 a
= float32_squash_input_denormal(a
, status
);
3851 aSig
= extractFloat32Frac( a
);
3852 aExp
= extractFloat32Exp( a
);
3853 aSign
= extractFloat32Sign( a
);
3855 if ( aExp
== 0xFF) {
3857 return propagateFloat32NaN(a
, float32_zero
, status
);
3859 return (aSign
) ? float32_zero
: a
;
3862 if (aSig
== 0) return float32_one
;
3865 float_raise(float_flag_inexact
, status
);
3867 /* ******************************* */
3868 /* using float64 for approximation */
3869 /* ******************************* */
3870 x
= float32_to_float64(a
, status
);
3871 x
= float64_mul(x
, float64_ln2
, status
);
3875 for (i
= 0 ; i
< 15 ; i
++) {
3878 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
3879 r
= float64_add(r
, f
, status
);
3881 xn
= float64_mul(xn
, x
, status
);
3884 return float64_to_float32(r
, status
);
3887 /*----------------------------------------------------------------------------
3888 | Returns the binary log of the single-precision floating-point value `a'.
3889 | The operation is performed according to the IEC/IEEE Standard for Binary
3890 | Floating-Point Arithmetic.
3891 *----------------------------------------------------------------------------*/
3892 float32
float32_log2(float32 a
, float_status
*status
)
3896 uint32_t aSig
, zSig
, i
;
3898 a
= float32_squash_input_denormal(a
, status
);
3899 aSig
= extractFloat32Frac( a
);
3900 aExp
= extractFloat32Exp( a
);
3901 aSign
= extractFloat32Sign( a
);
3904 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
3905 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3908 float_raise(float_flag_invalid
, status
);
3909 return float32_default_nan(status
);
3911 if ( aExp
== 0xFF ) {
3913 return propagateFloat32NaN(a
, float32_zero
, status
);
3923 for (i
= 1 << 22; i
> 0; i
>>= 1) {
3924 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
3925 if ( aSig
& 0x01000000 ) {
3934 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
3937 /*----------------------------------------------------------------------------
3938 | Returns 1 if the single-precision floating-point value `a' is equal to
3939 | the corresponding value `b', and 0 otherwise. The invalid exception is
3940 | raised if either operand is a NaN. Otherwise, the comparison is performed
3941 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3942 *----------------------------------------------------------------------------*/
3944 int float32_eq(float32 a
, float32 b
, float_status
*status
)
3947 a
= float32_squash_input_denormal(a
, status
);
3948 b
= float32_squash_input_denormal(b
, status
);
3950 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3951 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3953 float_raise(float_flag_invalid
, status
);
3956 av
= float32_val(a
);
3957 bv
= float32_val(b
);
3958 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3961 /*----------------------------------------------------------------------------
3962 | Returns 1 if the single-precision floating-point value `a' is less than
3963 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3964 | exception is raised if either operand is a NaN. The comparison is performed
3965 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3966 *----------------------------------------------------------------------------*/
3968 int float32_le(float32 a
, float32 b
, float_status
*status
)
3972 a
= float32_squash_input_denormal(a
, status
);
3973 b
= float32_squash_input_denormal(b
, status
);
3975 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3976 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3978 float_raise(float_flag_invalid
, status
);
3981 aSign
= extractFloat32Sign( a
);
3982 bSign
= extractFloat32Sign( b
);
3983 av
= float32_val(a
);
3984 bv
= float32_val(b
);
3985 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3986 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3990 /*----------------------------------------------------------------------------
3991 | Returns 1 if the single-precision floating-point value `a' is less than
3992 | the corresponding value `b', and 0 otherwise. The invalid exception is
3993 | raised if either operand is a NaN. The comparison is performed according
3994 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3995 *----------------------------------------------------------------------------*/
3997 int float32_lt(float32 a
, float32 b
, float_status
*status
)
4001 a
= float32_squash_input_denormal(a
, status
);
4002 b
= float32_squash_input_denormal(b
, status
);
4004 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4005 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4007 float_raise(float_flag_invalid
, status
);
4010 aSign
= extractFloat32Sign( a
);
4011 bSign
= extractFloat32Sign( b
);
4012 av
= float32_val(a
);
4013 bv
= float32_val(b
);
4014 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
4015 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4019 /*----------------------------------------------------------------------------
4020 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4021 | be compared, and 0 otherwise. The invalid exception is raised if either
4022 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4023 | Standard for Binary Floating-Point Arithmetic.
4024 *----------------------------------------------------------------------------*/
4026 int float32_unordered(float32 a
, float32 b
, float_status
*status
)
4028 a
= float32_squash_input_denormal(a
, status
);
4029 b
= float32_squash_input_denormal(b
, status
);
4031 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4032 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4034 float_raise(float_flag_invalid
, status
);
4040 /*----------------------------------------------------------------------------
4041 | Returns 1 if the single-precision floating-point value `a' is equal to
4042 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4043 | exception. The comparison is performed according to the IEC/IEEE Standard
4044 | for Binary Floating-Point Arithmetic.
4045 *----------------------------------------------------------------------------*/
4047 int float32_eq_quiet(float32 a
, float32 b
, float_status
*status
)
4049 a
= float32_squash_input_denormal(a
, status
);
4050 b
= float32_squash_input_denormal(b
, status
);
4052 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4053 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4055 if (float32_is_signaling_nan(a
, status
)
4056 || float32_is_signaling_nan(b
, status
)) {
4057 float_raise(float_flag_invalid
, status
);
4061 return ( float32_val(a
) == float32_val(b
) ) ||
4062 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
4065 /*----------------------------------------------------------------------------
4066 | Returns 1 if the single-precision floating-point value `a' is less than or
4067 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4068 | cause an exception. Otherwise, the comparison is performed according to the
4069 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4070 *----------------------------------------------------------------------------*/
4072 int float32_le_quiet(float32 a
, float32 b
, float_status
*status
)
4076 a
= float32_squash_input_denormal(a
, status
);
4077 b
= float32_squash_input_denormal(b
, status
);
4079 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4080 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4082 if (float32_is_signaling_nan(a
, status
)
4083 || float32_is_signaling_nan(b
, status
)) {
4084 float_raise(float_flag_invalid
, status
);
4088 aSign
= extractFloat32Sign( a
);
4089 bSign
= extractFloat32Sign( b
);
4090 av
= float32_val(a
);
4091 bv
= float32_val(b
);
4092 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
4093 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4097 /*----------------------------------------------------------------------------
4098 | Returns 1 if the single-precision floating-point value `a' is less than
4099 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4100 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4101 | Standard for Binary Floating-Point Arithmetic.
4102 *----------------------------------------------------------------------------*/
4104 int float32_lt_quiet(float32 a
, float32 b
, float_status
*status
)
4108 a
= float32_squash_input_denormal(a
, status
);
4109 b
= float32_squash_input_denormal(b
, status
);
4111 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4112 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4114 if (float32_is_signaling_nan(a
, status
)
4115 || float32_is_signaling_nan(b
, status
)) {
4116 float_raise(float_flag_invalid
, status
);
4120 aSign
= extractFloat32Sign( a
);
4121 bSign
= extractFloat32Sign( b
);
4122 av
= float32_val(a
);
4123 bv
= float32_val(b
);
4124 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
4125 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4129 /*----------------------------------------------------------------------------
4130 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4131 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4132 | comparison is performed according to the IEC/IEEE Standard for Binary
4133 | Floating-Point Arithmetic.
4134 *----------------------------------------------------------------------------*/
4136 int float32_unordered_quiet(float32 a
, float32 b
, float_status
*status
)
4138 a
= float32_squash_input_denormal(a
, status
);
4139 b
= float32_squash_input_denormal(b
, status
);
4141 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4142 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4144 if (float32_is_signaling_nan(a
, status
)
4145 || float32_is_signaling_nan(b
, status
)) {
4146 float_raise(float_flag_invalid
, status
);
4153 /*----------------------------------------------------------------------------
4154 | If `a' is denormal and we are in flush-to-zero mode then set the
4155 | input-denormal exception and return zero. Otherwise just return the value.
4156 *----------------------------------------------------------------------------*/
4157 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
4159 if (status
->flush_inputs_to_zero
) {
4160 if (extractFloat16Exp(a
) == 0 && extractFloat16Frac(a
) != 0) {
4161 float_raise(float_flag_input_denormal
, status
);
4162 return make_float16(float16_val(a
) & 0x8000);
4168 /*----------------------------------------------------------------------------
4169 | Returns the result of converting the double-precision floating-point value
4170 | `a' to the extended double-precision floating-point format. The conversion
4171 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4173 *----------------------------------------------------------------------------*/
4175 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
4181 a
= float64_squash_input_denormal(a
, status
);
4182 aSig
= extractFloat64Frac( a
);
4183 aExp
= extractFloat64Exp( a
);
4184 aSign
= extractFloat64Sign( a
);
4185 if ( aExp
== 0x7FF ) {
4187 return commonNaNToFloatx80(float64ToCommonNaN(a
, status
), status
);
4189 return packFloatx80(aSign
,
4190 floatx80_infinity_high
,
4191 floatx80_infinity_low
);
4194 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4195 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4199 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
4203 /*----------------------------------------------------------------------------
4204 | Returns the result of converting the double-precision floating-point value
4205 | `a' to the quadruple-precision floating-point format. The conversion is
4206 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4208 *----------------------------------------------------------------------------*/
4210 float128
float64_to_float128(float64 a
, float_status
*status
)
4214 uint64_t aSig
, zSig0
, zSig1
;
4216 a
= float64_squash_input_denormal(a
, status
);
4217 aSig
= extractFloat64Frac( a
);
4218 aExp
= extractFloat64Exp( a
);
4219 aSign
= extractFloat64Sign( a
);
4220 if ( aExp
== 0x7FF ) {
4222 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
4224 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4227 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4228 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4231 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
4232 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
4237 /*----------------------------------------------------------------------------
4238 | Returns the remainder of the double-precision floating-point value `a'
4239 | with respect to the corresponding value `b'. The operation is performed
4240 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4241 *----------------------------------------------------------------------------*/
4243 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
4246 int aExp
, bExp
, expDiff
;
4247 uint64_t aSig
, bSig
;
4248 uint64_t q
, alternateASig
;
4251 a
= float64_squash_input_denormal(a
, status
);
4252 b
= float64_squash_input_denormal(b
, status
);
4253 aSig
= extractFloat64Frac( a
);
4254 aExp
= extractFloat64Exp( a
);
4255 aSign
= extractFloat64Sign( a
);
4256 bSig
= extractFloat64Frac( b
);
4257 bExp
= extractFloat64Exp( b
);
4258 if ( aExp
== 0x7FF ) {
4259 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4260 return propagateFloat64NaN(a
, b
, status
);
4262 float_raise(float_flag_invalid
, status
);
4263 return float64_default_nan(status
);
4265 if ( bExp
== 0x7FF ) {
4267 return propagateFloat64NaN(a
, b
, status
);
4273 float_raise(float_flag_invalid
, status
);
4274 return float64_default_nan(status
);
4276 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4279 if ( aSig
== 0 ) return a
;
4280 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4282 expDiff
= aExp
- bExp
;
4283 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
4284 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4285 if ( expDiff
< 0 ) {
4286 if ( expDiff
< -1 ) return a
;
4289 q
= ( bSig
<= aSig
);
4290 if ( q
) aSig
-= bSig
;
4292 while ( 0 < expDiff
) {
4293 q
= estimateDiv128To64( aSig
, 0, bSig
);
4294 q
= ( 2 < q
) ? q
- 2 : 0;
4295 aSig
= - ( ( bSig
>>2 ) * q
);
4299 if ( 0 < expDiff
) {
4300 q
= estimateDiv128To64( aSig
, 0, bSig
);
4301 q
= ( 2 < q
) ? q
- 2 : 0;
4304 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4311 alternateASig
= aSig
;
4314 } while ( 0 <= (int64_t) aSig
);
4315 sigMean
= aSig
+ alternateASig
;
4316 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4317 aSig
= alternateASig
;
4319 zSign
= ( (int64_t) aSig
< 0 );
4320 if ( zSign
) aSig
= - aSig
;
4321 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
4325 /*----------------------------------------------------------------------------
4326 | Returns the binary log of the double-precision floating-point value `a'.
4327 | The operation is performed according to the IEC/IEEE Standard for Binary
4328 | Floating-Point Arithmetic.
4329 *----------------------------------------------------------------------------*/
4330 float64
float64_log2(float64 a
, float_status
*status
)
4334 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
4335 a
= float64_squash_input_denormal(a
, status
);
4337 aSig
= extractFloat64Frac( a
);
4338 aExp
= extractFloat64Exp( a
);
4339 aSign
= extractFloat64Sign( a
);
4342 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
4343 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4346 float_raise(float_flag_invalid
, status
);
4347 return float64_default_nan(status
);
4349 if ( aExp
== 0x7FF ) {
4351 return propagateFloat64NaN(a
, float64_zero
, status
);
4357 aSig
|= LIT64( 0x0010000000000000 );
4359 zSig
= (uint64_t)aExp
<< 52;
4360 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
4361 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
4362 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
4363 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
4371 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
4374 /*----------------------------------------------------------------------------
4375 | Returns 1 if the double-precision floating-point value `a' is equal to the
4376 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4377 | if either operand is a NaN. Otherwise, the comparison is performed
4378 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4379 *----------------------------------------------------------------------------*/
4381 int float64_eq(float64 a
, float64 b
, float_status
*status
)
4384 a
= float64_squash_input_denormal(a
, status
);
4385 b
= float64_squash_input_denormal(b
, status
);
4387 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4388 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4390 float_raise(float_flag_invalid
, status
);
4393 av
= float64_val(a
);
4394 bv
= float64_val(b
);
4395 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4399 /*----------------------------------------------------------------------------
4400 | Returns 1 if the double-precision floating-point value `a' is less than or
4401 | equal to the corresponding value `b', and 0 otherwise. The invalid
4402 | exception is raised if either operand is a NaN. The comparison is performed
4403 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4404 *----------------------------------------------------------------------------*/
4406 int float64_le(float64 a
, float64 b
, float_status
*status
)
4410 a
= float64_squash_input_denormal(a
, status
);
4411 b
= float64_squash_input_denormal(b
, status
);
4413 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4414 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4416 float_raise(float_flag_invalid
, status
);
4419 aSign
= extractFloat64Sign( a
);
4420 bSign
= extractFloat64Sign( b
);
4421 av
= float64_val(a
);
4422 bv
= float64_val(b
);
4423 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4424 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4428 /*----------------------------------------------------------------------------
4429 | Returns 1 if the double-precision floating-point value `a' is less than
4430 | the corresponding value `b', and 0 otherwise. The invalid exception is
4431 | raised if either operand is a NaN. The comparison is performed according
4432 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4433 *----------------------------------------------------------------------------*/
4435 int float64_lt(float64 a
, float64 b
, float_status
*status
)
4440 a
= float64_squash_input_denormal(a
, status
);
4441 b
= float64_squash_input_denormal(b
, status
);
4442 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4443 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4445 float_raise(float_flag_invalid
, status
);
4448 aSign
= extractFloat64Sign( a
);
4449 bSign
= extractFloat64Sign( b
);
4450 av
= float64_val(a
);
4451 bv
= float64_val(b
);
4452 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4453 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4457 /*----------------------------------------------------------------------------
4458 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4459 | be compared, and 0 otherwise. The invalid exception is raised if either
4460 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4461 | Standard for Binary Floating-Point Arithmetic.
4462 *----------------------------------------------------------------------------*/
4464 int float64_unordered(float64 a
, float64 b
, float_status
*status
)
4466 a
= float64_squash_input_denormal(a
, status
);
4467 b
= float64_squash_input_denormal(b
, status
);
4469 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4470 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4472 float_raise(float_flag_invalid
, status
);
4478 /*----------------------------------------------------------------------------
4479 | Returns 1 if the double-precision floating-point value `a' is equal to the
4480 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4481 | exception.The comparison is performed according to the IEC/IEEE Standard
4482 | for Binary Floating-Point Arithmetic.
4483 *----------------------------------------------------------------------------*/
4485 int float64_eq_quiet(float64 a
, float64 b
, float_status
*status
)
4488 a
= float64_squash_input_denormal(a
, status
);
4489 b
= float64_squash_input_denormal(b
, status
);
4491 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4492 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4494 if (float64_is_signaling_nan(a
, status
)
4495 || float64_is_signaling_nan(b
, status
)) {
4496 float_raise(float_flag_invalid
, status
);
4500 av
= float64_val(a
);
4501 bv
= float64_val(b
);
4502 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4506 /*----------------------------------------------------------------------------
4507 | Returns 1 if the double-precision floating-point value `a' is less than or
4508 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4509 | cause an exception. Otherwise, the comparison is performed according to the
4510 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4511 *----------------------------------------------------------------------------*/
4513 int float64_le_quiet(float64 a
, float64 b
, float_status
*status
)
4517 a
= float64_squash_input_denormal(a
, status
);
4518 b
= float64_squash_input_denormal(b
, status
);
4520 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4521 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4523 if (float64_is_signaling_nan(a
, status
)
4524 || float64_is_signaling_nan(b
, status
)) {
4525 float_raise(float_flag_invalid
, status
);
4529 aSign
= extractFloat64Sign( a
);
4530 bSign
= extractFloat64Sign( b
);
4531 av
= float64_val(a
);
4532 bv
= float64_val(b
);
4533 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4534 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4538 /*----------------------------------------------------------------------------
4539 | Returns 1 if the double-precision floating-point value `a' is less than
4540 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4541 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4542 | Standard for Binary Floating-Point Arithmetic.
4543 *----------------------------------------------------------------------------*/
4545 int float64_lt_quiet(float64 a
, float64 b
, float_status
*status
)
4549 a
= float64_squash_input_denormal(a
, status
);
4550 b
= float64_squash_input_denormal(b
, status
);
4552 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4553 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4555 if (float64_is_signaling_nan(a
, status
)
4556 || float64_is_signaling_nan(b
, status
)) {
4557 float_raise(float_flag_invalid
, status
);
4561 aSign
= extractFloat64Sign( a
);
4562 bSign
= extractFloat64Sign( b
);
4563 av
= float64_val(a
);
4564 bv
= float64_val(b
);
4565 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4566 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4570 /*----------------------------------------------------------------------------
4571 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4572 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4573 | comparison is performed according to the IEC/IEEE Standard for Binary
4574 | Floating-Point Arithmetic.
4575 *----------------------------------------------------------------------------*/
4577 int float64_unordered_quiet(float64 a
, float64 b
, float_status
*status
)
4579 a
= float64_squash_input_denormal(a
, status
);
4580 b
= float64_squash_input_denormal(b
, status
);
4582 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4583 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4585 if (float64_is_signaling_nan(a
, status
)
4586 || float64_is_signaling_nan(b
, status
)) {
4587 float_raise(float_flag_invalid
, status
);
4594 /*----------------------------------------------------------------------------
4595 | Returns the result of converting the extended double-precision floating-
4596 | point value `a' to the 32-bit two's complement integer format. The
4597 | conversion is performed according to the IEC/IEEE Standard for Binary
4598 | Floating-Point Arithmetic---which means in particular that the conversion
4599 | is rounded according to the current rounding mode. If `a' is a NaN, the
4600 | largest positive integer is returned. Otherwise, if the conversion
4601 | overflows, the largest integer with the same sign as `a' is returned.
4602 *----------------------------------------------------------------------------*/
4604 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
4607 int32_t aExp
, shiftCount
;
4610 if (floatx80_invalid_encoding(a
)) {
4611 float_raise(float_flag_invalid
, status
);
4614 aSig
= extractFloatx80Frac( a
);
4615 aExp
= extractFloatx80Exp( a
);
4616 aSign
= extractFloatx80Sign( a
);
4617 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4618 shiftCount
= 0x4037 - aExp
;
4619 if ( shiftCount
<= 0 ) shiftCount
= 1;
4620 shift64RightJamming( aSig
, shiftCount
, &aSig
);
4621 return roundAndPackInt32(aSign
, aSig
, status
);
4625 /*----------------------------------------------------------------------------
4626 | Returns the result of converting the extended double-precision floating-
4627 | point value `a' to the 32-bit two's complement integer format. The
4628 | conversion is performed according to the IEC/IEEE Standard for Binary
4629 | Floating-Point Arithmetic, except that the conversion is always rounded
4630 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4631 | Otherwise, if the conversion overflows, the largest integer with the same
4632 | sign as `a' is returned.
4633 *----------------------------------------------------------------------------*/
4635 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
4638 int32_t aExp
, shiftCount
;
4639 uint64_t aSig
, savedASig
;
4642 if (floatx80_invalid_encoding(a
)) {
4643 float_raise(float_flag_invalid
, status
);
4646 aSig
= extractFloatx80Frac( a
);
4647 aExp
= extractFloatx80Exp( a
);
4648 aSign
= extractFloatx80Sign( a
);
4649 if ( 0x401E < aExp
) {
4650 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4653 else if ( aExp
< 0x3FFF ) {
4655 status
->float_exception_flags
|= float_flag_inexact
;
4659 shiftCount
= 0x403E - aExp
;
4661 aSig
>>= shiftCount
;
4663 if ( aSign
) z
= - z
;
4664 if ( ( z
< 0 ) ^ aSign
) {
4666 float_raise(float_flag_invalid
, status
);
4667 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4669 if ( ( aSig
<<shiftCount
) != savedASig
) {
4670 status
->float_exception_flags
|= float_flag_inexact
;
4676 /*----------------------------------------------------------------------------
4677 | Returns the result of converting the extended double-precision floating-
4678 | point value `a' to the 64-bit two's complement integer format. The
4679 | conversion is performed according to the IEC/IEEE Standard for Binary
4680 | Floating-Point Arithmetic---which means in particular that the conversion
4681 | is rounded according to the current rounding mode. If `a' is a NaN,
4682 | the largest positive integer is returned. Otherwise, if the conversion
4683 | overflows, the largest integer with the same sign as `a' is returned.
4684 *----------------------------------------------------------------------------*/
4686 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
4689 int32_t aExp
, shiftCount
;
4690 uint64_t aSig
, aSigExtra
;
4692 if (floatx80_invalid_encoding(a
)) {
4693 float_raise(float_flag_invalid
, status
);
4696 aSig
= extractFloatx80Frac( a
);
4697 aExp
= extractFloatx80Exp( a
);
4698 aSign
= extractFloatx80Sign( a
);
4699 shiftCount
= 0x403E - aExp
;
4700 if ( shiftCount
<= 0 ) {
4702 float_raise(float_flag_invalid
, status
);
4703 if (!aSign
|| floatx80_is_any_nan(a
)) {
4704 return LIT64( 0x7FFFFFFFFFFFFFFF );
4706 return (int64_t) LIT64( 0x8000000000000000 );
4711 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
4713 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
4717 /*----------------------------------------------------------------------------
4718 | Returns the result of converting the extended double-precision floating-
4719 | point value `a' to the 64-bit two's complement integer format. The
4720 | conversion is performed according to the IEC/IEEE Standard for Binary
4721 | Floating-Point Arithmetic, except that the conversion is always rounded
4722 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4723 | Otherwise, if the conversion overflows, the largest integer with the same
4724 | sign as `a' is returned.
4725 *----------------------------------------------------------------------------*/
4727 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
4730 int32_t aExp
, shiftCount
;
4734 if (floatx80_invalid_encoding(a
)) {
4735 float_raise(float_flag_invalid
, status
);
4738 aSig
= extractFloatx80Frac( a
);
4739 aExp
= extractFloatx80Exp( a
);
4740 aSign
= extractFloatx80Sign( a
);
4741 shiftCount
= aExp
- 0x403E;
4742 if ( 0 <= shiftCount
) {
4743 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
4744 if ( ( a
.high
!= 0xC03E ) || aSig
) {
4745 float_raise(float_flag_invalid
, status
);
4746 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
4747 return LIT64( 0x7FFFFFFFFFFFFFFF );
4750 return (int64_t) LIT64( 0x8000000000000000 );
4752 else if ( aExp
< 0x3FFF ) {
4754 status
->float_exception_flags
|= float_flag_inexact
;
4758 z
= aSig
>>( - shiftCount
);
4759 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
4760 status
->float_exception_flags
|= float_flag_inexact
;
4762 if ( aSign
) z
= - z
;
4767 /*----------------------------------------------------------------------------
4768 | Returns the result of converting the extended double-precision floating-
4769 | point value `a' to the single-precision floating-point format. The
4770 | conversion is performed according to the IEC/IEEE Standard for Binary
4771 | Floating-Point Arithmetic.
4772 *----------------------------------------------------------------------------*/
4774 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
4780 if (floatx80_invalid_encoding(a
)) {
4781 float_raise(float_flag_invalid
, status
);
4782 return float32_default_nan(status
);
4784 aSig
= extractFloatx80Frac( a
);
4785 aExp
= extractFloatx80Exp( a
);
4786 aSign
= extractFloatx80Sign( a
);
4787 if ( aExp
== 0x7FFF ) {
4788 if ( (uint64_t) ( aSig
<<1 ) ) {
4789 return commonNaNToFloat32(floatx80ToCommonNaN(a
, status
), status
);
4791 return packFloat32( aSign
, 0xFF, 0 );
4793 shift64RightJamming( aSig
, 33, &aSig
);
4794 if ( aExp
|| aSig
) aExp
-= 0x3F81;
4795 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
4799 /*----------------------------------------------------------------------------
4800 | Returns the result of converting the extended double-precision floating-
4801 | point value `a' to the double-precision floating-point format. The
4802 | conversion is performed according to the IEC/IEEE Standard for Binary
4803 | Floating-Point Arithmetic.
4804 *----------------------------------------------------------------------------*/
4806 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
4810 uint64_t aSig
, zSig
;
4812 if (floatx80_invalid_encoding(a
)) {
4813 float_raise(float_flag_invalid
, status
);
4814 return float64_default_nan(status
);
4816 aSig
= extractFloatx80Frac( a
);
4817 aExp
= extractFloatx80Exp( a
);
4818 aSign
= extractFloatx80Sign( a
);
4819 if ( aExp
== 0x7FFF ) {
4820 if ( (uint64_t) ( aSig
<<1 ) ) {
4821 return commonNaNToFloat64(floatx80ToCommonNaN(a
, status
), status
);
4823 return packFloat64( aSign
, 0x7FF, 0 );
4825 shift64RightJamming( aSig
, 1, &zSig
);
4826 if ( aExp
|| aSig
) aExp
-= 0x3C01;
4827 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
4831 /*----------------------------------------------------------------------------
4832 | Returns the result of converting the extended double-precision floating-
4833 | point value `a' to the quadruple-precision floating-point format. The
4834 | conversion is performed according to the IEC/IEEE Standard for Binary
4835 | Floating-Point Arithmetic.
4836 *----------------------------------------------------------------------------*/
4838 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
4842 uint64_t aSig
, zSig0
, zSig1
;
4844 if (floatx80_invalid_encoding(a
)) {
4845 float_raise(float_flag_invalid
, status
);
4846 return float128_default_nan(status
);
4848 aSig
= extractFloatx80Frac( a
);
4849 aExp
= extractFloatx80Exp( a
);
4850 aSign
= extractFloatx80Sign( a
);
4851 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
4852 return commonNaNToFloat128(floatx80ToCommonNaN(a
, status
), status
);
4854 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
4855 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
4859 /*----------------------------------------------------------------------------
4860 | Rounds the extended double-precision floating-point value `a'
4861 | to the precision provided by floatx80_rounding_precision and returns the
4862 | result as an extended double-precision floating-point value.
4863 | The operation is performed according to the IEC/IEEE Standard for Binary
4864 | Floating-Point Arithmetic.
4865 *----------------------------------------------------------------------------*/
4867 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
4869 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
4870 extractFloatx80Sign(a
),
4871 extractFloatx80Exp(a
),
4872 extractFloatx80Frac(a
), 0, status
);
4875 /*----------------------------------------------------------------------------
4876 | Rounds the extended double-precision floating-point value `a' to an integer,
4877 | and returns the result as an extended quadruple-precision floating-point
4878 | value. The operation is performed according to the IEC/IEEE Standard for
4879 | Binary Floating-Point Arithmetic.
4880 *----------------------------------------------------------------------------*/
4882 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
4886 uint64_t lastBitMask
, roundBitsMask
;
4889 if (floatx80_invalid_encoding(a
)) {
4890 float_raise(float_flag_invalid
, status
);
4891 return floatx80_default_nan(status
);
4893 aExp
= extractFloatx80Exp( a
);
4894 if ( 0x403E <= aExp
) {
4895 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
4896 return propagateFloatx80NaN(a
, a
, status
);
4900 if ( aExp
< 0x3FFF ) {
4902 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
4905 status
->float_exception_flags
|= float_flag_inexact
;
4906 aSign
= extractFloatx80Sign( a
);
4907 switch (status
->float_rounding_mode
) {
4908 case float_round_nearest_even
:
4909 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
4912 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
4915 case float_round_ties_away
:
4916 if (aExp
== 0x3FFE) {
4917 return packFloatx80(aSign
, 0x3FFF, LIT64(0x8000000000000000));
4920 case float_round_down
:
4923 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4924 : packFloatx80( 0, 0, 0 );
4925 case float_round_up
:
4927 aSign
? packFloatx80( 1, 0, 0 )
4928 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4930 return packFloatx80( aSign
, 0, 0 );
4933 lastBitMask
<<= 0x403E - aExp
;
4934 roundBitsMask
= lastBitMask
- 1;
4936 switch (status
->float_rounding_mode
) {
4937 case float_round_nearest_even
:
4938 z
.low
+= lastBitMask
>>1;
4939 if ((z
.low
& roundBitsMask
) == 0) {
4940 z
.low
&= ~lastBitMask
;
4943 case float_round_ties_away
:
4944 z
.low
+= lastBitMask
>> 1;
4946 case float_round_to_zero
:
4948 case float_round_up
:
4949 if (!extractFloatx80Sign(z
)) {
4950 z
.low
+= roundBitsMask
;
4953 case float_round_down
:
4954 if (extractFloatx80Sign(z
)) {
4955 z
.low
+= roundBitsMask
;
4961 z
.low
&= ~ roundBitsMask
;
4964 z
.low
= LIT64( 0x8000000000000000 );
4966 if (z
.low
!= a
.low
) {
4967 status
->float_exception_flags
|= float_flag_inexact
;
4973 /*----------------------------------------------------------------------------
4974 | Returns the result of adding the absolute values of the extended double-
4975 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4976 | negated before being returned. `zSign' is ignored if the result is a NaN.
4977 | The addition is performed according to the IEC/IEEE Standard for Binary
4978 | Floating-Point Arithmetic.
4979 *----------------------------------------------------------------------------*/
4981 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
4982 float_status
*status
)
4984 int32_t aExp
, bExp
, zExp
;
4985 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4988 aSig
= extractFloatx80Frac( a
);
4989 aExp
= extractFloatx80Exp( a
);
4990 bSig
= extractFloatx80Frac( b
);
4991 bExp
= extractFloatx80Exp( b
);
4992 expDiff
= aExp
- bExp
;
4993 if ( 0 < expDiff
) {
4994 if ( aExp
== 0x7FFF ) {
4995 if ((uint64_t)(aSig
<< 1)) {
4996 return propagateFloatx80NaN(a
, b
, status
);
5000 if ( bExp
== 0 ) --expDiff
;
5001 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5004 else if ( expDiff
< 0 ) {
5005 if ( bExp
== 0x7FFF ) {
5006 if ((uint64_t)(bSig
<< 1)) {
5007 return propagateFloatx80NaN(a
, b
, status
);
5009 return packFloatx80(zSign
,
5010 floatx80_infinity_high
,
5011 floatx80_infinity_low
);
5013 if ( aExp
== 0 ) ++expDiff
;
5014 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5018 if ( aExp
== 0x7FFF ) {
5019 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5020 return propagateFloatx80NaN(a
, b
, status
);
5025 zSig0
= aSig
+ bSig
;
5027 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5033 zSig0
= aSig
+ bSig
;
5034 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5036 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5037 zSig0
|= LIT64( 0x8000000000000000 );
5040 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5041 zSign
, zExp
, zSig0
, zSig1
, status
);
5044 /*----------------------------------------------------------------------------
5045 | Returns the result of subtracting the absolute values of the extended
5046 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5047 | difference is negated before being returned. `zSign' is ignored if the
5048 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5049 | Standard for Binary Floating-Point Arithmetic.
5050 *----------------------------------------------------------------------------*/
5052 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5053 float_status
*status
)
5055 int32_t aExp
, bExp
, zExp
;
5056 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5059 aSig
= extractFloatx80Frac( a
);
5060 aExp
= extractFloatx80Exp( a
);
5061 bSig
= extractFloatx80Frac( b
);
5062 bExp
= extractFloatx80Exp( b
);
5063 expDiff
= aExp
- bExp
;
5064 if ( 0 < expDiff
) goto aExpBigger
;
5065 if ( expDiff
< 0 ) goto bExpBigger
;
5066 if ( aExp
== 0x7FFF ) {
5067 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5068 return propagateFloatx80NaN(a
, b
, status
);
5070 float_raise(float_flag_invalid
, status
);
5071 return floatx80_default_nan(status
);
5078 if ( bSig
< aSig
) goto aBigger
;
5079 if ( aSig
< bSig
) goto bBigger
;
5080 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5082 if ( bExp
== 0x7FFF ) {
5083 if ((uint64_t)(bSig
<< 1)) {
5084 return propagateFloatx80NaN(a
, b
, status
);
5086 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
5087 floatx80_infinity_low
);
5089 if ( aExp
== 0 ) ++expDiff
;
5090 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5092 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5095 goto normalizeRoundAndPack
;
5097 if ( aExp
== 0x7FFF ) {
5098 if ((uint64_t)(aSig
<< 1)) {
5099 return propagateFloatx80NaN(a
, b
, status
);
5103 if ( bExp
== 0 ) --expDiff
;
5104 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5106 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5108 normalizeRoundAndPack
:
5109 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5110 zSign
, zExp
, zSig0
, zSig1
, status
);
5113 /*----------------------------------------------------------------------------
5114 | Returns the result of adding the extended double-precision floating-point
5115 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5116 | Standard for Binary Floating-Point Arithmetic.
5117 *----------------------------------------------------------------------------*/
5119 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5123 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5124 float_raise(float_flag_invalid
, status
);
5125 return floatx80_default_nan(status
);
5127 aSign
= extractFloatx80Sign( a
);
5128 bSign
= extractFloatx80Sign( b
);
5129 if ( aSign
== bSign
) {
5130 return addFloatx80Sigs(a
, b
, aSign
, status
);
5133 return subFloatx80Sigs(a
, b
, aSign
, status
);
5138 /*----------------------------------------------------------------------------
5139 | Returns the result of subtracting the extended double-precision floating-
5140 | point values `a' and `b'. The operation is performed according to the
5141 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5142 *----------------------------------------------------------------------------*/
5144 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5148 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5149 float_raise(float_flag_invalid
, status
);
5150 return floatx80_default_nan(status
);
5152 aSign
= extractFloatx80Sign( a
);
5153 bSign
= extractFloatx80Sign( b
);
5154 if ( aSign
== bSign
) {
5155 return subFloatx80Sigs(a
, b
, aSign
, status
);
5158 return addFloatx80Sigs(a
, b
, aSign
, status
);
5163 /*----------------------------------------------------------------------------
5164 | Returns the result of multiplying the extended double-precision floating-
5165 | point values `a' and `b'. The operation is performed according to the
5166 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5167 *----------------------------------------------------------------------------*/
5169 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5171 flag aSign
, bSign
, zSign
;
5172 int32_t aExp
, bExp
, zExp
;
5173 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5175 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5176 float_raise(float_flag_invalid
, status
);
5177 return floatx80_default_nan(status
);
5179 aSig
= extractFloatx80Frac( a
);
5180 aExp
= extractFloatx80Exp( a
);
5181 aSign
= extractFloatx80Sign( a
);
5182 bSig
= extractFloatx80Frac( b
);
5183 bExp
= extractFloatx80Exp( b
);
5184 bSign
= extractFloatx80Sign( b
);
5185 zSign
= aSign
^ bSign
;
5186 if ( aExp
== 0x7FFF ) {
5187 if ( (uint64_t) ( aSig
<<1 )
5188 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5189 return propagateFloatx80NaN(a
, b
, status
);
5191 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5192 return packFloatx80(zSign
, floatx80_infinity_high
,
5193 floatx80_infinity_low
);
5195 if ( bExp
== 0x7FFF ) {
5196 if ((uint64_t)(bSig
<< 1)) {
5197 return propagateFloatx80NaN(a
, b
, status
);
5199 if ( ( aExp
| aSig
) == 0 ) {
5201 float_raise(float_flag_invalid
, status
);
5202 return floatx80_default_nan(status
);
5204 return packFloatx80(zSign
, floatx80_infinity_high
,
5205 floatx80_infinity_low
);
5208 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5209 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5212 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5213 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5215 zExp
= aExp
+ bExp
- 0x3FFE;
5216 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5217 if ( 0 < (int64_t) zSig0
) {
5218 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5221 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5222 zSign
, zExp
, zSig0
, zSig1
, status
);
5225 /*----------------------------------------------------------------------------
5226 | Returns the result of dividing the extended double-precision floating-point
5227 | value `a' by the corresponding value `b'. The operation is performed
5228 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5229 *----------------------------------------------------------------------------*/
5231 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
5233 flag aSign
, bSign
, zSign
;
5234 int32_t aExp
, bExp
, zExp
;
5235 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5236 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
5238 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5239 float_raise(float_flag_invalid
, status
);
5240 return floatx80_default_nan(status
);
5242 aSig
= extractFloatx80Frac( a
);
5243 aExp
= extractFloatx80Exp( a
);
5244 aSign
= extractFloatx80Sign( a
);
5245 bSig
= extractFloatx80Frac( b
);
5246 bExp
= extractFloatx80Exp( b
);
5247 bSign
= extractFloatx80Sign( b
);
5248 zSign
= aSign
^ bSign
;
5249 if ( aExp
== 0x7FFF ) {
5250 if ((uint64_t)(aSig
<< 1)) {
5251 return propagateFloatx80NaN(a
, b
, status
);
5253 if ( bExp
== 0x7FFF ) {
5254 if ((uint64_t)(bSig
<< 1)) {
5255 return propagateFloatx80NaN(a
, b
, status
);
5259 return packFloatx80(zSign
, floatx80_infinity_high
,
5260 floatx80_infinity_low
);
5262 if ( bExp
== 0x7FFF ) {
5263 if ((uint64_t)(bSig
<< 1)) {
5264 return propagateFloatx80NaN(a
, b
, status
);
5266 return packFloatx80( zSign
, 0, 0 );
5270 if ( ( aExp
| aSig
) == 0 ) {
5272 float_raise(float_flag_invalid
, status
);
5273 return floatx80_default_nan(status
);
5275 float_raise(float_flag_divbyzero
, status
);
5276 return packFloatx80(zSign
, floatx80_infinity_high
,
5277 floatx80_infinity_low
);
5279 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5282 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5283 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5285 zExp
= aExp
- bExp
+ 0x3FFE;
5287 if ( bSig
<= aSig
) {
5288 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
5291 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
5292 mul64To128( bSig
, zSig0
, &term0
, &term1
);
5293 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
5294 while ( (int64_t) rem0
< 0 ) {
5296 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
5298 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
5299 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
5300 mul64To128( bSig
, zSig1
, &term1
, &term2
);
5301 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5302 while ( (int64_t) rem1
< 0 ) {
5304 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
5306 zSig1
|= ( ( rem1
| rem2
) != 0 );
5308 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5309 zSign
, zExp
, zSig0
, zSig1
, status
);
5312 /*----------------------------------------------------------------------------
5313 | Returns the remainder of the extended double-precision floating-point value
5314 | `a' with respect to the corresponding value `b'. The operation is performed
5315 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5316 *----------------------------------------------------------------------------*/
5318 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
5321 int32_t aExp
, bExp
, expDiff
;
5322 uint64_t aSig0
, aSig1
, bSig
;
5323 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
5325 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5326 float_raise(float_flag_invalid
, status
);
5327 return floatx80_default_nan(status
);
5329 aSig0
= extractFloatx80Frac( a
);
5330 aExp
= extractFloatx80Exp( a
);
5331 aSign
= extractFloatx80Sign( a
);
5332 bSig
= extractFloatx80Frac( b
);
5333 bExp
= extractFloatx80Exp( b
);
5334 if ( aExp
== 0x7FFF ) {
5335 if ( (uint64_t) ( aSig0
<<1 )
5336 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5337 return propagateFloatx80NaN(a
, b
, status
);
5341 if ( bExp
== 0x7FFF ) {
5342 if ((uint64_t)(bSig
<< 1)) {
5343 return propagateFloatx80NaN(a
, b
, status
);
5350 float_raise(float_flag_invalid
, status
);
5351 return floatx80_default_nan(status
);
5353 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5356 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
5357 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5359 bSig
|= LIT64( 0x8000000000000000 );
5361 expDiff
= aExp
- bExp
;
5363 if ( expDiff
< 0 ) {
5364 if ( expDiff
< -1 ) return a
;
5365 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
5368 q
= ( bSig
<= aSig0
);
5369 if ( q
) aSig0
-= bSig
;
5371 while ( 0 < expDiff
) {
5372 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5373 q
= ( 2 < q
) ? q
- 2 : 0;
5374 mul64To128( bSig
, q
, &term0
, &term1
);
5375 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5376 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
5380 if ( 0 < expDiff
) {
5381 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5382 q
= ( 2 < q
) ? q
- 2 : 0;
5384 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
5385 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5386 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
5387 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
5389 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5396 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
5397 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5398 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5401 aSig0
= alternateASig0
;
5402 aSig1
= alternateASig1
;
5406 normalizeRoundAndPackFloatx80(
5407 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
5411 /*----------------------------------------------------------------------------
5412 | Returns the square root of the extended double-precision floating-point
5413 | value `a'. The operation is performed according to the IEC/IEEE Standard
5414 | for Binary Floating-Point Arithmetic.
5415 *----------------------------------------------------------------------------*/
5417 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
5421 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
5422 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5424 if (floatx80_invalid_encoding(a
)) {
5425 float_raise(float_flag_invalid
, status
);
5426 return floatx80_default_nan(status
);
5428 aSig0
= extractFloatx80Frac( a
);
5429 aExp
= extractFloatx80Exp( a
);
5430 aSign
= extractFloatx80Sign( a
);
5431 if ( aExp
== 0x7FFF ) {
5432 if ((uint64_t)(aSig0
<< 1)) {
5433 return propagateFloatx80NaN(a
, a
, status
);
5435 if ( ! aSign
) return a
;
5439 if ( ( aExp
| aSig0
) == 0 ) return a
;
5441 float_raise(float_flag_invalid
, status
);
5442 return floatx80_default_nan(status
);
5445 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
5446 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5448 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
5449 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
5450 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
5451 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5452 doubleZSig0
= zSig0
<<1;
5453 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5454 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5455 while ( (int64_t) rem0
< 0 ) {
5458 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5460 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5461 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5462 if ( zSig1
== 0 ) zSig1
= 1;
5463 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5464 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5465 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5466 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5467 while ( (int64_t) rem1
< 0 ) {
5469 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5471 term2
|= doubleZSig0
;
5472 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5474 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5476 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
5477 zSig0
|= doubleZSig0
;
5478 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5479 0, zExp
, zSig0
, zSig1
, status
);
5482 /*----------------------------------------------------------------------------
5483 | Returns 1 if the extended double-precision floating-point value `a' is equal
5484 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5485 | raised if either operand is a NaN. Otherwise, the comparison is performed
5486 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5487 *----------------------------------------------------------------------------*/
5489 int floatx80_eq(floatx80 a
, floatx80 b
, float_status
*status
)
5492 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5493 || (extractFloatx80Exp(a
) == 0x7FFF
5494 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5495 || (extractFloatx80Exp(b
) == 0x7FFF
5496 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5498 float_raise(float_flag_invalid
, status
);
5503 && ( ( a
.high
== b
.high
)
5505 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5510 /*----------------------------------------------------------------------------
5511 | Returns 1 if the extended double-precision floating-point value `a' is
5512 | less than or equal to the corresponding value `b', and 0 otherwise. The
5513 | invalid exception is raised if either operand is a NaN. The comparison is
5514 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5516 *----------------------------------------------------------------------------*/
5518 int floatx80_le(floatx80 a
, floatx80 b
, float_status
*status
)
5522 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5523 || (extractFloatx80Exp(a
) == 0x7FFF
5524 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5525 || (extractFloatx80Exp(b
) == 0x7FFF
5526 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5528 float_raise(float_flag_invalid
, status
);
5531 aSign
= extractFloatx80Sign( a
);
5532 bSign
= extractFloatx80Sign( b
);
5533 if ( aSign
!= bSign
) {
5536 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5540 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5541 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5545 /*----------------------------------------------------------------------------
5546 | Returns 1 if the extended double-precision floating-point value `a' is
5547 | less than the corresponding value `b', and 0 otherwise. The invalid
5548 | exception is raised if either operand is a NaN. The comparison is performed
5549 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5550 *----------------------------------------------------------------------------*/
5552 int floatx80_lt(floatx80 a
, floatx80 b
, float_status
*status
)
5556 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5557 || (extractFloatx80Exp(a
) == 0x7FFF
5558 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5559 || (extractFloatx80Exp(b
) == 0x7FFF
5560 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5562 float_raise(float_flag_invalid
, status
);
5565 aSign
= extractFloatx80Sign( a
);
5566 bSign
= extractFloatx80Sign( b
);
5567 if ( aSign
!= bSign
) {
5570 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5574 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5575 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5579 /*----------------------------------------------------------------------------
5580 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5581 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5582 | either operand is a NaN. The comparison is performed according to the
5583 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5584 *----------------------------------------------------------------------------*/
5585 int floatx80_unordered(floatx80 a
, floatx80 b
, float_status
*status
)
5587 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5588 || (extractFloatx80Exp(a
) == 0x7FFF
5589 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5590 || (extractFloatx80Exp(b
) == 0x7FFF
5591 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5593 float_raise(float_flag_invalid
, status
);
5599 /*----------------------------------------------------------------------------
5600 | Returns 1 if the extended double-precision floating-point value `a' is
5601 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5602 | cause an exception. The comparison is performed according to the IEC/IEEE
5603 | Standard for Binary Floating-Point Arithmetic.
5604 *----------------------------------------------------------------------------*/
5606 int floatx80_eq_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5609 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5610 float_raise(float_flag_invalid
, status
);
5613 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5614 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5615 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5616 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5618 if (floatx80_is_signaling_nan(a
, status
)
5619 || floatx80_is_signaling_nan(b
, status
)) {
5620 float_raise(float_flag_invalid
, status
);
5626 && ( ( a
.high
== b
.high
)
5628 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5633 /*----------------------------------------------------------------------------
5634 | Returns 1 if the extended double-precision floating-point value `a' is less
5635 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5636 | do not cause an exception. Otherwise, the comparison is performed according
5637 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5638 *----------------------------------------------------------------------------*/
5640 int floatx80_le_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5644 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5645 float_raise(float_flag_invalid
, status
);
5648 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5649 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5650 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5651 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5653 if (floatx80_is_signaling_nan(a
, status
)
5654 || floatx80_is_signaling_nan(b
, status
)) {
5655 float_raise(float_flag_invalid
, status
);
5659 aSign
= extractFloatx80Sign( a
);
5660 bSign
= extractFloatx80Sign( b
);
5661 if ( aSign
!= bSign
) {
5664 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5668 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5669 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5673 /*----------------------------------------------------------------------------
5674 | Returns 1 if the extended double-precision floating-point value `a' is less
5675 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5676 | an exception. Otherwise, the comparison is performed according to the
5677 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5678 *----------------------------------------------------------------------------*/
5680 int floatx80_lt_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5684 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5685 float_raise(float_flag_invalid
, status
);
5688 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5689 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5690 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5691 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5693 if (floatx80_is_signaling_nan(a
, status
)
5694 || floatx80_is_signaling_nan(b
, status
)) {
5695 float_raise(float_flag_invalid
, status
);
5699 aSign
= extractFloatx80Sign( a
);
5700 bSign
= extractFloatx80Sign( b
);
5701 if ( aSign
!= bSign
) {
5704 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5708 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5709 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5713 /*----------------------------------------------------------------------------
5714 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5715 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5716 | The comparison is performed according to the IEC/IEEE Standard for Binary
5717 | Floating-Point Arithmetic.
5718 *----------------------------------------------------------------------------*/
5719 int floatx80_unordered_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5721 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5722 float_raise(float_flag_invalid
, status
);
5725 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5726 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5727 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5728 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5730 if (floatx80_is_signaling_nan(a
, status
)
5731 || floatx80_is_signaling_nan(b
, status
)) {
5732 float_raise(float_flag_invalid
, status
);
5739 /*----------------------------------------------------------------------------
5740 | Returns the result of converting the quadruple-precision floating-point
5741 | value `a' to the 32-bit two's complement integer format. The conversion
5742 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5743 | Arithmetic---which means in particular that the conversion is rounded
5744 | according to the current rounding mode. If `a' is a NaN, the largest
5745 | positive integer is returned. Otherwise, if the conversion overflows, the
5746 | largest integer with the same sign as `a' is returned.
5747 *----------------------------------------------------------------------------*/
5749 int32_t float128_to_int32(float128 a
, float_status
*status
)
5752 int32_t aExp
, shiftCount
;
5753 uint64_t aSig0
, aSig1
;
5755 aSig1
= extractFloat128Frac1( a
);
5756 aSig0
= extractFloat128Frac0( a
);
5757 aExp
= extractFloat128Exp( a
);
5758 aSign
= extractFloat128Sign( a
);
5759 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
5760 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5761 aSig0
|= ( aSig1
!= 0 );
5762 shiftCount
= 0x4028 - aExp
;
5763 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
5764 return roundAndPackInt32(aSign
, aSig0
, status
);
5768 /*----------------------------------------------------------------------------
5769 | Returns the result of converting the quadruple-precision floating-point
5770 | value `a' to the 32-bit two's complement integer format. The conversion
5771 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5772 | Arithmetic, except that the conversion is always rounded toward zero. If
5773 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5774 | conversion overflows, the largest integer with the same sign as `a' is
5776 *----------------------------------------------------------------------------*/
5778 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
5781 int32_t aExp
, shiftCount
;
5782 uint64_t aSig0
, aSig1
, savedASig
;
5785 aSig1
= extractFloat128Frac1( a
);
5786 aSig0
= extractFloat128Frac0( a
);
5787 aExp
= extractFloat128Exp( a
);
5788 aSign
= extractFloat128Sign( a
);
5789 aSig0
|= ( aSig1
!= 0 );
5790 if ( 0x401E < aExp
) {
5791 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
5794 else if ( aExp
< 0x3FFF ) {
5795 if (aExp
|| aSig0
) {
5796 status
->float_exception_flags
|= float_flag_inexact
;
5800 aSig0
|= LIT64( 0x0001000000000000 );
5801 shiftCount
= 0x402F - aExp
;
5803 aSig0
>>= shiftCount
;
5805 if ( aSign
) z
= - z
;
5806 if ( ( z
< 0 ) ^ aSign
) {
5808 float_raise(float_flag_invalid
, status
);
5809 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5811 if ( ( aSig0
<<shiftCount
) != savedASig
) {
5812 status
->float_exception_flags
|= float_flag_inexact
;
5818 /*----------------------------------------------------------------------------
5819 | Returns the result of converting the quadruple-precision floating-point
5820 | value `a' to the 64-bit two's complement integer format. The conversion
5821 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5822 | Arithmetic---which means in particular that the conversion is rounded
5823 | according to the current rounding mode. If `a' is a NaN, the largest
5824 | positive integer is returned. Otherwise, if the conversion overflows, the
5825 | largest integer with the same sign as `a' is returned.
5826 *----------------------------------------------------------------------------*/
5828 int64_t float128_to_int64(float128 a
, float_status
*status
)
5831 int32_t aExp
, shiftCount
;
5832 uint64_t aSig0
, aSig1
;
5834 aSig1
= extractFloat128Frac1( a
);
5835 aSig0
= extractFloat128Frac0( a
);
5836 aExp
= extractFloat128Exp( a
);
5837 aSign
= extractFloat128Sign( a
);
5838 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5839 shiftCount
= 0x402F - aExp
;
5840 if ( shiftCount
<= 0 ) {
5841 if ( 0x403E < aExp
) {
5842 float_raise(float_flag_invalid
, status
);
5844 || ( ( aExp
== 0x7FFF )
5845 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
5848 return LIT64( 0x7FFFFFFFFFFFFFFF );
5850 return (int64_t) LIT64( 0x8000000000000000 );
5852 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
5855 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5857 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
5861 /*----------------------------------------------------------------------------
5862 | Returns the result of converting the quadruple-precision floating-point
5863 | value `a' to the 64-bit two's complement integer format. The conversion
5864 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5865 | Arithmetic, except that the conversion is always rounded toward zero.
5866 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5867 | the conversion overflows, the largest integer with the same sign as `a' is
5869 *----------------------------------------------------------------------------*/
5871 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
5874 int32_t aExp
, shiftCount
;
5875 uint64_t aSig0
, aSig1
;
5878 aSig1
= extractFloat128Frac1( a
);
5879 aSig0
= extractFloat128Frac0( a
);
5880 aExp
= extractFloat128Exp( a
);
5881 aSign
= extractFloat128Sign( a
);
5882 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5883 shiftCount
= aExp
- 0x402F;
5884 if ( 0 < shiftCount
) {
5885 if ( 0x403E <= aExp
) {
5886 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
5887 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
5888 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
5890 status
->float_exception_flags
|= float_flag_inexact
;
5894 float_raise(float_flag_invalid
, status
);
5895 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
5896 return LIT64( 0x7FFFFFFFFFFFFFFF );
5899 return (int64_t) LIT64( 0x8000000000000000 );
5901 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
5902 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
5903 status
->float_exception_flags
|= float_flag_inexact
;
5907 if ( aExp
< 0x3FFF ) {
5908 if ( aExp
| aSig0
| aSig1
) {
5909 status
->float_exception_flags
|= float_flag_inexact
;
5913 z
= aSig0
>>( - shiftCount
);
5915 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
5916 status
->float_exception_flags
|= float_flag_inexact
;
5919 if ( aSign
) z
= - z
;
5924 /*----------------------------------------------------------------------------
5925 | Returns the result of converting the quadruple-precision floating-point value
5926 | `a' to the 64-bit unsigned integer format. The conversion is
5927 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5928 | Arithmetic---which means in particular that the conversion is rounded
5929 | according to the current rounding mode. If `a' is a NaN, the largest
5930 | positive integer is returned. If the conversion overflows, the
5931 | largest unsigned integer is returned. If 'a' is negative, the value is
5932 | rounded and zero is returned; negative values that do not round to zero
5933 | will raise the inexact exception.
5934 *----------------------------------------------------------------------------*/
5936 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
5941 uint64_t aSig0
, aSig1
;
5943 aSig0
= extractFloat128Frac0(a
);
5944 aSig1
= extractFloat128Frac1(a
);
5945 aExp
= extractFloat128Exp(a
);
5946 aSign
= extractFloat128Sign(a
);
5947 if (aSign
&& (aExp
> 0x3FFE)) {
5948 float_raise(float_flag_invalid
, status
);
5949 if (float128_is_any_nan(a
)) {
5950 return LIT64(0xFFFFFFFFFFFFFFFF);
5956 aSig0
|= LIT64(0x0001000000000000);
5958 shiftCount
= 0x402F - aExp
;
5959 if (shiftCount
<= 0) {
5960 if (0x403E < aExp
) {
5961 float_raise(float_flag_invalid
, status
);
5962 return LIT64(0xFFFFFFFFFFFFFFFF);
5964 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
5966 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5968 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
5971 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
5974 signed char current_rounding_mode
= status
->float_rounding_mode
;
5976 set_float_rounding_mode(float_round_to_zero
, status
);
5977 v
= float128_to_uint64(a
, status
);
5978 set_float_rounding_mode(current_rounding_mode
, status
);
5983 /*----------------------------------------------------------------------------
5984 | Returns the result of converting the quadruple-precision floating-point
5985 | value `a' to the 32-bit unsigned integer format. The conversion
5986 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5987 | Arithmetic except that the conversion is always rounded toward zero.
5988 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5989 | if the conversion overflows, the largest unsigned integer is returned.
5990 | If 'a' is negative, the value is rounded and zero is returned; negative
5991 | values that do not round to zero will raise the inexact exception.
5992 *----------------------------------------------------------------------------*/
5994 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
5998 int old_exc_flags
= get_float_exception_flags(status
);
6000 v
= float128_to_uint64_round_to_zero(a
, status
);
6001 if (v
> 0xffffffff) {
6006 set_float_exception_flags(old_exc_flags
, status
);
6007 float_raise(float_flag_invalid
, status
);
6011 /*----------------------------------------------------------------------------
6012 | Returns the result of converting the quadruple-precision floating-point
6013 | value `a' to the single-precision floating-point format. The conversion
6014 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6016 *----------------------------------------------------------------------------*/
6018 float32
float128_to_float32(float128 a
, float_status
*status
)
6022 uint64_t aSig0
, aSig1
;
6025 aSig1
= extractFloat128Frac1( a
);
6026 aSig0
= extractFloat128Frac0( a
);
6027 aExp
= extractFloat128Exp( a
);
6028 aSign
= extractFloat128Sign( a
);
6029 if ( aExp
== 0x7FFF ) {
6030 if ( aSig0
| aSig1
) {
6031 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6033 return packFloat32( aSign
, 0xFF, 0 );
6035 aSig0
|= ( aSig1
!= 0 );
6036 shift64RightJamming( aSig0
, 18, &aSig0
);
6038 if ( aExp
|| zSig
) {
6042 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6046 /*----------------------------------------------------------------------------
6047 | Returns the result of converting the quadruple-precision floating-point
6048 | value `a' to the double-precision floating-point format. The conversion
6049 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6051 *----------------------------------------------------------------------------*/
6053 float64
float128_to_float64(float128 a
, float_status
*status
)
6057 uint64_t aSig0
, aSig1
;
6059 aSig1
= extractFloat128Frac1( a
);
6060 aSig0
= extractFloat128Frac0( a
);
6061 aExp
= extractFloat128Exp( a
);
6062 aSign
= extractFloat128Sign( a
);
6063 if ( aExp
== 0x7FFF ) {
6064 if ( aSig0
| aSig1
) {
6065 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6067 return packFloat64( aSign
, 0x7FF, 0 );
6069 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6070 aSig0
|= ( aSig1
!= 0 );
6071 if ( aExp
|| aSig0
) {
6072 aSig0
|= LIT64( 0x4000000000000000 );
6075 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6079 /*----------------------------------------------------------------------------
6080 | Returns the result of converting the quadruple-precision floating-point
6081 | value `a' to the extended double-precision floating-point format. The
6082 | conversion is performed according to the IEC/IEEE Standard for Binary
6083 | Floating-Point Arithmetic.
6084 *----------------------------------------------------------------------------*/
6086 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6090 uint64_t aSig0
, aSig1
;
6092 aSig1
= extractFloat128Frac1( a
);
6093 aSig0
= extractFloat128Frac0( a
);
6094 aExp
= extractFloat128Exp( a
);
6095 aSign
= extractFloat128Sign( a
);
6096 if ( aExp
== 0x7FFF ) {
6097 if ( aSig0
| aSig1
) {
6098 return commonNaNToFloatx80(float128ToCommonNaN(a
, status
), status
);
6100 return packFloatx80(aSign
, floatx80_infinity_high
,
6101 floatx80_infinity_low
);
6104 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6105 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6108 aSig0
|= LIT64( 0x0001000000000000 );
6110 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6111 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6115 /*----------------------------------------------------------------------------
6116 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6117 | returns the result as a quadruple-precision floating-point value. The
6118 | operation is performed according to the IEC/IEEE Standard for Binary
6119 | Floating-Point Arithmetic.
6120 *----------------------------------------------------------------------------*/
6122 float128
float128_round_to_int(float128 a
, float_status
*status
)
6126 uint64_t lastBitMask
, roundBitsMask
;
6129 aExp
= extractFloat128Exp( a
);
6130 if ( 0x402F <= aExp
) {
6131 if ( 0x406F <= aExp
) {
6132 if ( ( aExp
== 0x7FFF )
6133 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6135 return propagateFloat128NaN(a
, a
, status
);
6140 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6141 roundBitsMask
= lastBitMask
- 1;
6143 switch (status
->float_rounding_mode
) {
6144 case float_round_nearest_even
:
6145 if ( lastBitMask
) {
6146 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6147 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6150 if ( (int64_t) z
.low
< 0 ) {
6152 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6156 case float_round_ties_away
:
6158 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6160 if ((int64_t) z
.low
< 0) {
6165 case float_round_to_zero
:
6167 case float_round_up
:
6168 if (!extractFloat128Sign(z
)) {
6169 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6172 case float_round_down
:
6173 if (extractFloat128Sign(z
)) {
6174 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6180 z
.low
&= ~ roundBitsMask
;
6183 if ( aExp
< 0x3FFF ) {
6184 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6185 status
->float_exception_flags
|= float_flag_inexact
;
6186 aSign
= extractFloat128Sign( a
);
6187 switch (status
->float_rounding_mode
) {
6188 case float_round_nearest_even
:
6189 if ( ( aExp
== 0x3FFE )
6190 && ( extractFloat128Frac0( a
)
6191 | extractFloat128Frac1( a
) )
6193 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6196 case float_round_ties_away
:
6197 if (aExp
== 0x3FFE) {
6198 return packFloat128(aSign
, 0x3FFF, 0, 0);
6201 case float_round_down
:
6203 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6204 : packFloat128( 0, 0, 0, 0 );
6205 case float_round_up
:
6207 aSign
? packFloat128( 1, 0, 0, 0 )
6208 : packFloat128( 0, 0x3FFF, 0, 0 );
6210 return packFloat128( aSign
, 0, 0, 0 );
6213 lastBitMask
<<= 0x402F - aExp
;
6214 roundBitsMask
= lastBitMask
- 1;
6217 switch (status
->float_rounding_mode
) {
6218 case float_round_nearest_even
:
6219 z
.high
+= lastBitMask
>>1;
6220 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6221 z
.high
&= ~ lastBitMask
;
6224 case float_round_ties_away
:
6225 z
.high
+= lastBitMask
>>1;
6227 case float_round_to_zero
:
6229 case float_round_up
:
6230 if (!extractFloat128Sign(z
)) {
6231 z
.high
|= ( a
.low
!= 0 );
6232 z
.high
+= roundBitsMask
;
6235 case float_round_down
:
6236 if (extractFloat128Sign(z
)) {
6237 z
.high
|= (a
.low
!= 0);
6238 z
.high
+= roundBitsMask
;
6244 z
.high
&= ~ roundBitsMask
;
6246 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6247 status
->float_exception_flags
|= float_flag_inexact
;
6253 /*----------------------------------------------------------------------------
6254 | Returns the result of adding the absolute values of the quadruple-precision
6255 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6256 | before being returned. `zSign' is ignored if the result is a NaN.
6257 | The addition is performed according to the IEC/IEEE Standard for Binary
6258 | Floating-Point Arithmetic.
6259 *----------------------------------------------------------------------------*/
6261 static float128
addFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6262 float_status
*status
)
6264 int32_t aExp
, bExp
, zExp
;
6265 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6268 aSig1
= extractFloat128Frac1( a
);
6269 aSig0
= extractFloat128Frac0( a
);
6270 aExp
= extractFloat128Exp( a
);
6271 bSig1
= extractFloat128Frac1( b
);
6272 bSig0
= extractFloat128Frac0( b
);
6273 bExp
= extractFloat128Exp( b
);
6274 expDiff
= aExp
- bExp
;
6275 if ( 0 < expDiff
) {
6276 if ( aExp
== 0x7FFF ) {
6277 if (aSig0
| aSig1
) {
6278 return propagateFloat128NaN(a
, b
, status
);
6286 bSig0
|= LIT64( 0x0001000000000000 );
6288 shift128ExtraRightJamming(
6289 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6292 else if ( expDiff
< 0 ) {
6293 if ( bExp
== 0x7FFF ) {
6294 if (bSig0
| bSig1
) {
6295 return propagateFloat128NaN(a
, b
, status
);
6297 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6303 aSig0
|= LIT64( 0x0001000000000000 );
6305 shift128ExtraRightJamming(
6306 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6310 if ( aExp
== 0x7FFF ) {
6311 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6312 return propagateFloat128NaN(a
, b
, status
);
6316 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6318 if (status
->flush_to_zero
) {
6319 if (zSig0
| zSig1
) {
6320 float_raise(float_flag_output_denormal
, status
);
6322 return packFloat128(zSign
, 0, 0, 0);
6324 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6327 zSig0
|= LIT64( 0x0002000000000000 );
6331 aSig0
|= LIT64( 0x0001000000000000 );
6332 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6334 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
6337 shift128ExtraRightJamming(
6338 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6340 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6344 /*----------------------------------------------------------------------------
6345 | Returns the result of subtracting the absolute values of the quadruple-
6346 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6347 | difference is negated before being returned. `zSign' is ignored if the
6348 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6349 | Standard for Binary Floating-Point Arithmetic.
6350 *----------------------------------------------------------------------------*/
6352 static float128
subFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6353 float_status
*status
)
6355 int32_t aExp
, bExp
, zExp
;
6356 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6359 aSig1
= extractFloat128Frac1( a
);
6360 aSig0
= extractFloat128Frac0( a
);
6361 aExp
= extractFloat128Exp( a
);
6362 bSig1
= extractFloat128Frac1( b
);
6363 bSig0
= extractFloat128Frac0( b
);
6364 bExp
= extractFloat128Exp( b
);
6365 expDiff
= aExp
- bExp
;
6366 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6367 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6368 if ( 0 < expDiff
) goto aExpBigger
;
6369 if ( expDiff
< 0 ) goto bExpBigger
;
6370 if ( aExp
== 0x7FFF ) {
6371 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6372 return propagateFloat128NaN(a
, b
, status
);
6374 float_raise(float_flag_invalid
, status
);
6375 return float128_default_nan(status
);
6381 if ( bSig0
< aSig0
) goto aBigger
;
6382 if ( aSig0
< bSig0
) goto bBigger
;
6383 if ( bSig1
< aSig1
) goto aBigger
;
6384 if ( aSig1
< bSig1
) goto bBigger
;
6385 return packFloat128(status
->float_rounding_mode
== float_round_down
,
6388 if ( bExp
== 0x7FFF ) {
6389 if (bSig0
| bSig1
) {
6390 return propagateFloat128NaN(a
, b
, status
);
6392 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
6398 aSig0
|= LIT64( 0x4000000000000000 );
6400 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6401 bSig0
|= LIT64( 0x4000000000000000 );
6403 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6406 goto normalizeRoundAndPack
;
6408 if ( aExp
== 0x7FFF ) {
6409 if (aSig0
| aSig1
) {
6410 return propagateFloat128NaN(a
, b
, status
);
6418 bSig0
|= LIT64( 0x4000000000000000 );
6420 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
6421 aSig0
|= LIT64( 0x4000000000000000 );
6423 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6425 normalizeRoundAndPack
:
6427 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
6432 /*----------------------------------------------------------------------------
6433 | Returns the result of adding the quadruple-precision floating-point values
6434 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6435 | for Binary Floating-Point Arithmetic.
6436 *----------------------------------------------------------------------------*/
6438 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
6442 aSign
= extractFloat128Sign( a
);
6443 bSign
= extractFloat128Sign( b
);
6444 if ( aSign
== bSign
) {
6445 return addFloat128Sigs(a
, b
, aSign
, status
);
6448 return subFloat128Sigs(a
, b
, aSign
, status
);
6453 /*----------------------------------------------------------------------------
6454 | Returns the result of subtracting the quadruple-precision floating-point
6455 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6456 | Standard for Binary Floating-Point Arithmetic.
6457 *----------------------------------------------------------------------------*/
6459 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
6463 aSign
= extractFloat128Sign( a
);
6464 bSign
= extractFloat128Sign( b
);
6465 if ( aSign
== bSign
) {
6466 return subFloat128Sigs(a
, b
, aSign
, status
);
6469 return addFloat128Sigs(a
, b
, aSign
, status
);
6474 /*----------------------------------------------------------------------------
6475 | Returns the result of multiplying the quadruple-precision floating-point
6476 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6477 | Standard for Binary Floating-Point Arithmetic.
6478 *----------------------------------------------------------------------------*/
6480 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
6482 flag aSign
, bSign
, zSign
;
6483 int32_t aExp
, bExp
, zExp
;
6484 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
6486 aSig1
= extractFloat128Frac1( a
);
6487 aSig0
= extractFloat128Frac0( a
);
6488 aExp
= extractFloat128Exp( a
);
6489 aSign
= extractFloat128Sign( a
);
6490 bSig1
= extractFloat128Frac1( b
);
6491 bSig0
= extractFloat128Frac0( b
);
6492 bExp
= extractFloat128Exp( b
);
6493 bSign
= extractFloat128Sign( b
);
6494 zSign
= aSign
^ bSign
;
6495 if ( aExp
== 0x7FFF ) {
6496 if ( ( aSig0
| aSig1
)
6497 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6498 return propagateFloat128NaN(a
, b
, status
);
6500 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
6501 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6503 if ( bExp
== 0x7FFF ) {
6504 if (bSig0
| bSig1
) {
6505 return propagateFloat128NaN(a
, b
, status
);
6507 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6509 float_raise(float_flag_invalid
, status
);
6510 return float128_default_nan(status
);
6512 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6515 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6516 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6519 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6520 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6522 zExp
= aExp
+ bExp
- 0x4000;
6523 aSig0
|= LIT64( 0x0001000000000000 );
6524 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
6525 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
6526 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6527 zSig2
|= ( zSig3
!= 0 );
6528 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
6529 shift128ExtraRightJamming(
6530 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6533 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6537 /*----------------------------------------------------------------------------
6538 | Returns the result of dividing the quadruple-precision floating-point value
6539 | `a' by the corresponding value `b'. The operation is performed according to
6540 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6541 *----------------------------------------------------------------------------*/
6543 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
6545 flag aSign
, bSign
, zSign
;
6546 int32_t aExp
, bExp
, zExp
;
6547 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6548 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6550 aSig1
= extractFloat128Frac1( a
);
6551 aSig0
= extractFloat128Frac0( a
);
6552 aExp
= extractFloat128Exp( a
);
6553 aSign
= extractFloat128Sign( a
);
6554 bSig1
= extractFloat128Frac1( b
);
6555 bSig0
= extractFloat128Frac0( b
);
6556 bExp
= extractFloat128Exp( b
);
6557 bSign
= extractFloat128Sign( b
);
6558 zSign
= aSign
^ bSign
;
6559 if ( aExp
== 0x7FFF ) {
6560 if (aSig0
| aSig1
) {
6561 return propagateFloat128NaN(a
, b
, status
);
6563 if ( bExp
== 0x7FFF ) {
6564 if (bSig0
| bSig1
) {
6565 return propagateFloat128NaN(a
, b
, status
);
6569 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6571 if ( bExp
== 0x7FFF ) {
6572 if (bSig0
| bSig1
) {
6573 return propagateFloat128NaN(a
, b
, status
);
6575 return packFloat128( zSign
, 0, 0, 0 );
6578 if ( ( bSig0
| bSig1
) == 0 ) {
6579 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6581 float_raise(float_flag_invalid
, status
);
6582 return float128_default_nan(status
);
6584 float_raise(float_flag_divbyzero
, status
);
6585 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6587 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6590 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6591 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6593 zExp
= aExp
- bExp
+ 0x3FFD;
6595 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
6597 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6598 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
6599 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
6602 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6603 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
6604 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
6605 while ( (int64_t) rem0
< 0 ) {
6607 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
6609 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
6610 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
6611 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
6612 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
6613 while ( (int64_t) rem1
< 0 ) {
6615 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
6617 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6619 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
6620 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6624 /*----------------------------------------------------------------------------
6625 | Returns the remainder of the quadruple-precision floating-point value `a'
6626 | with respect to the corresponding value `b'. The operation is performed
6627 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6628 *----------------------------------------------------------------------------*/
6630 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
6633 int32_t aExp
, bExp
, expDiff
;
6634 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6635 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6638 aSig1
= extractFloat128Frac1( a
);
6639 aSig0
= extractFloat128Frac0( a
);
6640 aExp
= extractFloat128Exp( a
);
6641 aSign
= extractFloat128Sign( a
);
6642 bSig1
= extractFloat128Frac1( b
);
6643 bSig0
= extractFloat128Frac0( b
);
6644 bExp
= extractFloat128Exp( b
);
6645 if ( aExp
== 0x7FFF ) {
6646 if ( ( aSig0
| aSig1
)
6647 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6648 return propagateFloat128NaN(a
, b
, status
);
6652 if ( bExp
== 0x7FFF ) {
6653 if (bSig0
| bSig1
) {
6654 return propagateFloat128NaN(a
, b
, status
);
6659 if ( ( bSig0
| bSig1
) == 0 ) {
6661 float_raise(float_flag_invalid
, status
);
6662 return float128_default_nan(status
);
6664 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6667 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6668 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6670 expDiff
= aExp
- bExp
;
6671 if ( expDiff
< -1 ) return a
;
6673 aSig0
| LIT64( 0x0001000000000000 ),
6675 15 - ( expDiff
< 0 ),
6680 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6681 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6682 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6684 while ( 0 < expDiff
) {
6685 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6686 q
= ( 4 < q
) ? q
- 4 : 0;
6687 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6688 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6689 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6690 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6693 if ( -64 < expDiff
) {
6694 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6695 q
= ( 4 < q
) ? q
- 4 : 0;
6697 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6699 if ( expDiff
< 0 ) {
6700 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6703 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6705 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6706 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6709 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6710 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6713 alternateASig0
= aSig0
;
6714 alternateASig1
= aSig1
;
6716 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6717 } while ( 0 <= (int64_t) aSig0
);
6719 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6720 if ( ( sigMean0
< 0 )
6721 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6722 aSig0
= alternateASig0
;
6723 aSig1
= alternateASig1
;
6725 zSign
= ( (int64_t) aSig0
< 0 );
6726 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6727 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
6731 /*----------------------------------------------------------------------------
6732 | Returns the square root of the quadruple-precision floating-point value `a'.
6733 | The operation is performed according to the IEC/IEEE Standard for Binary
6734 | Floating-Point Arithmetic.
6735 *----------------------------------------------------------------------------*/
6737 float128
float128_sqrt(float128 a
, float_status
*status
)
6741 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6742 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6744 aSig1
= extractFloat128Frac1( a
);
6745 aSig0
= extractFloat128Frac0( a
);
6746 aExp
= extractFloat128Exp( a
);
6747 aSign
= extractFloat128Sign( a
);
6748 if ( aExp
== 0x7FFF ) {
6749 if (aSig0
| aSig1
) {
6750 return propagateFloat128NaN(a
, a
, status
);
6752 if ( ! aSign
) return a
;
6756 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6758 float_raise(float_flag_invalid
, status
);
6759 return float128_default_nan(status
);
6762 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6763 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6765 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6766 aSig0
|= LIT64( 0x0001000000000000 );
6767 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6768 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6769 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6770 doubleZSig0
= zSig0
<<1;
6771 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6772 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6773 while ( (int64_t) rem0
< 0 ) {
6776 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6778 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6779 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
6780 if ( zSig1
== 0 ) zSig1
= 1;
6781 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6782 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6783 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6784 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6785 while ( (int64_t) rem1
< 0 ) {
6787 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6789 term2
|= doubleZSig0
;
6790 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6792 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6794 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
6795 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
6799 /*----------------------------------------------------------------------------
6800 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6801 | the corresponding value `b', and 0 otherwise. The invalid exception is
6802 | raised if either operand is a NaN. Otherwise, the comparison is performed
6803 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6804 *----------------------------------------------------------------------------*/
6806 int float128_eq(float128 a
, float128 b
, float_status
*status
)
6809 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6810 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6811 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6812 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6814 float_raise(float_flag_invalid
, status
);
6819 && ( ( a
.high
== b
.high
)
6821 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6826 /*----------------------------------------------------------------------------
6827 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6828 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6829 | exception is raised if either operand is a NaN. The comparison is performed
6830 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6831 *----------------------------------------------------------------------------*/
6833 int float128_le(float128 a
, float128 b
, float_status
*status
)
6837 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6838 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6839 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6840 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6842 float_raise(float_flag_invalid
, status
);
6845 aSign
= extractFloat128Sign( a
);
6846 bSign
= extractFloat128Sign( b
);
6847 if ( aSign
!= bSign
) {
6850 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6854 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6855 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6859 /*----------------------------------------------------------------------------
6860 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6861 | the corresponding value `b', and 0 otherwise. The invalid exception is
6862 | raised if either operand is a NaN. The comparison is performed according
6863 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6864 *----------------------------------------------------------------------------*/
6866 int float128_lt(float128 a
, float128 b
, float_status
*status
)
6870 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6871 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6872 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6873 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6875 float_raise(float_flag_invalid
, status
);
6878 aSign
= extractFloat128Sign( a
);
6879 bSign
= extractFloat128Sign( b
);
6880 if ( aSign
!= bSign
) {
6883 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6887 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6888 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6892 /*----------------------------------------------------------------------------
6893 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6894 | be compared, and 0 otherwise. The invalid exception is raised if either
6895 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6896 | Standard for Binary Floating-Point Arithmetic.
6897 *----------------------------------------------------------------------------*/
6899 int float128_unordered(float128 a
, float128 b
, float_status
*status
)
6901 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6902 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6903 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6904 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6906 float_raise(float_flag_invalid
, status
);
6912 /*----------------------------------------------------------------------------
6913 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6914 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6915 | exception. The comparison is performed according to the IEC/IEEE Standard
6916 | for Binary Floating-Point Arithmetic.
6917 *----------------------------------------------------------------------------*/
6919 int float128_eq_quiet(float128 a
, float128 b
, float_status
*status
)
6922 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6923 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6924 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6925 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6927 if (float128_is_signaling_nan(a
, status
)
6928 || float128_is_signaling_nan(b
, status
)) {
6929 float_raise(float_flag_invalid
, status
);
6935 && ( ( a
.high
== b
.high
)
6937 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6942 /*----------------------------------------------------------------------------
6943 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6944 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6945 | cause an exception. Otherwise, the comparison is performed according to the
6946 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6947 *----------------------------------------------------------------------------*/
6949 int float128_le_quiet(float128 a
, float128 b
, float_status
*status
)
6953 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6954 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6955 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6956 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6958 if (float128_is_signaling_nan(a
, status
)
6959 || float128_is_signaling_nan(b
, status
)) {
6960 float_raise(float_flag_invalid
, status
);
6964 aSign
= extractFloat128Sign( a
);
6965 bSign
= extractFloat128Sign( b
);
6966 if ( aSign
!= bSign
) {
6969 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6973 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6974 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6978 /*----------------------------------------------------------------------------
6979 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6980 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6981 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6982 | Standard for Binary Floating-Point Arithmetic.
6983 *----------------------------------------------------------------------------*/
6985 int float128_lt_quiet(float128 a
, float128 b
, float_status
*status
)
6989 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6990 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6991 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6992 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6994 if (float128_is_signaling_nan(a
, status
)
6995 || float128_is_signaling_nan(b
, status
)) {
6996 float_raise(float_flag_invalid
, status
);
7000 aSign
= extractFloat128Sign( a
);
7001 bSign
= extractFloat128Sign( b
);
7002 if ( aSign
!= bSign
) {
7005 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7009 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
7010 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
7014 /*----------------------------------------------------------------------------
7015 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7016 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7017 | comparison is performed according to the IEC/IEEE Standard for Binary
7018 | Floating-Point Arithmetic.
7019 *----------------------------------------------------------------------------*/
7021 int float128_unordered_quiet(float128 a
, float128 b
, float_status
*status
)
7023 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7024 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7025 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7026 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7028 if (float128_is_signaling_nan(a
, status
)
7029 || float128_is_signaling_nan(b
, status
)) {
7030 float_raise(float_flag_invalid
, status
);
7037 static inline int floatx80_compare_internal(floatx80 a
, floatx80 b
,
7038 int is_quiet
, float_status
*status
)
7042 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7043 float_raise(float_flag_invalid
, status
);
7044 return float_relation_unordered
;
7046 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7047 ( extractFloatx80Frac( a
)<<1 ) ) ||
7048 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7049 ( extractFloatx80Frac( b
)<<1 ) )) {
7051 floatx80_is_signaling_nan(a
, status
) ||
7052 floatx80_is_signaling_nan(b
, status
)) {
7053 float_raise(float_flag_invalid
, status
);
7055 return float_relation_unordered
;
7057 aSign
= extractFloatx80Sign( a
);
7058 bSign
= extractFloatx80Sign( b
);
7059 if ( aSign
!= bSign
) {
7061 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7062 ( ( a
.low
| b
.low
) == 0 ) ) {
7064 return float_relation_equal
;
7066 return 1 - (2 * aSign
);
7069 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7070 return float_relation_equal
;
7072 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7077 int floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7079 return floatx80_compare_internal(a
, b
, 0, status
);
7082 int floatx80_compare_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
7084 return floatx80_compare_internal(a
, b
, 1, status
);
7087 static inline int float128_compare_internal(float128 a
, float128 b
,
7088 int is_quiet
, float_status
*status
)
7092 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7093 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7094 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7095 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7097 float128_is_signaling_nan(a
, status
) ||
7098 float128_is_signaling_nan(b
, status
)) {
7099 float_raise(float_flag_invalid
, status
);
7101 return float_relation_unordered
;
7103 aSign
= extractFloat128Sign( a
);
7104 bSign
= extractFloat128Sign( b
);
7105 if ( aSign
!= bSign
) {
7106 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7108 return float_relation_equal
;
7110 return 1 - (2 * aSign
);
7113 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7114 return float_relation_equal
;
7116 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7121 int float128_compare(float128 a
, float128 b
, float_status
*status
)
7123 return float128_compare_internal(a
, b
, 0, status
);
7126 int float128_compare_quiet(float128 a
, float128 b
, float_status
*status
)
7128 return float128_compare_internal(a
, b
, 1, status
);
7131 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7137 if (floatx80_invalid_encoding(a
)) {
7138 float_raise(float_flag_invalid
, status
);
7139 return floatx80_default_nan(status
);
7141 aSig
= extractFloatx80Frac( a
);
7142 aExp
= extractFloatx80Exp( a
);
7143 aSign
= extractFloatx80Sign( a
);
7145 if ( aExp
== 0x7FFF ) {
7147 return propagateFloatx80NaN(a
, a
, status
);
7161 } else if (n
< -0x10000) {
7166 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7167 aSign
, aExp
, aSig
, 0, status
);
7170 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7174 uint64_t aSig0
, aSig1
;
7176 aSig1
= extractFloat128Frac1( a
);
7177 aSig0
= extractFloat128Frac0( a
);
7178 aExp
= extractFloat128Exp( a
);
7179 aSign
= extractFloat128Sign( a
);
7180 if ( aExp
== 0x7FFF ) {
7181 if ( aSig0
| aSig1
) {
7182 return propagateFloat128NaN(a
, a
, status
);
7187 aSig0
|= LIT64( 0x0001000000000000 );
7188 } else if (aSig0
== 0 && aSig1
== 0) {
7196 } else if (n
< -0x10000) {
7201 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1