4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 s->float_exception_flags |= float_flag_input_denormal; \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
215 # define QEMU_HARDFLOAT_USE_ISINF 0
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
345 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
346 f32_check_fn pre
, f32_check_fn post
,
347 f32_check_fn fast_test
, soft_f32_op2_fn fast_op
)
349 union_float32 ua
, ub
, ur
;
354 if (unlikely(!can_use_fpu(s
))) {
358 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
359 if (unlikely(!pre(ua
, ub
))) {
362 if (fast_test
&& fast_test(ua
, ub
)) {
363 return fast_op(ua
.s
, ub
.s
, s
);
366 ur
.h
= hard(ua
.h
, ub
.h
);
367 if (unlikely(f32_is_inf(ur
))) {
368 s
->float_exception_flags
|= float_flag_overflow
;
369 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
370 if (post
== NULL
|| post(ua
, ub
)) {
377 return soft(ua
.s
, ub
.s
, s
);
380 static inline float64
381 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
382 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
383 f64_check_fn pre
, f64_check_fn post
,
384 f64_check_fn fast_test
, soft_f64_op2_fn fast_op
)
386 union_float64 ua
, ub
, ur
;
391 if (unlikely(!can_use_fpu(s
))) {
395 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
396 if (unlikely(!pre(ua
, ub
))) {
399 if (fast_test
&& fast_test(ua
, ub
)) {
400 return fast_op(ua
.s
, ub
.s
, s
);
403 ur
.h
= hard(ua
.h
, ub
.h
);
404 if (unlikely(f64_is_inf(ur
))) {
405 s
->float_exception_flags
|= float_flag_overflow
;
406 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
)) {
407 if (post
== NULL
|| post(ua
, ub
)) {
414 return soft(ua
.s
, ub
.s
, s
);
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the single-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
421 static inline uint32_t extractFloat32Frac(float32 a
)
423 return float32_val(a
) & 0x007FFFFF;
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the single-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
430 static inline int extractFloat32Exp(float32 a
)
432 return (float32_val(a
) >> 23) & 0xFF;
435 /*----------------------------------------------------------------------------
436 | Returns the sign bit of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
439 static inline flag
extractFloat32Sign(float32 a
)
441 return float32_val(a
) >> 31;
444 /*----------------------------------------------------------------------------
445 | Returns the fraction bits of the double-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
448 static inline uint64_t extractFloat64Frac(float64 a
)
450 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
453 /*----------------------------------------------------------------------------
454 | Returns the exponent bits of the double-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
457 static inline int extractFloat64Exp(float64 a
)
459 return (float64_val(a
) >> 52) & 0x7FF;
462 /*----------------------------------------------------------------------------
463 | Returns the sign bit of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
466 static inline flag
extractFloat64Sign(float64 a
)
468 return float64_val(a
) >> 63;
472 * Classify a floating point number. Everything above float_class_qnan
473 * is a NaN so cls >= float_class_qnan is any NaN.
476 typedef enum __attribute__ ((__packed__
)) {
477 float_class_unclassified
,
481 float_class_qnan
, /* all NaNs from here */
485 /* Simple helpers for checking if, or what kind of, NaN we have */
486 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
488 return unlikely(c
>= float_class_qnan
);
491 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
493 return c
== float_class_snan
;
496 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
498 return c
== float_class_qnan
;
502 * Structure holding all of the decomposed parts of a float. The
503 * exponent is unbiased and the fraction is normalized. All
504 * calculations are done with a 64 bit fraction and then rounded as
505 * appropriate for the final format.
507 * Thanks to the packed FloatClass a decent compiler should be able to
508 * fit the whole structure into registers and avoid using the stack
509 * for parameter passing.
519 #define DECOMPOSED_BINARY_POINT (64 - 2)
520 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
521 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
523 /* Structure holding all of the relevant parameters for a format.
524 * exp_size: the size of the exponent field
525 * exp_bias: the offset applied to the exponent field
526 * exp_max: the maximum normalised exponent
527 * frac_size: the size of the fraction field
528 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
529 * The following are computed based the size of fraction
530 * frac_lsb: least significant bit of fraction
531 * frac_lsbm1: the bit below the least significant bit (for rounding)
532 * round_mask/roundeven_mask: masks used for rounding
533 * The following optional modifiers are available:
534 * arm_althp: handle ARM Alternative Half Precision
545 uint64_t roundeven_mask
;
549 /* Expand fields based on the size of exponent and fraction */
550 #define FLOAT_PARAMS(E, F) \
552 .exp_bias = ((1 << E) - 1) >> 1, \
553 .exp_max = (1 << E) - 1, \
555 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
556 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
557 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
558 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
559 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
561 static const FloatFmt float16_params
= {
565 static const FloatFmt float16_params_ahp
= {
570 static const FloatFmt float32_params
= {
574 static const FloatFmt float64_params
= {
578 /* Unpack a float to parts, but do not canonicalize. */
579 static inline FloatParts
unpack_raw(FloatFmt fmt
, uint64_t raw
)
581 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
583 return (FloatParts
) {
584 .cls
= float_class_unclassified
,
585 .sign
= extract64(raw
, sign_pos
, 1),
586 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
587 .frac
= extract64(raw
, 0, fmt
.frac_size
),
591 static inline FloatParts
float16_unpack_raw(float16 f
)
593 return unpack_raw(float16_params
, f
);
596 static inline FloatParts
float32_unpack_raw(float32 f
)
598 return unpack_raw(float32_params
, f
);
601 static inline FloatParts
float64_unpack_raw(float64 f
)
603 return unpack_raw(float64_params
, f
);
606 /* Pack a float from parts, but do not canonicalize. */
607 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts p
)
609 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
610 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
611 return deposit64(ret
, sign_pos
, 1, p
.sign
);
614 static inline float16
float16_pack_raw(FloatParts p
)
616 return make_float16(pack_raw(float16_params
, p
));
619 static inline float32
float32_pack_raw(FloatParts p
)
621 return make_float32(pack_raw(float32_params
, p
));
624 static inline float64
float64_pack_raw(FloatParts p
)
626 return make_float64(pack_raw(float64_params
, p
));
629 /*----------------------------------------------------------------------------
630 | Functions and definitions to determine: (1) whether tininess for underflow
631 | is detected before or after rounding by default, (2) what (if anything)
632 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
633 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
634 | are propagated from function inputs to output. These details are target-
636 *----------------------------------------------------------------------------*/
637 #include "softfloat-specialize.inc.c"
639 /* Canonicalize EXP and FRAC, setting CLS. */
640 static FloatParts
sf_canonicalize(FloatParts part
, const FloatFmt
*parm
,
641 float_status
*status
)
643 if (part
.exp
== parm
->exp_max
&& !parm
->arm_althp
) {
644 if (part
.frac
== 0) {
645 part
.cls
= float_class_inf
;
647 part
.frac
<<= parm
->frac_shift
;
648 part
.cls
= (parts_is_snan_frac(part
.frac
, status
)
649 ? float_class_snan
: float_class_qnan
);
651 } else if (part
.exp
== 0) {
652 if (likely(part
.frac
== 0)) {
653 part
.cls
= float_class_zero
;
654 } else if (status
->flush_inputs_to_zero
) {
655 float_raise(float_flag_input_denormal
, status
);
656 part
.cls
= float_class_zero
;
659 int shift
= clz64(part
.frac
) - 1;
660 part
.cls
= float_class_normal
;
661 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
665 part
.cls
= float_class_normal
;
666 part
.exp
-= parm
->exp_bias
;
667 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
672 /* Round and uncanonicalize a floating-point number by parts. There
673 * are FRAC_SHIFT bits that may require rounding at the bottom of the
674 * fraction; these bits will be removed. The exponent will be biased
675 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
678 static FloatParts
round_canonical(FloatParts p
, float_status
*s
,
679 const FloatFmt
*parm
)
681 const uint64_t frac_lsb
= parm
->frac_lsb
;
682 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
683 const uint64_t round_mask
= parm
->round_mask
;
684 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
685 const int exp_max
= parm
->exp_max
;
686 const int frac_shift
= parm
->frac_shift
;
695 case float_class_normal
:
696 switch (s
->float_rounding_mode
) {
697 case float_round_nearest_even
:
698 overflow_norm
= false;
699 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
701 case float_round_ties_away
:
702 overflow_norm
= false;
705 case float_round_to_zero
:
706 overflow_norm
= true;
710 inc
= p
.sign
? 0 : round_mask
;
711 overflow_norm
= p
.sign
;
713 case float_round_down
:
714 inc
= p
.sign
? round_mask
: 0;
715 overflow_norm
= !p
.sign
;
717 case float_round_to_odd
:
718 overflow_norm
= true;
719 inc
= frac
& frac_lsb
? 0 : round_mask
;
722 g_assert_not_reached();
725 exp
+= parm
->exp_bias
;
726 if (likely(exp
> 0)) {
727 if (frac
& round_mask
) {
728 flags
|= float_flag_inexact
;
730 if (frac
& DECOMPOSED_OVERFLOW_BIT
) {
737 if (parm
->arm_althp
) {
738 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
739 if (unlikely(exp
> exp_max
)) {
740 /* Overflow. Return the maximum normal. */
741 flags
= float_flag_invalid
;
745 } else if (unlikely(exp
>= exp_max
)) {
746 flags
|= float_flag_overflow
| float_flag_inexact
;
751 p
.cls
= float_class_inf
;
755 } else if (s
->flush_to_zero
) {
756 flags
|= float_flag_output_denormal
;
757 p
.cls
= float_class_zero
;
760 bool is_tiny
= (s
->float_detect_tininess
761 == float_tininess_before_rounding
)
763 || !((frac
+ inc
) & DECOMPOSED_OVERFLOW_BIT
);
765 shift64RightJamming(frac
, 1 - exp
, &frac
);
766 if (frac
& round_mask
) {
767 /* Need to recompute round-to-even. */
768 switch (s
->float_rounding_mode
) {
769 case float_round_nearest_even
:
770 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
773 case float_round_to_odd
:
774 inc
= frac
& frac_lsb
? 0 : round_mask
;
777 flags
|= float_flag_inexact
;
781 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
784 if (is_tiny
&& (flags
& float_flag_inexact
)) {
785 flags
|= float_flag_underflow
;
787 if (exp
== 0 && frac
== 0) {
788 p
.cls
= float_class_zero
;
793 case float_class_zero
:
799 case float_class_inf
:
801 assert(!parm
->arm_althp
);
806 case float_class_qnan
:
807 case float_class_snan
:
808 assert(!parm
->arm_althp
);
810 frac
>>= parm
->frac_shift
;
814 g_assert_not_reached();
817 float_raise(flags
, s
);
823 /* Explicit FloatFmt version */
824 static FloatParts
float16a_unpack_canonical(float16 f
, float_status
*s
,
825 const FloatFmt
*params
)
827 return sf_canonicalize(float16_unpack_raw(f
), params
, s
);
830 static FloatParts
float16_unpack_canonical(float16 f
, float_status
*s
)
832 return float16a_unpack_canonical(f
, s
, &float16_params
);
835 static float16
float16a_round_pack_canonical(FloatParts p
, float_status
*s
,
836 const FloatFmt
*params
)
838 return float16_pack_raw(round_canonical(p
, s
, params
));
841 static float16
float16_round_pack_canonical(FloatParts p
, float_status
*s
)
843 return float16a_round_pack_canonical(p
, s
, &float16_params
);
846 static FloatParts
float32_unpack_canonical(float32 f
, float_status
*s
)
848 return sf_canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
851 static float32
float32_round_pack_canonical(FloatParts p
, float_status
*s
)
853 return float32_pack_raw(round_canonical(p
, s
, &float32_params
));
856 static FloatParts
float64_unpack_canonical(float64 f
, float_status
*s
)
858 return sf_canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
861 static float64
float64_round_pack_canonical(FloatParts p
, float_status
*s
)
863 return float64_pack_raw(round_canonical(p
, s
, &float64_params
));
866 static FloatParts
return_nan(FloatParts a
, float_status
*s
)
869 case float_class_snan
:
870 s
->float_exception_flags
|= float_flag_invalid
;
871 a
= parts_silence_nan(a
, s
);
873 case float_class_qnan
:
874 if (s
->default_nan_mode
) {
875 return parts_default_nan(s
);
880 g_assert_not_reached();
885 static FloatParts
pick_nan(FloatParts a
, FloatParts b
, float_status
*s
)
887 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
888 s
->float_exception_flags
|= float_flag_invalid
;
891 if (s
->default_nan_mode
) {
892 return parts_default_nan(s
);
894 if (pickNaN(a
.cls
, b
.cls
,
896 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
))) {
899 if (is_snan(a
.cls
)) {
900 return parts_silence_nan(a
, s
);
906 static FloatParts
pick_nan_muladd(FloatParts a
, FloatParts b
, FloatParts c
,
907 bool inf_zero
, float_status
*s
)
911 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
912 s
->float_exception_flags
|= float_flag_invalid
;
915 which
= pickNaNMulAdd(a
.cls
, b
.cls
, c
.cls
, inf_zero
, s
);
917 if (s
->default_nan_mode
) {
918 /* Note that this check is after pickNaNMulAdd so that function
919 * has an opportunity to set the Invalid flag.
934 return parts_default_nan(s
);
936 g_assert_not_reached();
939 if (is_snan(a
.cls
)) {
940 return parts_silence_nan(a
, s
);
946 * Returns the result of adding or subtracting the values of the
947 * floating-point values `a' and `b'. The operation is performed
948 * according to the IEC/IEEE Standard for Binary Floating-Point
952 static FloatParts
addsub_floats(FloatParts a
, FloatParts b
, bool subtract
,
955 bool a_sign
= a
.sign
;
956 bool b_sign
= b
.sign
^ subtract
;
958 if (a_sign
!= b_sign
) {
961 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
962 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
963 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
964 a
.frac
= a
.frac
- b
.frac
;
966 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
967 a
.frac
= b
.frac
- a
.frac
;
973 a
.cls
= float_class_zero
;
974 a
.sign
= s
->float_rounding_mode
== float_round_down
;
976 int shift
= clz64(a
.frac
) - 1;
977 a
.frac
= a
.frac
<< shift
;
978 a
.exp
= a
.exp
- shift
;
983 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
984 return pick_nan(a
, b
, s
);
986 if (a
.cls
== float_class_inf
) {
987 if (b
.cls
== float_class_inf
) {
988 float_raise(float_flag_invalid
, s
);
989 return parts_default_nan(s
);
993 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
994 a
.sign
= s
->float_rounding_mode
== float_round_down
;
997 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
1001 if (b
.cls
== float_class_zero
) {
1006 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1007 if (a
.exp
> b
.exp
) {
1008 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
1009 } else if (a
.exp
< b
.exp
) {
1010 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
1014 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
1015 shift64RightJamming(a
.frac
, 1, &a
.frac
);
1020 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1021 return pick_nan(a
, b
, s
);
1023 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1026 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1031 g_assert_not_reached();
1035 * Returns the result of adding or subtracting the floating-point
1036 * values `a' and `b'. The operation is performed according to the
1037 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1040 float16 QEMU_FLATTEN
float16_add(float16 a
, float16 b
, float_status
*status
)
1042 FloatParts pa
= float16_unpack_canonical(a
, status
);
1043 FloatParts pb
= float16_unpack_canonical(b
, status
);
1044 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
1046 return float16_round_pack_canonical(pr
, status
);
1049 float16 QEMU_FLATTEN
float16_sub(float16 a
, float16 b
, float_status
*status
)
1051 FloatParts pa
= float16_unpack_canonical(a
, status
);
1052 FloatParts pb
= float16_unpack_canonical(b
, status
);
1053 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
1055 return float16_round_pack_canonical(pr
, status
);
1058 static float32 QEMU_SOFTFLOAT_ATTR
1059 soft_f32_addsub(float32 a
, float32 b
, bool subtract
, float_status
*status
)
1061 FloatParts pa
= float32_unpack_canonical(a
, status
);
1062 FloatParts pb
= float32_unpack_canonical(b
, status
);
1063 FloatParts pr
= addsub_floats(pa
, pb
, subtract
, status
);
1065 return float32_round_pack_canonical(pr
, status
);
1068 static inline float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1070 return soft_f32_addsub(a
, b
, false, status
);
1073 static inline float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1075 return soft_f32_addsub(a
, b
, true, status
);
1078 static float64 QEMU_SOFTFLOAT_ATTR
1079 soft_f64_addsub(float64 a
, float64 b
, bool subtract
, float_status
*status
)
1081 FloatParts pa
= float64_unpack_canonical(a
, status
);
1082 FloatParts pb
= float64_unpack_canonical(b
, status
);
1083 FloatParts pr
= addsub_floats(pa
, pb
, subtract
, status
);
1085 return float64_round_pack_canonical(pr
, status
);
1088 static inline float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1090 return soft_f64_addsub(a
, b
, false, status
);
1093 static inline float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1095 return soft_f64_addsub(a
, b
, true, status
);
1098 static float hard_f32_add(float a
, float b
)
1103 static float hard_f32_sub(float a
, float b
)
1108 static double hard_f64_add(double a
, double b
)
1113 static double hard_f64_sub(double a
, double b
)
1118 static bool f32_addsub_post(union_float32 a
, union_float32 b
)
1120 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1121 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1123 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1126 static bool f64_addsub_post(union_float64 a
, union_float64 b
)
1128 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1129 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1131 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1135 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1136 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1138 return float32_gen2(a
, b
, s
, hard
, soft
,
1139 f32_is_zon2
, f32_addsub_post
, NULL
, NULL
);
1142 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1143 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1145 return float64_gen2(a
, b
, s
, hard
, soft
,
1146 f64_is_zon2
, f64_addsub_post
, NULL
, NULL
);
1149 float32 QEMU_FLATTEN
1150 float32_add(float32 a
, float32 b
, float_status
*s
)
1152 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1155 float32 QEMU_FLATTEN
1156 float32_sub(float32 a
, float32 b
, float_status
*s
)
1158 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1161 float64 QEMU_FLATTEN
1162 float64_add(float64 a
, float64 b
, float_status
*s
)
1164 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1167 float64 QEMU_FLATTEN
1168 float64_sub(float64 a
, float64 b
, float_status
*s
)
1170 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1174 * Returns the result of multiplying the floating-point values `a' and
1175 * `b'. The operation is performed according to the IEC/IEEE Standard
1176 * for Binary Floating-Point Arithmetic.
1179 static FloatParts
mul_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1181 bool sign
= a
.sign
^ b
.sign
;
1183 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1185 int exp
= a
.exp
+ b
.exp
;
1187 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1188 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1189 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1190 shift64RightJamming(lo
, 1, &lo
);
1200 /* handle all the NaN cases */
1201 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1202 return pick_nan(a
, b
, s
);
1204 /* Inf * Zero == NaN */
1205 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
1206 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
1207 s
->float_exception_flags
|= float_flag_invalid
;
1208 return parts_default_nan(s
);
1210 /* Multiply by 0 or Inf */
1211 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1215 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1219 g_assert_not_reached();
1222 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1224 FloatParts pa
= float16_unpack_canonical(a
, status
);
1225 FloatParts pb
= float16_unpack_canonical(b
, status
);
1226 FloatParts pr
= mul_floats(pa
, pb
, status
);
1228 return float16_round_pack_canonical(pr
, status
);
1231 static float32 QEMU_SOFTFLOAT_ATTR
1232 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1234 FloatParts pa
= float32_unpack_canonical(a
, status
);
1235 FloatParts pb
= float32_unpack_canonical(b
, status
);
1236 FloatParts pr
= mul_floats(pa
, pb
, status
);
1238 return float32_round_pack_canonical(pr
, status
);
1241 static float64 QEMU_SOFTFLOAT_ATTR
1242 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1244 FloatParts pa
= float64_unpack_canonical(a
, status
);
1245 FloatParts pb
= float64_unpack_canonical(b
, status
);
1246 FloatParts pr
= mul_floats(pa
, pb
, status
);
1248 return float64_round_pack_canonical(pr
, status
);
1251 static float hard_f32_mul(float a
, float b
)
1256 static double hard_f64_mul(double a
, double b
)
1261 static bool f32_mul_fast_test(union_float32 a
, union_float32 b
)
1263 return float32_is_zero(a
.s
) || float32_is_zero(b
.s
);
1266 static bool f64_mul_fast_test(union_float64 a
, union_float64 b
)
1268 return float64_is_zero(a
.s
) || float64_is_zero(b
.s
);
1271 static float32
f32_mul_fast_op(float32 a
, float32 b
, float_status
*s
)
1273 bool signbit
= float32_is_neg(a
) ^ float32_is_neg(b
);
1275 return float32_set_sign(float32_zero
, signbit
);
1278 static float64
f64_mul_fast_op(float64 a
, float64 b
, float_status
*s
)
1280 bool signbit
= float64_is_neg(a
) ^ float64_is_neg(b
);
1282 return float64_set_sign(float64_zero
, signbit
);
1285 float32 QEMU_FLATTEN
1286 float32_mul(float32 a
, float32 b
, float_status
*s
)
1288 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1289 f32_is_zon2
, NULL
, f32_mul_fast_test
, f32_mul_fast_op
);
1292 float64 QEMU_FLATTEN
1293 float64_mul(float64 a
, float64 b
, float_status
*s
)
1295 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1296 f64_is_zon2
, NULL
, f64_mul_fast_test
, f64_mul_fast_op
);
1300 * Returns the result of multiplying the floating-point values `a' and
1301 * `b' then adding 'c', with no intermediate rounding step after the
1302 * multiplication. The operation is performed according to the
1303 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1304 * The flags argument allows the caller to select negation of the
1305 * addend, the intermediate product, or the final result. (The
1306 * difference between this and having the caller do a separate
1307 * negation is that negating externally will flip the sign bit on
1311 static FloatParts
muladd_floats(FloatParts a
, FloatParts b
, FloatParts c
,
1312 int flags
, float_status
*s
)
1314 bool inf_zero
= ((1 << a
.cls
) | (1 << b
.cls
)) ==
1315 ((1 << float_class_inf
) | (1 << float_class_zero
));
1317 bool sign_flip
= flags
& float_muladd_negate_result
;
1322 /* It is implementation-defined whether the cases of (0,inf,qnan)
1323 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1324 * they return if they do), so we have to hand this information
1325 * off to the target-specific pick-a-NaN routine.
1327 if (is_nan(a
.cls
) || is_nan(b
.cls
) || is_nan(c
.cls
)) {
1328 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
1332 s
->float_exception_flags
|= float_flag_invalid
;
1333 return parts_default_nan(s
);
1336 if (flags
& float_muladd_negate_c
) {
1340 p_sign
= a
.sign
^ b
.sign
;
1342 if (flags
& float_muladd_negate_product
) {
1346 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_inf
) {
1347 p_class
= float_class_inf
;
1348 } else if (a
.cls
== float_class_zero
|| b
.cls
== float_class_zero
) {
1349 p_class
= float_class_zero
;
1351 p_class
= float_class_normal
;
1354 if (c
.cls
== float_class_inf
) {
1355 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
1356 s
->float_exception_flags
|= float_flag_invalid
;
1357 return parts_default_nan(s
);
1359 a
.cls
= float_class_inf
;
1360 a
.sign
= c
.sign
^ sign_flip
;
1365 if (p_class
== float_class_inf
) {
1366 a
.cls
= float_class_inf
;
1367 a
.sign
= p_sign
^ sign_flip
;
1371 if (p_class
== float_class_zero
) {
1372 if (c
.cls
== float_class_zero
) {
1373 if (p_sign
!= c
.sign
) {
1374 p_sign
= s
->float_rounding_mode
== float_round_down
;
1377 } else if (flags
& float_muladd_halve_result
) {
1380 c
.sign
^= sign_flip
;
1384 /* a & b should be normals now... */
1385 assert(a
.cls
== float_class_normal
&&
1386 b
.cls
== float_class_normal
);
1388 p_exp
= a
.exp
+ b
.exp
;
1390 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1393 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1394 /* binary point now at bit 124 */
1396 /* check for overflow */
1397 if (hi
& (1ULL << (DECOMPOSED_BINARY_POINT
* 2 + 1 - 64))) {
1398 shift128RightJamming(hi
, lo
, 1, &hi
, &lo
);
1403 if (c
.cls
== float_class_zero
) {
1404 /* move binary point back to 62 */
1405 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1407 int exp_diff
= p_exp
- c
.exp
;
1408 if (p_sign
== c
.sign
) {
1410 if (exp_diff
<= 0) {
1411 shift128RightJamming(hi
, lo
,
1412 DECOMPOSED_BINARY_POINT
- exp_diff
,
1417 uint64_t c_hi
, c_lo
;
1418 /* shift c to the same binary point as the product (124) */
1421 shift128RightJamming(c_hi
, c_lo
,
1424 add128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1425 /* move binary point back to 62 */
1426 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1429 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1430 shift64RightJamming(lo
, 1, &lo
);
1436 uint64_t c_hi
, c_lo
;
1437 /* make C binary point match product at bit 124 */
1441 if (exp_diff
<= 0) {
1442 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1445 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1446 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1448 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1453 shift128RightJamming(c_hi
, c_lo
,
1456 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1459 if (hi
== 0 && lo
== 0) {
1460 a
.cls
= float_class_zero
;
1461 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1462 a
.sign
^= sign_flip
;
1469 shift
= clz64(lo
) + 64;
1471 /* Normalizing to a binary point of 124 is the
1472 correct adjust for the exponent. However since we're
1473 shifting, we might as well put the binary point back
1474 at 62 where we really want it. Therefore shift as
1475 if we're leaving 1 bit at the top of the word, but
1476 adjust the exponent as if we're leaving 3 bits. */
1479 lo
= lo
<< (shift
- 64);
1481 hi
= (hi
<< shift
) | (lo
>> (64 - shift
));
1482 lo
= hi
| ((lo
<< shift
) != 0);
1489 if (flags
& float_muladd_halve_result
) {
1493 /* finally prepare our result */
1494 a
.cls
= float_class_normal
;
1495 a
.sign
= p_sign
^ sign_flip
;
1502 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1503 int flags
, float_status
*status
)
1505 FloatParts pa
= float16_unpack_canonical(a
, status
);
1506 FloatParts pb
= float16_unpack_canonical(b
, status
);
1507 FloatParts pc
= float16_unpack_canonical(c
, status
);
1508 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1510 return float16_round_pack_canonical(pr
, status
);
1513 static float32 QEMU_SOFTFLOAT_ATTR
1514 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1515 float_status
*status
)
1517 FloatParts pa
= float32_unpack_canonical(a
, status
);
1518 FloatParts pb
= float32_unpack_canonical(b
, status
);
1519 FloatParts pc
= float32_unpack_canonical(c
, status
);
1520 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1522 return float32_round_pack_canonical(pr
, status
);
1525 static float64 QEMU_SOFTFLOAT_ATTR
1526 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1527 float_status
*status
)
1529 FloatParts pa
= float64_unpack_canonical(a
, status
);
1530 FloatParts pb
= float64_unpack_canonical(b
, status
);
1531 FloatParts pc
= float64_unpack_canonical(c
, status
);
1532 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1534 return float64_round_pack_canonical(pr
, status
);
1537 static bool force_soft_fma
;
1539 float32 QEMU_FLATTEN
1540 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1542 union_float32 ua
, ub
, uc
, ur
;
1548 if (unlikely(!can_use_fpu(s
))) {
1551 if (unlikely(flags
& float_muladd_halve_result
)) {
1555 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1556 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1560 if (unlikely(force_soft_fma
)) {
1565 * When (a || b) == 0, there's no need to check for under/over flow,
1566 * since we know the addend is (normal || 0) and the product is 0.
1568 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1572 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1573 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1574 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1576 if (flags
& float_muladd_negate_c
) {
1581 union_float32 ua_orig
= ua
;
1582 union_float32 uc_orig
= uc
;
1584 if (flags
& float_muladd_negate_product
) {
1587 if (flags
& float_muladd_negate_c
) {
1591 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1593 if (unlikely(f32_is_inf(ur
))) {
1594 s
->float_exception_flags
|= float_flag_overflow
;
1595 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1601 if (flags
& float_muladd_negate_result
) {
1602 return float32_chs(ur
.s
);
1607 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1610 float64 QEMU_FLATTEN
1611 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1613 union_float64 ua
, ub
, uc
, ur
;
1619 if (unlikely(!can_use_fpu(s
))) {
1622 if (unlikely(flags
& float_muladd_halve_result
)) {
1626 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1627 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1631 if (unlikely(force_soft_fma
)) {
1636 * When (a || b) == 0, there's no need to check for under/over flow,
1637 * since we know the addend is (normal || 0) and the product is 0.
1639 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1643 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1644 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1645 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1647 if (flags
& float_muladd_negate_c
) {
1652 union_float64 ua_orig
= ua
;
1653 union_float64 uc_orig
= uc
;
1655 if (flags
& float_muladd_negate_product
) {
1658 if (flags
& float_muladd_negate_c
) {
1662 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1664 if (unlikely(f64_is_inf(ur
))) {
1665 s
->float_exception_flags
|= float_flag_overflow
;
1666 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1672 if (flags
& float_muladd_negate_result
) {
1673 return float64_chs(ur
.s
);
1678 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1682 * Returns the result of dividing the floating-point value `a' by the
1683 * corresponding value `b'. The operation is performed according to
1684 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1687 static FloatParts
div_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1689 bool sign
= a
.sign
^ b
.sign
;
1691 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1692 uint64_t n0
, n1
, q
, r
;
1693 int exp
= a
.exp
- b
.exp
;
1696 * We want a 2*N / N-bit division to produce exactly an N-bit
1697 * result, so that we do not lose any precision and so that we
1698 * do not have to renormalize afterward. If A.frac < B.frac,
1699 * then division would produce an (N-1)-bit result; shift A left
1700 * by one to produce the an N-bit result, and decrement the
1701 * exponent to match.
1703 * The udiv_qrnnd algorithm that we're using requires normalization,
1704 * i.e. the msb of the denominator must be set. Since we know that
1705 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1706 * by one (more), and the remainder must be shifted right by one.
1708 if (a
.frac
< b
.frac
) {
1710 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 2, &n1
, &n0
);
1712 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1714 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
<< 1);
1717 * Set lsb if there is a remainder, to set inexact.
1718 * As mentioned above, to find the actual value of the remainder we
1719 * would need to shift right, but (1) we are only concerned about
1720 * non-zero-ness, and (2) the remainder will always be even because
1721 * both inputs to the division primitive are even.
1723 a
.frac
= q
| (r
!= 0);
1728 /* handle all the NaN cases */
1729 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1730 return pick_nan(a
, b
, s
);
1732 /* 0/0 or Inf/Inf */
1735 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1736 s
->float_exception_flags
|= float_flag_invalid
;
1737 return parts_default_nan(s
);
1739 /* Inf / x or 0 / x */
1740 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1745 if (b
.cls
== float_class_zero
) {
1746 s
->float_exception_flags
|= float_flag_divbyzero
;
1747 a
.cls
= float_class_inf
;
1752 if (b
.cls
== float_class_inf
) {
1753 a
.cls
= float_class_zero
;
1757 g_assert_not_reached();
1760 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1762 FloatParts pa
= float16_unpack_canonical(a
, status
);
1763 FloatParts pb
= float16_unpack_canonical(b
, status
);
1764 FloatParts pr
= div_floats(pa
, pb
, status
);
1766 return float16_round_pack_canonical(pr
, status
);
1769 static float32 QEMU_SOFTFLOAT_ATTR
1770 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1772 FloatParts pa
= float32_unpack_canonical(a
, status
);
1773 FloatParts pb
= float32_unpack_canonical(b
, status
);
1774 FloatParts pr
= div_floats(pa
, pb
, status
);
1776 return float32_round_pack_canonical(pr
, status
);
1779 static float64 QEMU_SOFTFLOAT_ATTR
1780 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1782 FloatParts pa
= float64_unpack_canonical(a
, status
);
1783 FloatParts pb
= float64_unpack_canonical(b
, status
);
1784 FloatParts pr
= div_floats(pa
, pb
, status
);
1786 return float64_round_pack_canonical(pr
, status
);
1789 static float hard_f32_div(float a
, float b
)
1794 static double hard_f64_div(double a
, double b
)
1799 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1801 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1802 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1803 fpclassify(b
.h
) == FP_NORMAL
;
1805 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1808 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1810 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1811 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1812 fpclassify(b
.h
) == FP_NORMAL
;
1814 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1817 static bool f32_div_post(union_float32 a
, union_float32 b
)
1819 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1820 return fpclassify(a
.h
) != FP_ZERO
;
1822 return !float32_is_zero(a
.s
);
1825 static bool f64_div_post(union_float64 a
, union_float64 b
)
1827 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1828 return fpclassify(a
.h
) != FP_ZERO
;
1830 return !float64_is_zero(a
.s
);
1833 float32 QEMU_FLATTEN
1834 float32_div(float32 a
, float32 b
, float_status
*s
)
1836 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
1837 f32_div_pre
, f32_div_post
, NULL
, NULL
);
1840 float64 QEMU_FLATTEN
1841 float64_div(float64 a
, float64 b
, float_status
*s
)
1843 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
1844 f64_div_pre
, f64_div_post
, NULL
, NULL
);
1848 * Float to Float conversions
1850 * Returns the result of converting one float format to another. The
1851 * conversion is performed according to the IEC/IEEE Standard for
1852 * Binary Floating-Point Arithmetic.
1854 * The float_to_float helper only needs to take care of raising
1855 * invalid exceptions and handling the conversion on NaNs.
1858 static FloatParts
float_to_float(FloatParts a
, const FloatFmt
*dstf
,
1861 if (dstf
->arm_althp
) {
1863 case float_class_qnan
:
1864 case float_class_snan
:
1865 /* There is no NaN in the destination format. Raise Invalid
1866 * and return a zero with the sign of the input NaN.
1868 s
->float_exception_flags
|= float_flag_invalid
;
1869 a
.cls
= float_class_zero
;
1874 case float_class_inf
:
1875 /* There is no Inf in the destination format. Raise Invalid
1876 * and return the maximum normal with the correct sign.
1878 s
->float_exception_flags
|= float_flag_invalid
;
1879 a
.cls
= float_class_normal
;
1880 a
.exp
= dstf
->exp_max
;
1881 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1887 } else if (is_nan(a
.cls
)) {
1888 if (is_snan(a
.cls
)) {
1889 s
->float_exception_flags
|= float_flag_invalid
;
1890 a
= parts_silence_nan(a
, s
);
1892 if (s
->default_nan_mode
) {
1893 return parts_default_nan(s
);
1899 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1901 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1902 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1903 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1904 return float32_round_pack_canonical(pr
, s
);
1907 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
1909 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1910 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1911 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1912 return float64_round_pack_canonical(pr
, s
);
1915 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
1917 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1918 FloatParts p
= float32_unpack_canonical(a
, s
);
1919 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1920 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1923 static float64 QEMU_SOFTFLOAT_ATTR
1924 soft_float32_to_float64(float32 a
, float_status
*s
)
1926 FloatParts p
= float32_unpack_canonical(a
, s
);
1927 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1928 return float64_round_pack_canonical(pr
, s
);
1931 float64
float32_to_float64(float32 a
, float_status
*s
)
1933 if (likely(float32_is_normal(a
))) {
1934 /* Widening conversion can never produce inexact results. */
1940 } else if (float32_is_zero(a
)) {
1941 return float64_set_sign(float64_zero
, float32_is_neg(a
));
1943 return soft_float32_to_float64(a
, s
);
1947 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
1949 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1950 FloatParts p
= float64_unpack_canonical(a
, s
);
1951 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1952 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1955 float32
float64_to_float32(float64 a
, float_status
*s
)
1957 FloatParts p
= float64_unpack_canonical(a
, s
);
1958 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1959 return float32_round_pack_canonical(pr
, s
);
1963 * Rounds the floating-point value `a' to an integer, and returns the
1964 * result as a floating-point value. The operation is performed
1965 * according to the IEC/IEEE Standard for Binary Floating-Point
1969 static FloatParts
round_to_int(FloatParts a
, int rmode
,
1970 int scale
, float_status
*s
)
1973 case float_class_qnan
:
1974 case float_class_snan
:
1975 return return_nan(a
, s
);
1977 case float_class_zero
:
1978 case float_class_inf
:
1979 /* already "integral" */
1982 case float_class_normal
:
1983 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
1986 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
1987 /* already integral */
1992 /* all fractional */
1993 s
->float_exception_flags
|= float_flag_inexact
;
1995 case float_round_nearest_even
:
1996 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
1998 case float_round_ties_away
:
1999 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
2001 case float_round_to_zero
:
2004 case float_round_up
:
2007 case float_round_down
:
2010 case float_round_to_odd
:
2014 g_assert_not_reached();
2018 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
2021 a
.cls
= float_class_zero
;
2024 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
2025 uint64_t frac_lsbm1
= frac_lsb
>> 1;
2026 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
2027 uint64_t rnd_mask
= rnd_even_mask
>> 1;
2031 case float_round_nearest_even
:
2032 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2034 case float_round_ties_away
:
2037 case float_round_to_zero
:
2040 case float_round_up
:
2041 inc
= a
.sign
? 0 : rnd_mask
;
2043 case float_round_down
:
2044 inc
= a
.sign
? rnd_mask
: 0;
2046 case float_round_to_odd
:
2047 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2050 g_assert_not_reached();
2053 if (a
.frac
& rnd_mask
) {
2054 s
->float_exception_flags
|= float_flag_inexact
;
2056 a
.frac
&= ~rnd_mask
;
2057 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
2065 g_assert_not_reached();
2070 float16
float16_round_to_int(float16 a
, float_status
*s
)
2072 FloatParts pa
= float16_unpack_canonical(a
, s
);
2073 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2074 return float16_round_pack_canonical(pr
, s
);
2077 float32
float32_round_to_int(float32 a
, float_status
*s
)
2079 FloatParts pa
= float32_unpack_canonical(a
, s
);
2080 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2081 return float32_round_pack_canonical(pr
, s
);
2084 float64
float64_round_to_int(float64 a
, float_status
*s
)
2086 FloatParts pa
= float64_unpack_canonical(a
, s
);
2087 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2088 return float64_round_pack_canonical(pr
, s
);
2092 * Returns the result of converting the floating-point value `a' to
2093 * the two's complement integer format. The conversion is performed
2094 * according to the IEC/IEEE Standard for Binary Floating-Point
2095 * Arithmetic---which means in particular that the conversion is
2096 * rounded according to the current rounding mode. If `a' is a NaN,
2097 * the largest positive integer is returned. Otherwise, if the
2098 * conversion overflows, the largest integer with the same sign as `a'
2102 static int64_t round_to_int_and_pack(FloatParts in
, int rmode
, int scale
,
2103 int64_t min
, int64_t max
,
2107 int orig_flags
= get_float_exception_flags(s
);
2108 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
2111 case float_class_snan
:
2112 case float_class_qnan
:
2113 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2115 case float_class_inf
:
2116 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2117 return p
.sign
? min
: max
;
2118 case float_class_zero
:
2120 case float_class_normal
:
2121 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
2122 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2123 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
2124 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
2129 if (r
<= -(uint64_t) min
) {
2132 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2139 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2144 g_assert_not_reached();
2148 int16_t float16_to_int16_scalbn(float16 a
, int rmode
, int scale
,
2151 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2152 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2155 int32_t float16_to_int32_scalbn(float16 a
, int rmode
, int scale
,
2158 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2159 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2162 int64_t float16_to_int64_scalbn(float16 a
, int rmode
, int scale
,
2165 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2166 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2169 int16_t float32_to_int16_scalbn(float32 a
, int rmode
, int scale
,
2172 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2173 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2176 int32_t float32_to_int32_scalbn(float32 a
, int rmode
, int scale
,
2179 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2180 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2183 int64_t float32_to_int64_scalbn(float32 a
, int rmode
, int scale
,
2186 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2187 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2190 int16_t float64_to_int16_scalbn(float64 a
, int rmode
, int scale
,
2193 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2194 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2197 int32_t float64_to_int32_scalbn(float64 a
, int rmode
, int scale
,
2200 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2201 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2204 int64_t float64_to_int64_scalbn(float64 a
, int rmode
, int scale
,
2207 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2208 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2211 int16_t float16_to_int16(float16 a
, float_status
*s
)
2213 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2216 int32_t float16_to_int32(float16 a
, float_status
*s
)
2218 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2221 int64_t float16_to_int64(float16 a
, float_status
*s
)
2223 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2226 int16_t float32_to_int16(float32 a
, float_status
*s
)
2228 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2231 int32_t float32_to_int32(float32 a
, float_status
*s
)
2233 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2236 int64_t float32_to_int64(float32 a
, float_status
*s
)
2238 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2241 int16_t float64_to_int16(float64 a
, float_status
*s
)
2243 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2246 int32_t float64_to_int32(float64 a
, float_status
*s
)
2248 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2251 int64_t float64_to_int64(float64 a
, float_status
*s
)
2253 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2256 int16_t float16_to_int16_round_to_zero(float16 a
, float_status
*s
)
2258 return float16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2261 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
2263 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2266 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
2268 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2271 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
2273 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2276 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
2278 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2281 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
2283 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2286 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
2288 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2291 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
2293 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2296 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
2298 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2302 * Returns the result of converting the floating-point value `a' to
2303 * the unsigned integer format. The conversion is performed according
2304 * to the IEC/IEEE Standard for Binary Floating-Point
2305 * Arithmetic---which means in particular that the conversion is
2306 * rounded according to the current rounding mode. If `a' is a NaN,
2307 * the largest unsigned integer is returned. Otherwise, if the
2308 * conversion overflows, the largest unsigned integer is returned. If
2309 * the 'a' is negative, the result is rounded and zero is returned;
2310 * values that do not round to zero will raise the inexact exception
2314 static uint64_t round_to_uint_and_pack(FloatParts in
, int rmode
, int scale
,
2315 uint64_t max
, float_status
*s
)
2317 int orig_flags
= get_float_exception_flags(s
);
2318 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
2322 case float_class_snan
:
2323 case float_class_qnan
:
2324 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2326 case float_class_inf
:
2327 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2328 return p
.sign
? 0 : max
;
2329 case float_class_zero
:
2331 case float_class_normal
:
2333 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2337 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
2338 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2339 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
2340 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
2342 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2346 /* For uint64 this will never trip, but if p.exp is too large
2347 * to shift a decomposed fraction we shall have exited via the
2351 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2356 g_assert_not_reached();
2360 uint16_t float16_to_uint16_scalbn(float16 a
, int rmode
, int scale
,
2363 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2364 rmode
, scale
, UINT16_MAX
, s
);
2367 uint32_t float16_to_uint32_scalbn(float16 a
, int rmode
, int scale
,
2370 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2371 rmode
, scale
, UINT32_MAX
, s
);
2374 uint64_t float16_to_uint64_scalbn(float16 a
, int rmode
, int scale
,
2377 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2378 rmode
, scale
, UINT64_MAX
, s
);
2381 uint16_t float32_to_uint16_scalbn(float32 a
, int rmode
, int scale
,
2384 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2385 rmode
, scale
, UINT16_MAX
, s
);
2388 uint32_t float32_to_uint32_scalbn(float32 a
, int rmode
, int scale
,
2391 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2392 rmode
, scale
, UINT32_MAX
, s
);
2395 uint64_t float32_to_uint64_scalbn(float32 a
, int rmode
, int scale
,
2398 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2399 rmode
, scale
, UINT64_MAX
, s
);
2402 uint16_t float64_to_uint16_scalbn(float64 a
, int rmode
, int scale
,
2405 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2406 rmode
, scale
, UINT16_MAX
, s
);
2409 uint32_t float64_to_uint32_scalbn(float64 a
, int rmode
, int scale
,
2412 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2413 rmode
, scale
, UINT32_MAX
, s
);
2416 uint64_t float64_to_uint64_scalbn(float64 a
, int rmode
, int scale
,
2419 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2420 rmode
, scale
, UINT64_MAX
, s
);
2423 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2425 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2428 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2430 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2433 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2435 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2438 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2440 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2443 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2445 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2448 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2450 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2453 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2455 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2458 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2460 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2463 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2465 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2468 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
2470 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2473 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
2475 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2478 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
2480 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2483 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
2485 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2488 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
2490 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2493 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
2495 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2498 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
2500 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2503 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
2505 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2508 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
2510 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2514 * Integer to float conversions
2516 * Returns the result of converting the two's complement integer `a'
2517 * to the floating-point format. The conversion is performed according
2518 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2521 static FloatParts
int_to_float(int64_t a
, int scale
, float_status
*status
)
2523 FloatParts r
= { .sign
= false };
2526 r
.cls
= float_class_zero
;
2531 r
.cls
= float_class_normal
;
2536 shift
= clz64(f
) - 1;
2537 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2539 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2540 r
.frac
= (shift
< 0 ? DECOMPOSED_IMPLICIT_BIT
: f
<< shift
);
2546 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
2548 FloatParts pa
= int_to_float(a
, scale
, status
);
2549 return float16_round_pack_canonical(pa
, status
);
2552 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
2554 return int64_to_float16_scalbn(a
, scale
, status
);
2557 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
2559 return int64_to_float16_scalbn(a
, scale
, status
);
2562 float16
int64_to_float16(int64_t a
, float_status
*status
)
2564 return int64_to_float16_scalbn(a
, 0, status
);
2567 float16
int32_to_float16(int32_t a
, float_status
*status
)
2569 return int64_to_float16_scalbn(a
, 0, status
);
2572 float16
int16_to_float16(int16_t a
, float_status
*status
)
2574 return int64_to_float16_scalbn(a
, 0, status
);
2577 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
2579 FloatParts pa
= int_to_float(a
, scale
, status
);
2580 return float32_round_pack_canonical(pa
, status
);
2583 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
2585 return int64_to_float32_scalbn(a
, scale
, status
);
2588 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
2590 return int64_to_float32_scalbn(a
, scale
, status
);
2593 float32
int64_to_float32(int64_t a
, float_status
*status
)
2595 return int64_to_float32_scalbn(a
, 0, status
);
2598 float32
int32_to_float32(int32_t a
, float_status
*status
)
2600 return int64_to_float32_scalbn(a
, 0, status
);
2603 float32
int16_to_float32(int16_t a
, float_status
*status
)
2605 return int64_to_float32_scalbn(a
, 0, status
);
2608 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
2610 FloatParts pa
= int_to_float(a
, scale
, status
);
2611 return float64_round_pack_canonical(pa
, status
);
2614 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
2616 return int64_to_float64_scalbn(a
, scale
, status
);
2619 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
2621 return int64_to_float64_scalbn(a
, scale
, status
);
2624 float64
int64_to_float64(int64_t a
, float_status
*status
)
2626 return int64_to_float64_scalbn(a
, 0, status
);
2629 float64
int32_to_float64(int32_t a
, float_status
*status
)
2631 return int64_to_float64_scalbn(a
, 0, status
);
2634 float64
int16_to_float64(int16_t a
, float_status
*status
)
2636 return int64_to_float64_scalbn(a
, 0, status
);
2641 * Unsigned Integer to float conversions
2643 * Returns the result of converting the unsigned integer `a' to the
2644 * floating-point format. The conversion is performed according to the
2645 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2648 static FloatParts
uint_to_float(uint64_t a
, int scale
, float_status
*status
)
2650 FloatParts r
= { .sign
= false };
2653 r
.cls
= float_class_zero
;
2655 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2656 r
.cls
= float_class_normal
;
2657 if ((int64_t)a
< 0) {
2658 r
.exp
= DECOMPOSED_BINARY_POINT
+ 1 + scale
;
2659 shift64RightJamming(a
, 1, &a
);
2662 int shift
= clz64(a
) - 1;
2663 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2664 r
.frac
= a
<< shift
;
2671 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
2673 FloatParts pa
= uint_to_float(a
, scale
, status
);
2674 return float16_round_pack_canonical(pa
, status
);
2677 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
2679 return uint64_to_float16_scalbn(a
, scale
, status
);
2682 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
2684 return uint64_to_float16_scalbn(a
, scale
, status
);
2687 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
2689 return uint64_to_float16_scalbn(a
, 0, status
);
2692 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
2694 return uint64_to_float16_scalbn(a
, 0, status
);
2697 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
2699 return uint64_to_float16_scalbn(a
, 0, status
);
2702 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
2704 FloatParts pa
= uint_to_float(a
, scale
, status
);
2705 return float32_round_pack_canonical(pa
, status
);
2708 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
2710 return uint64_to_float32_scalbn(a
, scale
, status
);
2713 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
2715 return uint64_to_float32_scalbn(a
, scale
, status
);
2718 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
2720 return uint64_to_float32_scalbn(a
, 0, status
);
2723 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
2725 return uint64_to_float32_scalbn(a
, 0, status
);
2728 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
2730 return uint64_to_float32_scalbn(a
, 0, status
);
2733 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
2735 FloatParts pa
= uint_to_float(a
, scale
, status
);
2736 return float64_round_pack_canonical(pa
, status
);
2739 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
2741 return uint64_to_float64_scalbn(a
, scale
, status
);
2744 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
2746 return uint64_to_float64_scalbn(a
, scale
, status
);
2749 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
2751 return uint64_to_float64_scalbn(a
, 0, status
);
2754 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
2756 return uint64_to_float64_scalbn(a
, 0, status
);
2759 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
2761 return uint64_to_float64_scalbn(a
, 0, status
);
2765 /* min() and max() functions. These can't be implemented as
2766 * 'compare and pick one input' because that would mishandle
2767 * NaNs and +0 vs -0.
2769 * minnum() and maxnum() functions. These are similar to the min()
2770 * and max() functions but if one of the arguments is a QNaN and
2771 * the other is numerical then the numerical argument is returned.
2772 * SNaNs will get quietened before being returned.
2773 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2774 * and maxNum() operations. min() and max() are the typical min/max
2775 * semantics provided by many CPUs which predate that specification.
2777 * minnummag() and maxnummag() functions correspond to minNumMag()
2778 * and minNumMag() from the IEEE-754 2008.
2780 static FloatParts
minmax_floats(FloatParts a
, FloatParts b
, bool ismin
,
2781 bool ieee
, bool ismag
, float_status
*s
)
2783 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
2785 /* Takes two floating-point values `a' and `b', one of
2786 * which is a NaN, and returns the appropriate NaN
2787 * result. If either `a' or `b' is a signaling NaN,
2788 * the invalid exception is raised.
2790 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
2791 return pick_nan(a
, b
, s
);
2792 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
2794 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
2798 return pick_nan(a
, b
, s
);
2803 case float_class_normal
:
2806 case float_class_inf
:
2809 case float_class_zero
:
2813 g_assert_not_reached();
2817 case float_class_normal
:
2820 case float_class_inf
:
2823 case float_class_zero
:
2827 g_assert_not_reached();
2831 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
2832 bool a_less
= a_exp
< b_exp
;
2833 if (a_exp
== b_exp
) {
2834 a_less
= a
.frac
< b
.frac
;
2836 return a_less
^ ismin
? b
: a
;
2839 if (a
.sign
== b
.sign
) {
2840 bool a_less
= a_exp
< b_exp
;
2841 if (a_exp
== b_exp
) {
2842 a_less
= a
.frac
< b
.frac
;
2844 return a
.sign
^ a_less
^ ismin
? b
: a
;
2846 return a
.sign
^ ismin
? b
: a
;
2851 #define MINMAX(sz, name, ismin, isiee, ismag) \
2852 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
2855 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2856 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2857 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
2859 return float ## sz ## _round_pack_canonical(pr, s); \
2862 MINMAX(16, min
, true, false, false)
2863 MINMAX(16, minnum
, true, true, false)
2864 MINMAX(16, minnummag
, true, true, true)
2865 MINMAX(16, max
, false, false, false)
2866 MINMAX(16, maxnum
, false, true, false)
2867 MINMAX(16, maxnummag
, false, true, true)
2869 MINMAX(32, min
, true, false, false)
2870 MINMAX(32, minnum
, true, true, false)
2871 MINMAX(32, minnummag
, true, true, true)
2872 MINMAX(32, max
, false, false, false)
2873 MINMAX(32, maxnum
, false, true, false)
2874 MINMAX(32, maxnummag
, false, true, true)
2876 MINMAX(64, min
, true, false, false)
2877 MINMAX(64, minnum
, true, true, false)
2878 MINMAX(64, minnummag
, true, true, true)
2879 MINMAX(64, max
, false, false, false)
2880 MINMAX(64, maxnum
, false, true, false)
2881 MINMAX(64, maxnummag
, false, true, true)
2885 /* Floating point compare */
2886 static int compare_floats(FloatParts a
, FloatParts b
, bool is_quiet
,
2889 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
2891 a
.cls
== float_class_snan
||
2892 b
.cls
== float_class_snan
) {
2893 s
->float_exception_flags
|= float_flag_invalid
;
2895 return float_relation_unordered
;
2898 if (a
.cls
== float_class_zero
) {
2899 if (b
.cls
== float_class_zero
) {
2900 return float_relation_equal
;
2902 return b
.sign
? float_relation_greater
: float_relation_less
;
2903 } else if (b
.cls
== float_class_zero
) {
2904 return a
.sign
? float_relation_less
: float_relation_greater
;
2907 /* The only really important thing about infinity is its sign. If
2908 * both are infinities the sign marks the smallest of the two.
2910 if (a
.cls
== float_class_inf
) {
2911 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
2912 return float_relation_equal
;
2914 return a
.sign
? float_relation_less
: float_relation_greater
;
2915 } else if (b
.cls
== float_class_inf
) {
2916 return b
.sign
? float_relation_greater
: float_relation_less
;
2919 if (a
.sign
!= b
.sign
) {
2920 return a
.sign
? float_relation_less
: float_relation_greater
;
2923 if (a
.exp
== b
.exp
) {
2924 if (a
.frac
== b
.frac
) {
2925 return float_relation_equal
;
2928 return a
.frac
> b
.frac
?
2929 float_relation_less
: float_relation_greater
;
2931 return a
.frac
> b
.frac
?
2932 float_relation_greater
: float_relation_less
;
2936 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
2938 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
2943 #define COMPARE(name, attr, sz) \
2945 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
2947 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2948 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2949 return compare_floats(pa, pb, is_quiet, s); \
2952 COMPARE(soft_f16_compare
, QEMU_FLATTEN
, 16)
2953 COMPARE(soft_f32_compare
, QEMU_SOFTFLOAT_ATTR
, 32)
2954 COMPARE(soft_f64_compare
, QEMU_SOFTFLOAT_ATTR
, 64)
2958 int float16_compare(float16 a
, float16 b
, float_status
*s
)
2960 return soft_f16_compare(a
, b
, false, s
);
2963 int float16_compare_quiet(float16 a
, float16 b
, float_status
*s
)
2965 return soft_f16_compare(a
, b
, true, s
);
2968 static int QEMU_FLATTEN
2969 f32_compare(float32 xa
, float32 xb
, bool is_quiet
, float_status
*s
)
2971 union_float32 ua
, ub
;
2976 if (QEMU_NO_HARDFLOAT
) {
2980 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
2981 if (isgreaterequal(ua
.h
, ub
.h
)) {
2982 if (isgreater(ua
.h
, ub
.h
)) {
2983 return float_relation_greater
;
2985 return float_relation_equal
;
2987 if (likely(isless(ua
.h
, ub
.h
))) {
2988 return float_relation_less
;
2990 /* The only condition remaining is unordered.
2991 * Fall through to set flags.
2994 return soft_f32_compare(ua
.s
, ub
.s
, is_quiet
, s
);
2997 int float32_compare(float32 a
, float32 b
, float_status
*s
)
2999 return f32_compare(a
, b
, false, s
);
3002 int float32_compare_quiet(float32 a
, float32 b
, float_status
*s
)
3004 return f32_compare(a
, b
, true, s
);
3007 static int QEMU_FLATTEN
3008 f64_compare(float64 xa
, float64 xb
, bool is_quiet
, float_status
*s
)
3010 union_float64 ua
, ub
;
3015 if (QEMU_NO_HARDFLOAT
) {
3019 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
3020 if (isgreaterequal(ua
.h
, ub
.h
)) {
3021 if (isgreater(ua
.h
, ub
.h
)) {
3022 return float_relation_greater
;
3024 return float_relation_equal
;
3026 if (likely(isless(ua
.h
, ub
.h
))) {
3027 return float_relation_less
;
3029 /* The only condition remaining is unordered.
3030 * Fall through to set flags.
3033 return soft_f64_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3036 int float64_compare(float64 a
, float64 b
, float_status
*s
)
3038 return f64_compare(a
, b
, false, s
);
3041 int float64_compare_quiet(float64 a
, float64 b
, float_status
*s
)
3043 return f64_compare(a
, b
, true, s
);
3046 /* Multiply A by 2 raised to the power N. */
3047 static FloatParts
scalbn_decomposed(FloatParts a
, int n
, float_status
*s
)
3049 if (unlikely(is_nan(a
.cls
))) {
3050 return return_nan(a
, s
);
3052 if (a
.cls
== float_class_normal
) {
3053 /* The largest float type (even though not supported by FloatParts)
3054 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3055 * still allows rounding to infinity, without allowing overflow
3056 * within the int32_t that backs FloatParts.exp.
3058 n
= MIN(MAX(n
, -0x10000), 0x10000);
3064 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
3066 FloatParts pa
= float16_unpack_canonical(a
, status
);
3067 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3068 return float16_round_pack_canonical(pr
, status
);
3071 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
3073 FloatParts pa
= float32_unpack_canonical(a
, status
);
3074 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3075 return float32_round_pack_canonical(pr
, status
);
3078 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
3080 FloatParts pa
= float64_unpack_canonical(a
, status
);
3081 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3082 return float64_round_pack_canonical(pr
, status
);
3088 * The old softfloat code did an approximation step before zeroing in
3089 * on the final result. However for simpleness we just compute the
3090 * square root by iterating down from the implicit bit to enough extra
3091 * bits to ensure we get a correctly rounded result.
3093 * This does mean however the calculation is slower than before,
3094 * especially for 64 bit floats.
3097 static FloatParts
sqrt_float(FloatParts a
, float_status
*s
, const FloatFmt
*p
)
3099 uint64_t a_frac
, r_frac
, s_frac
;
3102 if (is_nan(a
.cls
)) {
3103 return return_nan(a
, s
);
3105 if (a
.cls
== float_class_zero
) {
3106 return a
; /* sqrt(+-0) = +-0 */
3109 s
->float_exception_flags
|= float_flag_invalid
;
3110 return parts_default_nan(s
);
3112 if (a
.cls
== float_class_inf
) {
3113 return a
; /* sqrt(+inf) = +inf */
3116 assert(a
.cls
== float_class_normal
);
3118 /* We need two overflow bits at the top. Adding room for that is a
3119 * right shift. If the exponent is odd, we can discard the low bit
3120 * by multiplying the fraction by 2; that's a left shift. Combine
3121 * those and we shift right if the exponent is even.
3129 /* Bit-by-bit computation of sqrt. */
3133 /* Iterate from implicit bit down to the 3 extra bits to compute a
3134 * properly rounded result. Remember we've inserted one more bit
3135 * at the top, so these positions are one less.
3137 bit
= DECOMPOSED_BINARY_POINT
- 1;
3138 last_bit
= MAX(p
->frac_shift
- 4, 0);
3140 uint64_t q
= 1ULL << bit
;
3141 uint64_t t_frac
= s_frac
+ q
;
3142 if (t_frac
<= a_frac
) {
3143 s_frac
= t_frac
+ q
;
3148 } while (--bit
>= last_bit
);
3150 /* Undo the right shift done above. If there is any remaining
3151 * fraction, the result is inexact. Set the sticky bit.
3153 a
.frac
= (r_frac
<< 1) + (a_frac
!= 0);
3158 float16 QEMU_FLATTEN
float16_sqrt(float16 a
, float_status
*status
)
3160 FloatParts pa
= float16_unpack_canonical(a
, status
);
3161 FloatParts pr
= sqrt_float(pa
, status
, &float16_params
);
3162 return float16_round_pack_canonical(pr
, status
);
3165 static float32 QEMU_SOFTFLOAT_ATTR
3166 soft_f32_sqrt(float32 a
, float_status
*status
)
3168 FloatParts pa
= float32_unpack_canonical(a
, status
);
3169 FloatParts pr
= sqrt_float(pa
, status
, &float32_params
);
3170 return float32_round_pack_canonical(pr
, status
);
3173 static float64 QEMU_SOFTFLOAT_ATTR
3174 soft_f64_sqrt(float64 a
, float_status
*status
)
3176 FloatParts pa
= float64_unpack_canonical(a
, status
);
3177 FloatParts pr
= sqrt_float(pa
, status
, &float64_params
);
3178 return float64_round_pack_canonical(pr
, status
);
3181 float32 QEMU_FLATTEN
float32_sqrt(float32 xa
, float_status
*s
)
3183 union_float32 ua
, ur
;
3186 if (unlikely(!can_use_fpu(s
))) {
3190 float32_input_flush1(&ua
.s
, s
);
3191 if (QEMU_HARDFLOAT_1F32_USE_FP
) {
3192 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3193 fpclassify(ua
.h
) == FP_ZERO
) ||
3197 } else if (unlikely(!float32_is_zero_or_normal(ua
.s
) ||
3198 float32_is_neg(ua
.s
))) {
3205 return soft_f32_sqrt(ua
.s
, s
);
3208 float64 QEMU_FLATTEN
float64_sqrt(float64 xa
, float_status
*s
)
3210 union_float64 ua
, ur
;
3213 if (unlikely(!can_use_fpu(s
))) {
3217 float64_input_flush1(&ua
.s
, s
);
3218 if (QEMU_HARDFLOAT_1F64_USE_FP
) {
3219 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3220 fpclassify(ua
.h
) == FP_ZERO
) ||
3224 } else if (unlikely(!float64_is_zero_or_normal(ua
.s
) ||
3225 float64_is_neg(ua
.s
))) {
3232 return soft_f64_sqrt(ua
.s
, s
);
3235 /*----------------------------------------------------------------------------
3236 | The pattern for a default generated NaN.
3237 *----------------------------------------------------------------------------*/
3239 float16
float16_default_nan(float_status
*status
)
3241 FloatParts p
= parts_default_nan(status
);
3242 p
.frac
>>= float16_params
.frac_shift
;
3243 return float16_pack_raw(p
);
3246 float32
float32_default_nan(float_status
*status
)
3248 FloatParts p
= parts_default_nan(status
);
3249 p
.frac
>>= float32_params
.frac_shift
;
3250 return float32_pack_raw(p
);
3253 float64
float64_default_nan(float_status
*status
)
3255 FloatParts p
= parts_default_nan(status
);
3256 p
.frac
>>= float64_params
.frac_shift
;
3257 return float64_pack_raw(p
);
3260 float128
float128_default_nan(float_status
*status
)
3262 FloatParts p
= parts_default_nan(status
);
3265 /* Extrapolate from the choices made by parts_default_nan to fill
3266 * in the quad-floating format. If the low bit is set, assume we
3267 * want to set all non-snan bits.
3269 r
.low
= -(p
.frac
& 1);
3270 r
.high
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- 48);
3271 r
.high
|= UINT64_C(0x7FFF000000000000);
3272 r
.high
|= (uint64_t)p
.sign
<< 63;
3277 /*----------------------------------------------------------------------------
3278 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3279 *----------------------------------------------------------------------------*/
3281 float16
float16_silence_nan(float16 a
, float_status
*status
)
3283 FloatParts p
= float16_unpack_raw(a
);
3284 p
.frac
<<= float16_params
.frac_shift
;
3285 p
= parts_silence_nan(p
, status
);
3286 p
.frac
>>= float16_params
.frac_shift
;
3287 return float16_pack_raw(p
);
3290 float32
float32_silence_nan(float32 a
, float_status
*status
)
3292 FloatParts p
= float32_unpack_raw(a
);
3293 p
.frac
<<= float32_params
.frac_shift
;
3294 p
= parts_silence_nan(p
, status
);
3295 p
.frac
>>= float32_params
.frac_shift
;
3296 return float32_pack_raw(p
);
3299 float64
float64_silence_nan(float64 a
, float_status
*status
)
3301 FloatParts p
= float64_unpack_raw(a
);
3302 p
.frac
<<= float64_params
.frac_shift
;
3303 p
= parts_silence_nan(p
, status
);
3304 p
.frac
>>= float64_params
.frac_shift
;
3305 return float64_pack_raw(p
);
3309 /*----------------------------------------------------------------------------
3310 | If `a' is denormal and we are in flush-to-zero mode then set the
3311 | input-denormal exception and return zero. Otherwise just return the value.
3312 *----------------------------------------------------------------------------*/
3314 static bool parts_squash_denormal(FloatParts p
, float_status
*status
)
3316 if (p
.exp
== 0 && p
.frac
!= 0) {
3317 float_raise(float_flag_input_denormal
, status
);
3324 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3326 if (status
->flush_inputs_to_zero
) {
3327 FloatParts p
= float16_unpack_raw(a
);
3328 if (parts_squash_denormal(p
, status
)) {
3329 return float16_set_sign(float16_zero
, p
.sign
);
3335 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3337 if (status
->flush_inputs_to_zero
) {
3338 FloatParts p
= float32_unpack_raw(a
);
3339 if (parts_squash_denormal(p
, status
)) {
3340 return float32_set_sign(float32_zero
, p
.sign
);
3346 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3348 if (status
->flush_inputs_to_zero
) {
3349 FloatParts p
= float64_unpack_raw(a
);
3350 if (parts_squash_denormal(p
, status
)) {
3351 return float64_set_sign(float64_zero
, p
.sign
);
3357 /*----------------------------------------------------------------------------
3358 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3359 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3360 | input. If `zSign' is 1, the input is negated before being converted to an
3361 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3362 | is simply rounded to an integer, with the inexact exception raised if the
3363 | input cannot be represented exactly as an integer. However, if the fixed-
3364 | point input is too large, the invalid exception is raised and the largest
3365 | positive or negative integer is returned.
3366 *----------------------------------------------------------------------------*/
3368 static int32_t roundAndPackInt32(flag zSign
, uint64_t absZ
, float_status
*status
)
3370 int8_t roundingMode
;
3371 flag roundNearestEven
;
3372 int8_t roundIncrement
, roundBits
;
3375 roundingMode
= status
->float_rounding_mode
;
3376 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3377 switch (roundingMode
) {
3378 case float_round_nearest_even
:
3379 case float_round_ties_away
:
3380 roundIncrement
= 0x40;
3382 case float_round_to_zero
:
3385 case float_round_up
:
3386 roundIncrement
= zSign
? 0 : 0x7f;
3388 case float_round_down
:
3389 roundIncrement
= zSign
? 0x7f : 0;
3391 case float_round_to_odd
:
3392 roundIncrement
= absZ
& 0x80 ? 0 : 0x7f;
3397 roundBits
= absZ
& 0x7F;
3398 absZ
= ( absZ
+ roundIncrement
)>>7;
3399 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
3401 if ( zSign
) z
= - z
;
3402 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
3403 float_raise(float_flag_invalid
, status
);
3404 return zSign
? INT32_MIN
: INT32_MAX
;
3407 status
->float_exception_flags
|= float_flag_inexact
;
3413 /*----------------------------------------------------------------------------
3414 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3415 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3416 | and returns the properly rounded 64-bit integer corresponding to the input.
3417 | If `zSign' is 1, the input is negated before being converted to an integer.
3418 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3419 | the inexact exception raised if the input cannot be represented exactly as
3420 | an integer. However, if the fixed-point input is too large, the invalid
3421 | exception is raised and the largest positive or negative integer is
3423 *----------------------------------------------------------------------------*/
3425 static int64_t roundAndPackInt64(flag zSign
, uint64_t absZ0
, uint64_t absZ1
,
3426 float_status
*status
)
3428 int8_t roundingMode
;
3429 flag roundNearestEven
, increment
;
3432 roundingMode
= status
->float_rounding_mode
;
3433 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3434 switch (roundingMode
) {
3435 case float_round_nearest_even
:
3436 case float_round_ties_away
:
3437 increment
= ((int64_t) absZ1
< 0);
3439 case float_round_to_zero
:
3442 case float_round_up
:
3443 increment
= !zSign
&& absZ1
;
3445 case float_round_down
:
3446 increment
= zSign
&& absZ1
;
3448 case float_round_to_odd
:
3449 increment
= !(absZ0
& 1) && absZ1
;
3456 if ( absZ0
== 0 ) goto overflow
;
3457 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
3460 if ( zSign
) z
= - z
;
3461 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
3463 float_raise(float_flag_invalid
, status
);
3464 return zSign
? INT64_MIN
: INT64_MAX
;
3467 status
->float_exception_flags
|= float_flag_inexact
;
3473 /*----------------------------------------------------------------------------
3474 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3475 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3476 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3477 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3478 | with the inexact exception raised if the input cannot be represented exactly
3479 | as an integer. However, if the fixed-point input is too large, the invalid
3480 | exception is raised and the largest unsigned integer is returned.
3481 *----------------------------------------------------------------------------*/
3483 static int64_t roundAndPackUint64(flag zSign
, uint64_t absZ0
,
3484 uint64_t absZ1
, float_status
*status
)
3486 int8_t roundingMode
;
3487 flag roundNearestEven
, increment
;
3489 roundingMode
= status
->float_rounding_mode
;
3490 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
3491 switch (roundingMode
) {
3492 case float_round_nearest_even
:
3493 case float_round_ties_away
:
3494 increment
= ((int64_t)absZ1
< 0);
3496 case float_round_to_zero
:
3499 case float_round_up
:
3500 increment
= !zSign
&& absZ1
;
3502 case float_round_down
:
3503 increment
= zSign
&& absZ1
;
3505 case float_round_to_odd
:
3506 increment
= !(absZ0
& 1) && absZ1
;
3514 float_raise(float_flag_invalid
, status
);
3517 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
3520 if (zSign
&& absZ0
) {
3521 float_raise(float_flag_invalid
, status
);
3526 status
->float_exception_flags
|= float_flag_inexact
;
3531 /*----------------------------------------------------------------------------
3532 | Normalizes the subnormal single-precision floating-point value represented
3533 | by the denormalized significand `aSig'. The normalized exponent and
3534 | significand are stored at the locations pointed to by `zExpPtr' and
3535 | `zSigPtr', respectively.
3536 *----------------------------------------------------------------------------*/
3539 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
3543 shiftCount
= clz32(aSig
) - 8;
3544 *zSigPtr
= aSig
<<shiftCount
;
3545 *zExpPtr
= 1 - shiftCount
;
3549 /*----------------------------------------------------------------------------
3550 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3551 | and significand `zSig', and returns the proper single-precision floating-
3552 | point value corresponding to the abstract input. Ordinarily, the abstract
3553 | value is simply rounded and packed into the single-precision format, with
3554 | the inexact exception raised if the abstract input cannot be represented
3555 | exactly. However, if the abstract value is too large, the overflow and
3556 | inexact exceptions are raised and an infinity or maximal finite value is
3557 | returned. If the abstract value is too small, the input value is rounded to
3558 | a subnormal number, and the underflow and inexact exceptions are raised if
3559 | the abstract input cannot be represented exactly as a subnormal single-
3560 | precision floating-point number.
3561 | The input significand `zSig' has its binary point between bits 30
3562 | and 29, which is 7 bits to the left of the usual location. This shifted
3563 | significand must be normalized or smaller. If `zSig' is not normalized,
3564 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3565 | and it must not require rounding. In the usual case that `zSig' is
3566 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3567 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3568 | Binary Floating-Point Arithmetic.
3569 *----------------------------------------------------------------------------*/
3571 static float32
roundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
3572 float_status
*status
)
3574 int8_t roundingMode
;
3575 flag roundNearestEven
;
3576 int8_t roundIncrement
, roundBits
;
3579 roundingMode
= status
->float_rounding_mode
;
3580 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3581 switch (roundingMode
) {
3582 case float_round_nearest_even
:
3583 case float_round_ties_away
:
3584 roundIncrement
= 0x40;
3586 case float_round_to_zero
:
3589 case float_round_up
:
3590 roundIncrement
= zSign
? 0 : 0x7f;
3592 case float_round_down
:
3593 roundIncrement
= zSign
? 0x7f : 0;
3595 case float_round_to_odd
:
3596 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
3602 roundBits
= zSig
& 0x7F;
3603 if ( 0xFD <= (uint16_t) zExp
) {
3604 if ( ( 0xFD < zExp
)
3605 || ( ( zExp
== 0xFD )
3606 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
3608 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
3609 roundIncrement
!= 0;
3610 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3611 return packFloat32(zSign
, 0xFF, -!overflow_to_inf
);
3614 if (status
->flush_to_zero
) {
3615 float_raise(float_flag_output_denormal
, status
);
3616 return packFloat32(zSign
, 0, 0);
3619 (status
->float_detect_tininess
3620 == float_tininess_before_rounding
)
3622 || ( zSig
+ roundIncrement
< 0x80000000 );
3623 shift32RightJamming( zSig
, - zExp
, &zSig
);
3625 roundBits
= zSig
& 0x7F;
3626 if (isTiny
&& roundBits
) {
3627 float_raise(float_flag_underflow
, status
);
3629 if (roundingMode
== float_round_to_odd
) {
3631 * For round-to-odd case, the roundIncrement depends on
3632 * zSig which just changed.
3634 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
3639 status
->float_exception_flags
|= float_flag_inexact
;
3641 zSig
= ( zSig
+ roundIncrement
)>>7;
3642 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
3643 if ( zSig
== 0 ) zExp
= 0;
3644 return packFloat32( zSign
, zExp
, zSig
);
3648 /*----------------------------------------------------------------------------
3649 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3650 | and significand `zSig', and returns the proper single-precision floating-
3651 | point value corresponding to the abstract input. This routine is just like
3652 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3653 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3654 | floating-point exponent.
3655 *----------------------------------------------------------------------------*/
3658 normalizeRoundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
3659 float_status
*status
)
3663 shiftCount
= clz32(zSig
) - 1;
3664 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
3669 /*----------------------------------------------------------------------------
3670 | Normalizes the subnormal double-precision floating-point value represented
3671 | by the denormalized significand `aSig'. The normalized exponent and
3672 | significand are stored at the locations pointed to by `zExpPtr' and
3673 | `zSigPtr', respectively.
3674 *----------------------------------------------------------------------------*/
3677 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
3681 shiftCount
= clz64(aSig
) - 11;
3682 *zSigPtr
= aSig
<<shiftCount
;
3683 *zExpPtr
= 1 - shiftCount
;
3687 /*----------------------------------------------------------------------------
3688 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3689 | double-precision floating-point value, returning the result. After being
3690 | shifted into the proper positions, the three fields are simply added
3691 | together to form the result. This means that any integer portion of `zSig'
3692 | will be added into the exponent. Since a properly normalized significand
3693 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3694 | than the desired result exponent whenever `zSig' is a complete, normalized
3696 *----------------------------------------------------------------------------*/
3698 static inline float64
packFloat64(flag zSign
, int zExp
, uint64_t zSig
)
3701 return make_float64(
3702 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
3706 /*----------------------------------------------------------------------------
3707 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3708 | and significand `zSig', and returns the proper double-precision floating-
3709 | point value corresponding to the abstract input. Ordinarily, the abstract
3710 | value is simply rounded and packed into the double-precision format, with
3711 | the inexact exception raised if the abstract input cannot be represented
3712 | exactly. However, if the abstract value is too large, the overflow and
3713 | inexact exceptions are raised and an infinity or maximal finite value is
3714 | returned. If the abstract value is too small, the input value is rounded to
3715 | a subnormal number, and the underflow and inexact exceptions are raised if
3716 | the abstract input cannot be represented exactly as a subnormal double-
3717 | precision floating-point number.
3718 | The input significand `zSig' has its binary point between bits 62
3719 | and 61, which is 10 bits to the left of the usual location. This shifted
3720 | significand must be normalized or smaller. If `zSig' is not normalized,
3721 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3722 | and it must not require rounding. In the usual case that `zSig' is
3723 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3724 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3725 | Binary Floating-Point Arithmetic.
3726 *----------------------------------------------------------------------------*/
3728 static float64
roundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
3729 float_status
*status
)
3731 int8_t roundingMode
;
3732 flag roundNearestEven
;
3733 int roundIncrement
, roundBits
;
3736 roundingMode
= status
->float_rounding_mode
;
3737 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3738 switch (roundingMode
) {
3739 case float_round_nearest_even
:
3740 case float_round_ties_away
:
3741 roundIncrement
= 0x200;
3743 case float_round_to_zero
:
3746 case float_round_up
:
3747 roundIncrement
= zSign
? 0 : 0x3ff;
3749 case float_round_down
:
3750 roundIncrement
= zSign
? 0x3ff : 0;
3752 case float_round_to_odd
:
3753 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
3758 roundBits
= zSig
& 0x3FF;
3759 if ( 0x7FD <= (uint16_t) zExp
) {
3760 if ( ( 0x7FD < zExp
)
3761 || ( ( zExp
== 0x7FD )
3762 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
3764 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
3765 roundIncrement
!= 0;
3766 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3767 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
3770 if (status
->flush_to_zero
) {
3771 float_raise(float_flag_output_denormal
, status
);
3772 return packFloat64(zSign
, 0, 0);
3775 (status
->float_detect_tininess
3776 == float_tininess_before_rounding
)
3778 || ( zSig
+ roundIncrement
< UINT64_C(0x8000000000000000) );
3779 shift64RightJamming( zSig
, - zExp
, &zSig
);
3781 roundBits
= zSig
& 0x3FF;
3782 if (isTiny
&& roundBits
) {
3783 float_raise(float_flag_underflow
, status
);
3785 if (roundingMode
== float_round_to_odd
) {
3787 * For round-to-odd case, the roundIncrement depends on
3788 * zSig which just changed.
3790 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
3795 status
->float_exception_flags
|= float_flag_inexact
;
3797 zSig
= ( zSig
+ roundIncrement
)>>10;
3798 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
3799 if ( zSig
== 0 ) zExp
= 0;
3800 return packFloat64( zSign
, zExp
, zSig
);
3804 /*----------------------------------------------------------------------------
3805 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3806 | and significand `zSig', and returns the proper double-precision floating-
3807 | point value corresponding to the abstract input. This routine is just like
3808 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3809 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3810 | floating-point exponent.
3811 *----------------------------------------------------------------------------*/
3814 normalizeRoundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
3815 float_status
*status
)
3819 shiftCount
= clz64(zSig
) - 1;
3820 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
3825 /*----------------------------------------------------------------------------
3826 | Normalizes the subnormal extended double-precision floating-point value
3827 | represented by the denormalized significand `aSig'. The normalized exponent
3828 | and significand are stored at the locations pointed to by `zExpPtr' and
3829 | `zSigPtr', respectively.
3830 *----------------------------------------------------------------------------*/
3832 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
3837 shiftCount
= clz64(aSig
);
3838 *zSigPtr
= aSig
<<shiftCount
;
3839 *zExpPtr
= 1 - shiftCount
;
3842 /*----------------------------------------------------------------------------
3843 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3844 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3845 | and returns the proper extended double-precision floating-point value
3846 | corresponding to the abstract input. Ordinarily, the abstract value is
3847 | rounded and packed into the extended double-precision format, with the
3848 | inexact exception raised if the abstract input cannot be represented
3849 | exactly. However, if the abstract value is too large, the overflow and
3850 | inexact exceptions are raised and an infinity or maximal finite value is
3851 | returned. If the abstract value is too small, the input value is rounded to
3852 | a subnormal number, and the underflow and inexact exceptions are raised if
3853 | the abstract input cannot be represented exactly as a subnormal extended
3854 | double-precision floating-point number.
3855 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
3856 | number of bits as single or double precision, respectively. Otherwise, the
3857 | result is rounded to the full precision of the extended double-precision
3859 | The input significand must be normalized or smaller. If the input
3860 | significand is not normalized, `zExp' must be 0; in that case, the result
3861 | returned is a subnormal number, and it must not require rounding. The
3862 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3863 | Floating-Point Arithmetic.
3864 *----------------------------------------------------------------------------*/
3866 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, flag zSign
,
3867 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
3868 float_status
*status
)
3870 int8_t roundingMode
;
3871 flag roundNearestEven
, increment
, isTiny
;
3872 int64_t roundIncrement
, roundMask
, roundBits
;
3874 roundingMode
= status
->float_rounding_mode
;
3875 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3876 if ( roundingPrecision
== 80 ) goto precision80
;
3877 if ( roundingPrecision
== 64 ) {
3878 roundIncrement
= UINT64_C(0x0000000000000400);
3879 roundMask
= UINT64_C(0x00000000000007FF);
3881 else if ( roundingPrecision
== 32 ) {
3882 roundIncrement
= UINT64_C(0x0000008000000000);
3883 roundMask
= UINT64_C(0x000000FFFFFFFFFF);
3888 zSig0
|= ( zSig1
!= 0 );
3889 switch (roundingMode
) {
3890 case float_round_nearest_even
:
3891 case float_round_ties_away
:
3893 case float_round_to_zero
:
3896 case float_round_up
:
3897 roundIncrement
= zSign
? 0 : roundMask
;
3899 case float_round_down
:
3900 roundIncrement
= zSign
? roundMask
: 0;
3905 roundBits
= zSig0
& roundMask
;
3906 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
3907 if ( ( 0x7FFE < zExp
)
3908 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
3913 if (status
->flush_to_zero
) {
3914 float_raise(float_flag_output_denormal
, status
);
3915 return packFloatx80(zSign
, 0, 0);
3918 (status
->float_detect_tininess
3919 == float_tininess_before_rounding
)
3921 || ( zSig0
<= zSig0
+ roundIncrement
);
3922 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
3924 roundBits
= zSig0
& roundMask
;
3925 if (isTiny
&& roundBits
) {
3926 float_raise(float_flag_underflow
, status
);
3929 status
->float_exception_flags
|= float_flag_inexact
;
3931 zSig0
+= roundIncrement
;
3932 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
3933 roundIncrement
= roundMask
+ 1;
3934 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
3935 roundMask
|= roundIncrement
;
3937 zSig0
&= ~ roundMask
;
3938 return packFloatx80( zSign
, zExp
, zSig0
);
3942 status
->float_exception_flags
|= float_flag_inexact
;
3944 zSig0
+= roundIncrement
;
3945 if ( zSig0
< roundIncrement
) {
3947 zSig0
= UINT64_C(0x8000000000000000);
3949 roundIncrement
= roundMask
+ 1;
3950 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
3951 roundMask
|= roundIncrement
;
3953 zSig0
&= ~ roundMask
;
3954 if ( zSig0
== 0 ) zExp
= 0;
3955 return packFloatx80( zSign
, zExp
, zSig0
);
3957 switch (roundingMode
) {
3958 case float_round_nearest_even
:
3959 case float_round_ties_away
:
3960 increment
= ((int64_t)zSig1
< 0);
3962 case float_round_to_zero
:
3965 case float_round_up
:
3966 increment
= !zSign
&& zSig1
;
3968 case float_round_down
:
3969 increment
= zSign
&& zSig1
;
3974 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
3975 if ( ( 0x7FFE < zExp
)
3976 || ( ( zExp
== 0x7FFE )
3977 && ( zSig0
== UINT64_C(0xFFFFFFFFFFFFFFFF) )
3983 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3984 if ( ( roundingMode
== float_round_to_zero
)
3985 || ( zSign
&& ( roundingMode
== float_round_up
) )
3986 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
3988 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
3990 return packFloatx80(zSign
,
3991 floatx80_infinity_high
,
3992 floatx80_infinity_low
);
3996 (status
->float_detect_tininess
3997 == float_tininess_before_rounding
)
4000 || ( zSig0
< UINT64_C(0xFFFFFFFFFFFFFFFF) );
4001 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
4003 if (isTiny
&& zSig1
) {
4004 float_raise(float_flag_underflow
, status
);
4007 status
->float_exception_flags
|= float_flag_inexact
;
4009 switch (roundingMode
) {
4010 case float_round_nearest_even
:
4011 case float_round_ties_away
:
4012 increment
= ((int64_t)zSig1
< 0);
4014 case float_round_to_zero
:
4017 case float_round_up
:
4018 increment
= !zSign
&& zSig1
;
4020 case float_round_down
:
4021 increment
= zSign
&& zSig1
;
4029 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
4030 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4032 return packFloatx80( zSign
, zExp
, zSig0
);
4036 status
->float_exception_flags
|= float_flag_inexact
;
4042 zSig0
= UINT64_C(0x8000000000000000);
4045 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
4049 if ( zSig0
== 0 ) zExp
= 0;
4051 return packFloatx80( zSign
, zExp
, zSig0
);
4055 /*----------------------------------------------------------------------------
4056 | Takes an abstract floating-point value having sign `zSign', exponent
4057 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4058 | and returns the proper extended double-precision floating-point value
4059 | corresponding to the abstract input. This routine is just like
4060 | `roundAndPackFloatx80' except that the input significand does not have to be
4062 *----------------------------------------------------------------------------*/
4064 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
4065 flag zSign
, int32_t zExp
,
4066 uint64_t zSig0
, uint64_t zSig1
,
4067 float_status
*status
)
4076 shiftCount
= clz64(zSig0
);
4077 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4079 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4080 zSig0
, zSig1
, status
);
4084 /*----------------------------------------------------------------------------
4085 | Returns the least-significant 64 fraction bits of the quadruple-precision
4086 | floating-point value `a'.
4087 *----------------------------------------------------------------------------*/
4089 static inline uint64_t extractFloat128Frac1( float128 a
)
4096 /*----------------------------------------------------------------------------
4097 | Returns the most-significant 48 fraction bits of the quadruple-precision
4098 | floating-point value `a'.
4099 *----------------------------------------------------------------------------*/
4101 static inline uint64_t extractFloat128Frac0( float128 a
)
4104 return a
.high
& UINT64_C(0x0000FFFFFFFFFFFF);
4108 /*----------------------------------------------------------------------------
4109 | Returns the exponent bits of the quadruple-precision floating-point value
4111 *----------------------------------------------------------------------------*/
4113 static inline int32_t extractFloat128Exp( float128 a
)
4116 return ( a
.high
>>48 ) & 0x7FFF;
4120 /*----------------------------------------------------------------------------
4121 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4122 *----------------------------------------------------------------------------*/
4124 static inline flag
extractFloat128Sign( float128 a
)
4131 /*----------------------------------------------------------------------------
4132 | Normalizes the subnormal quadruple-precision floating-point value
4133 | represented by the denormalized significand formed by the concatenation of
4134 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4135 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4136 | significand are stored at the location pointed to by `zSig0Ptr', and the
4137 | least significant 64 bits of the normalized significand are stored at the
4138 | location pointed to by `zSig1Ptr'.
4139 *----------------------------------------------------------------------------*/
4142 normalizeFloat128Subnormal(
4153 shiftCount
= clz64(aSig1
) - 15;
4154 if ( shiftCount
< 0 ) {
4155 *zSig0Ptr
= aSig1
>>( - shiftCount
);
4156 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
4159 *zSig0Ptr
= aSig1
<<shiftCount
;
4162 *zExpPtr
= - shiftCount
- 63;
4165 shiftCount
= clz64(aSig0
) - 15;
4166 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
4167 *zExpPtr
= 1 - shiftCount
;
4172 /*----------------------------------------------------------------------------
4173 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4174 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4175 | floating-point value, returning the result. After being shifted into the
4176 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4177 | added together to form the most significant 32 bits of the result. This
4178 | means that any integer portion of `zSig0' will be added into the exponent.
4179 | Since a properly normalized significand will have an integer portion equal
4180 | to 1, the `zExp' input should be 1 less than the desired result exponent
4181 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4183 *----------------------------------------------------------------------------*/
4185 static inline float128
4186 packFloat128( flag zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4191 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
4196 /*----------------------------------------------------------------------------
4197 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4198 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4199 | and `zSig2', and returns the proper quadruple-precision floating-point value
4200 | corresponding to the abstract input. Ordinarily, the abstract value is
4201 | simply rounded and packed into the quadruple-precision format, with the
4202 | inexact exception raised if the abstract input cannot be represented
4203 | exactly. However, if the abstract value is too large, the overflow and
4204 | inexact exceptions are raised and an infinity or maximal finite value is
4205 | returned. If the abstract value is too small, the input value is rounded to
4206 | a subnormal number, and the underflow and inexact exceptions are raised if
4207 | the abstract input cannot be represented exactly as a subnormal quadruple-
4208 | precision floating-point number.
4209 | The input significand must be normalized or smaller. If the input
4210 | significand is not normalized, `zExp' must be 0; in that case, the result
4211 | returned is a subnormal number, and it must not require rounding. In the
4212 | usual case that the input significand is normalized, `zExp' must be 1 less
4213 | than the ``true'' floating-point exponent. The handling of underflow and
4214 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4215 *----------------------------------------------------------------------------*/
4217 static float128
roundAndPackFloat128(flag zSign
, int32_t zExp
,
4218 uint64_t zSig0
, uint64_t zSig1
,
4219 uint64_t zSig2
, float_status
*status
)
4221 int8_t roundingMode
;
4222 flag roundNearestEven
, increment
, isTiny
;
4224 roundingMode
= status
->float_rounding_mode
;
4225 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4226 switch (roundingMode
) {
4227 case float_round_nearest_even
:
4228 case float_round_ties_away
:
4229 increment
= ((int64_t)zSig2
< 0);
4231 case float_round_to_zero
:
4234 case float_round_up
:
4235 increment
= !zSign
&& zSig2
;
4237 case float_round_down
:
4238 increment
= zSign
&& zSig2
;
4240 case float_round_to_odd
:
4241 increment
= !(zSig1
& 0x1) && zSig2
;
4246 if ( 0x7FFD <= (uint32_t) zExp
) {
4247 if ( ( 0x7FFD < zExp
)
4248 || ( ( zExp
== 0x7FFD )
4250 UINT64_C(0x0001FFFFFFFFFFFF),
4251 UINT64_C(0xFFFFFFFFFFFFFFFF),
4258 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4259 if ( ( roundingMode
== float_round_to_zero
)
4260 || ( zSign
&& ( roundingMode
== float_round_up
) )
4261 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4262 || (roundingMode
== float_round_to_odd
)
4268 UINT64_C(0x0000FFFFFFFFFFFF),
4269 UINT64_C(0xFFFFFFFFFFFFFFFF)
4272 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4275 if (status
->flush_to_zero
) {
4276 float_raise(float_flag_output_denormal
, status
);
4277 return packFloat128(zSign
, 0, 0, 0);
4280 (status
->float_detect_tininess
4281 == float_tininess_before_rounding
)
4287 UINT64_C(0x0001FFFFFFFFFFFF),
4288 UINT64_C(0xFFFFFFFFFFFFFFFF)
4290 shift128ExtraRightJamming(
4291 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4293 if (isTiny
&& zSig2
) {
4294 float_raise(float_flag_underflow
, status
);
4296 switch (roundingMode
) {
4297 case float_round_nearest_even
:
4298 case float_round_ties_away
:
4299 increment
= ((int64_t)zSig2
< 0);
4301 case float_round_to_zero
:
4304 case float_round_up
:
4305 increment
= !zSign
&& zSig2
;
4307 case float_round_down
:
4308 increment
= zSign
&& zSig2
;
4310 case float_round_to_odd
:
4311 increment
= !(zSig1
& 0x1) && zSig2
;
4319 status
->float_exception_flags
|= float_flag_inexact
;
4322 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4323 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
4326 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4328 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4332 /*----------------------------------------------------------------------------
4333 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4334 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4335 | returns the proper quadruple-precision floating-point value corresponding
4336 | to the abstract input. This routine is just like `roundAndPackFloat128'
4337 | except that the input significand has fewer bits and does not have to be
4338 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4340 *----------------------------------------------------------------------------*/
4342 static float128
normalizeRoundAndPackFloat128(flag zSign
, int32_t zExp
,
4343 uint64_t zSig0
, uint64_t zSig1
,
4344 float_status
*status
)
4354 shiftCount
= clz64(zSig0
) - 15;
4355 if ( 0 <= shiftCount
) {
4357 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4360 shift128ExtraRightJamming(
4361 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4364 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4369 /*----------------------------------------------------------------------------
4370 | Returns the result of converting the 32-bit two's complement integer `a'
4371 | to the extended double-precision floating-point format. The conversion
4372 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4374 *----------------------------------------------------------------------------*/
4376 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4383 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4385 absA
= zSign
? - a
: a
;
4386 shiftCount
= clz32(absA
) + 32;
4388 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4392 /*----------------------------------------------------------------------------
4393 | Returns the result of converting the 32-bit two's complement integer `a' to
4394 | the quadruple-precision floating-point format. The conversion is performed
4395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4396 *----------------------------------------------------------------------------*/
4398 float128
int32_to_float128(int32_t a
, float_status
*status
)
4405 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4407 absA
= zSign
? - a
: a
;
4408 shiftCount
= clz32(absA
) + 17;
4410 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4414 /*----------------------------------------------------------------------------
4415 | Returns the result of converting the 64-bit two's complement integer `a'
4416 | to the extended double-precision floating-point format. The conversion
4417 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4419 *----------------------------------------------------------------------------*/
4421 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4427 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4429 absA
= zSign
? - a
: a
;
4430 shiftCount
= clz64(absA
);
4431 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4435 /*----------------------------------------------------------------------------
4436 | Returns the result of converting the 64-bit two's complement integer `a' to
4437 | the quadruple-precision floating-point format. The conversion is performed
4438 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4439 *----------------------------------------------------------------------------*/
4441 float128
int64_to_float128(int64_t a
, float_status
*status
)
4447 uint64_t zSig0
, zSig1
;
4449 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4451 absA
= zSign
? - a
: a
;
4452 shiftCount
= clz64(absA
) + 49;
4453 zExp
= 0x406E - shiftCount
;
4454 if ( 64 <= shiftCount
) {
4463 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4464 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4468 /*----------------------------------------------------------------------------
4469 | Returns the result of converting the 64-bit unsigned integer `a'
4470 | to the quadruple-precision floating-point format. The conversion is performed
4471 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4472 *----------------------------------------------------------------------------*/
4474 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
4477 return float128_zero
;
4479 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
4482 /*----------------------------------------------------------------------------
4483 | Returns the result of converting the single-precision floating-point value
4484 | `a' to the extended double-precision floating-point format. The conversion
4485 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4487 *----------------------------------------------------------------------------*/
4489 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
4495 a
= float32_squash_input_denormal(a
, status
);
4496 aSig
= extractFloat32Frac( a
);
4497 aExp
= extractFloat32Exp( a
);
4498 aSign
= extractFloat32Sign( a
);
4499 if ( aExp
== 0xFF ) {
4501 return commonNaNToFloatx80(float32ToCommonNaN(a
, status
), status
);
4503 return packFloatx80(aSign
,
4504 floatx80_infinity_high
,
4505 floatx80_infinity_low
);
4508 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4509 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4512 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
4516 /*----------------------------------------------------------------------------
4517 | Returns the result of converting the single-precision floating-point value
4518 | `a' to the double-precision floating-point format. The conversion is
4519 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4521 *----------------------------------------------------------------------------*/
4523 float128
float32_to_float128(float32 a
, float_status
*status
)
4529 a
= float32_squash_input_denormal(a
, status
);
4530 aSig
= extractFloat32Frac( a
);
4531 aExp
= extractFloat32Exp( a
);
4532 aSign
= extractFloat32Sign( a
);
4533 if ( aExp
== 0xFF ) {
4535 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
4537 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4540 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4541 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4544 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
4548 /*----------------------------------------------------------------------------
4549 | Returns the remainder of the single-precision floating-point value `a'
4550 | with respect to the corresponding value `b'. The operation is performed
4551 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4552 *----------------------------------------------------------------------------*/
4554 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
4557 int aExp
, bExp
, expDiff
;
4558 uint32_t aSig
, bSig
;
4560 uint64_t aSig64
, bSig64
, q64
;
4561 uint32_t alternateASig
;
4563 a
= float32_squash_input_denormal(a
, status
);
4564 b
= float32_squash_input_denormal(b
, status
);
4566 aSig
= extractFloat32Frac( a
);
4567 aExp
= extractFloat32Exp( a
);
4568 aSign
= extractFloat32Sign( a
);
4569 bSig
= extractFloat32Frac( b
);
4570 bExp
= extractFloat32Exp( b
);
4571 if ( aExp
== 0xFF ) {
4572 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
4573 return propagateFloat32NaN(a
, b
, status
);
4575 float_raise(float_flag_invalid
, status
);
4576 return float32_default_nan(status
);
4578 if ( bExp
== 0xFF ) {
4580 return propagateFloat32NaN(a
, b
, status
);
4586 float_raise(float_flag_invalid
, status
);
4587 return float32_default_nan(status
);
4589 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
4592 if ( aSig
== 0 ) return a
;
4593 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4595 expDiff
= aExp
- bExp
;
4598 if ( expDiff
< 32 ) {
4601 if ( expDiff
< 0 ) {
4602 if ( expDiff
< -1 ) return a
;
4605 q
= ( bSig
<= aSig
);
4606 if ( q
) aSig
-= bSig
;
4607 if ( 0 < expDiff
) {
4608 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
4611 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4619 if ( bSig
<= aSig
) aSig
-= bSig
;
4620 aSig64
= ( (uint64_t) aSig
)<<40;
4621 bSig64
= ( (uint64_t) bSig
)<<40;
4623 while ( 0 < expDiff
) {
4624 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
4625 q64
= ( 2 < q64
) ? q64
- 2 : 0;
4626 aSig64
= - ( ( bSig
* q64
)<<38 );
4630 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
4631 q64
= ( 2 < q64
) ? q64
- 2 : 0;
4632 q
= q64
>>( 64 - expDiff
);
4634 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
4637 alternateASig
= aSig
;
4640 } while ( 0 <= (int32_t) aSig
);
4641 sigMean
= aSig
+ alternateASig
;
4642 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4643 aSig
= alternateASig
;
4645 zSign
= ( (int32_t) aSig
< 0 );
4646 if ( zSign
) aSig
= - aSig
;
4647 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
4652 /*----------------------------------------------------------------------------
4653 | Returns the binary exponential of the single-precision floating-point value
4654 | `a'. The operation is performed according to the IEC/IEEE Standard for
4655 | Binary Floating-Point Arithmetic.
4657 | Uses the following identities:
4659 | 1. -------------------------------------------------------------------------
4663 | 2. -------------------------------------------------------------------------
4666 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4668 *----------------------------------------------------------------------------*/
4670 static const float64 float32_exp2_coefficients
[15] =
4672 const_float64( 0x3ff0000000000000ll
), /* 1 */
4673 const_float64( 0x3fe0000000000000ll
), /* 2 */
4674 const_float64( 0x3fc5555555555555ll
), /* 3 */
4675 const_float64( 0x3fa5555555555555ll
), /* 4 */
4676 const_float64( 0x3f81111111111111ll
), /* 5 */
4677 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
4678 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
4679 const_float64( 0x3efa01a01a01a01all
), /* 8 */
4680 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
4681 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
4682 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
4683 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
4684 const_float64( 0x3de6124613a86d09ll
), /* 13 */
4685 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
4686 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
4689 float32
float32_exp2(float32 a
, float_status
*status
)
4696 a
= float32_squash_input_denormal(a
, status
);
4698 aSig
= extractFloat32Frac( a
);
4699 aExp
= extractFloat32Exp( a
);
4700 aSign
= extractFloat32Sign( a
);
4702 if ( aExp
== 0xFF) {
4704 return propagateFloat32NaN(a
, float32_zero
, status
);
4706 return (aSign
) ? float32_zero
: a
;
4709 if (aSig
== 0) return float32_one
;
4712 float_raise(float_flag_inexact
, status
);
4714 /* ******************************* */
4715 /* using float64 for approximation */
4716 /* ******************************* */
4717 x
= float32_to_float64(a
, status
);
4718 x
= float64_mul(x
, float64_ln2
, status
);
4722 for (i
= 0 ; i
< 15 ; i
++) {
4725 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
4726 r
= float64_add(r
, f
, status
);
4728 xn
= float64_mul(xn
, x
, status
);
4731 return float64_to_float32(r
, status
);
4734 /*----------------------------------------------------------------------------
4735 | Returns the binary log of the single-precision floating-point value `a'.
4736 | The operation is performed according to the IEC/IEEE Standard for Binary
4737 | Floating-Point Arithmetic.
4738 *----------------------------------------------------------------------------*/
4739 float32
float32_log2(float32 a
, float_status
*status
)
4743 uint32_t aSig
, zSig
, i
;
4745 a
= float32_squash_input_denormal(a
, status
);
4746 aSig
= extractFloat32Frac( a
);
4747 aExp
= extractFloat32Exp( a
);
4748 aSign
= extractFloat32Sign( a
);
4751 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
4752 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4755 float_raise(float_flag_invalid
, status
);
4756 return float32_default_nan(status
);
4758 if ( aExp
== 0xFF ) {
4760 return propagateFloat32NaN(a
, float32_zero
, status
);
4770 for (i
= 1 << 22; i
> 0; i
>>= 1) {
4771 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
4772 if ( aSig
& 0x01000000 ) {
4781 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
4784 /*----------------------------------------------------------------------------
4785 | Returns 1 if the single-precision floating-point value `a' is equal to
4786 | the corresponding value `b', and 0 otherwise. The invalid exception is
4787 | raised if either operand is a NaN. Otherwise, the comparison is performed
4788 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4789 *----------------------------------------------------------------------------*/
4791 int float32_eq(float32 a
, float32 b
, float_status
*status
)
4794 a
= float32_squash_input_denormal(a
, status
);
4795 b
= float32_squash_input_denormal(b
, status
);
4797 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4798 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4800 float_raise(float_flag_invalid
, status
);
4803 av
= float32_val(a
);
4804 bv
= float32_val(b
);
4805 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
4808 /*----------------------------------------------------------------------------
4809 | Returns 1 if the single-precision floating-point value `a' is less than
4810 | or equal to the corresponding value `b', and 0 otherwise. The invalid
4811 | exception is raised if either operand is a NaN. The comparison is performed
4812 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4813 *----------------------------------------------------------------------------*/
4815 int float32_le(float32 a
, float32 b
, float_status
*status
)
4819 a
= float32_squash_input_denormal(a
, status
);
4820 b
= float32_squash_input_denormal(b
, status
);
4822 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4823 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4825 float_raise(float_flag_invalid
, status
);
4828 aSign
= extractFloat32Sign( a
);
4829 bSign
= extractFloat32Sign( b
);
4830 av
= float32_val(a
);
4831 bv
= float32_val(b
);
4832 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
4833 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4837 /*----------------------------------------------------------------------------
4838 | Returns 1 if the single-precision floating-point value `a' is less than
4839 | the corresponding value `b', and 0 otherwise. The invalid exception is
4840 | raised if either operand is a NaN. The comparison is performed according
4841 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4842 *----------------------------------------------------------------------------*/
4844 int float32_lt(float32 a
, float32 b
, float_status
*status
)
4848 a
= float32_squash_input_denormal(a
, status
);
4849 b
= float32_squash_input_denormal(b
, status
);
4851 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4852 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4854 float_raise(float_flag_invalid
, status
);
4857 aSign
= extractFloat32Sign( a
);
4858 bSign
= extractFloat32Sign( b
);
4859 av
= float32_val(a
);
4860 bv
= float32_val(b
);
4861 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
4862 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4866 /*----------------------------------------------------------------------------
4867 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4868 | be compared, and 0 otherwise. The invalid exception is raised if either
4869 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4870 | Standard for Binary Floating-Point Arithmetic.
4871 *----------------------------------------------------------------------------*/
4873 int float32_unordered(float32 a
, float32 b
, float_status
*status
)
4875 a
= float32_squash_input_denormal(a
, status
);
4876 b
= float32_squash_input_denormal(b
, status
);
4878 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4879 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4881 float_raise(float_flag_invalid
, status
);
4887 /*----------------------------------------------------------------------------
4888 | Returns 1 if the single-precision floating-point value `a' is equal to
4889 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4890 | exception. The comparison is performed according to the IEC/IEEE Standard
4891 | for Binary Floating-Point Arithmetic.
4892 *----------------------------------------------------------------------------*/
4894 int float32_eq_quiet(float32 a
, float32 b
, float_status
*status
)
4896 a
= float32_squash_input_denormal(a
, status
);
4897 b
= float32_squash_input_denormal(b
, status
);
4899 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4900 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4902 if (float32_is_signaling_nan(a
, status
)
4903 || float32_is_signaling_nan(b
, status
)) {
4904 float_raise(float_flag_invalid
, status
);
4908 return ( float32_val(a
) == float32_val(b
) ) ||
4909 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
4912 /*----------------------------------------------------------------------------
4913 | Returns 1 if the single-precision floating-point value `a' is less than or
4914 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4915 | cause an exception. Otherwise, the comparison is performed according to the
4916 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4917 *----------------------------------------------------------------------------*/
4919 int float32_le_quiet(float32 a
, float32 b
, float_status
*status
)
4923 a
= float32_squash_input_denormal(a
, status
);
4924 b
= float32_squash_input_denormal(b
, status
);
4926 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4927 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4929 if (float32_is_signaling_nan(a
, status
)
4930 || float32_is_signaling_nan(b
, status
)) {
4931 float_raise(float_flag_invalid
, status
);
4935 aSign
= extractFloat32Sign( a
);
4936 bSign
= extractFloat32Sign( b
);
4937 av
= float32_val(a
);
4938 bv
= float32_val(b
);
4939 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
4940 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4944 /*----------------------------------------------------------------------------
4945 | Returns 1 if the single-precision floating-point value `a' is less than
4946 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4947 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4948 | Standard for Binary Floating-Point Arithmetic.
4949 *----------------------------------------------------------------------------*/
4951 int float32_lt_quiet(float32 a
, float32 b
, float_status
*status
)
4955 a
= float32_squash_input_denormal(a
, status
);
4956 b
= float32_squash_input_denormal(b
, status
);
4958 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4959 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4961 if (float32_is_signaling_nan(a
, status
)
4962 || float32_is_signaling_nan(b
, status
)) {
4963 float_raise(float_flag_invalid
, status
);
4967 aSign
= extractFloat32Sign( a
);
4968 bSign
= extractFloat32Sign( b
);
4969 av
= float32_val(a
);
4970 bv
= float32_val(b
);
4971 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
4972 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4976 /*----------------------------------------------------------------------------
4977 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4978 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4979 | comparison is performed according to the IEC/IEEE Standard for Binary
4980 | Floating-Point Arithmetic.
4981 *----------------------------------------------------------------------------*/
4983 int float32_unordered_quiet(float32 a
, float32 b
, float_status
*status
)
4985 a
= float32_squash_input_denormal(a
, status
);
4986 b
= float32_squash_input_denormal(b
, status
);
4988 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4989 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4991 if (float32_is_signaling_nan(a
, status
)
4992 || float32_is_signaling_nan(b
, status
)) {
4993 float_raise(float_flag_invalid
, status
);
5000 /*----------------------------------------------------------------------------
5001 | Returns the result of converting the double-precision floating-point value
5002 | `a' to the extended double-precision floating-point format. The conversion
5003 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5005 *----------------------------------------------------------------------------*/
5007 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
5013 a
= float64_squash_input_denormal(a
, status
);
5014 aSig
= extractFloat64Frac( a
);
5015 aExp
= extractFloat64Exp( a
);
5016 aSign
= extractFloat64Sign( a
);
5017 if ( aExp
== 0x7FF ) {
5019 return commonNaNToFloatx80(float64ToCommonNaN(a
, status
), status
);
5021 return packFloatx80(aSign
,
5022 floatx80_infinity_high
,
5023 floatx80_infinity_low
);
5026 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5027 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5031 aSign
, aExp
+ 0x3C00, (aSig
| UINT64_C(0x0010000000000000)) << 11);
5035 /*----------------------------------------------------------------------------
5036 | Returns the result of converting the double-precision floating-point value
5037 | `a' to the quadruple-precision floating-point format. The conversion is
5038 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5040 *----------------------------------------------------------------------------*/
5042 float128
float64_to_float128(float64 a
, float_status
*status
)
5046 uint64_t aSig
, zSig0
, zSig1
;
5048 a
= float64_squash_input_denormal(a
, status
);
5049 aSig
= extractFloat64Frac( a
);
5050 aExp
= extractFloat64Exp( a
);
5051 aSign
= extractFloat64Sign( a
);
5052 if ( aExp
== 0x7FF ) {
5054 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
5056 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5059 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5060 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5063 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
5064 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
5069 /*----------------------------------------------------------------------------
5070 | Returns the remainder of the double-precision floating-point value `a'
5071 | with respect to the corresponding value `b'. The operation is performed
5072 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5073 *----------------------------------------------------------------------------*/
5075 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5078 int aExp
, bExp
, expDiff
;
5079 uint64_t aSig
, bSig
;
5080 uint64_t q
, alternateASig
;
5083 a
= float64_squash_input_denormal(a
, status
);
5084 b
= float64_squash_input_denormal(b
, status
);
5085 aSig
= extractFloat64Frac( a
);
5086 aExp
= extractFloat64Exp( a
);
5087 aSign
= extractFloat64Sign( a
);
5088 bSig
= extractFloat64Frac( b
);
5089 bExp
= extractFloat64Exp( b
);
5090 if ( aExp
== 0x7FF ) {
5091 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5092 return propagateFloat64NaN(a
, b
, status
);
5094 float_raise(float_flag_invalid
, status
);
5095 return float64_default_nan(status
);
5097 if ( bExp
== 0x7FF ) {
5099 return propagateFloat64NaN(a
, b
, status
);
5105 float_raise(float_flag_invalid
, status
);
5106 return float64_default_nan(status
);
5108 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5111 if ( aSig
== 0 ) return a
;
5112 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5114 expDiff
= aExp
- bExp
;
5115 aSig
= (aSig
| UINT64_C(0x0010000000000000)) << 11;
5116 bSig
= (bSig
| UINT64_C(0x0010000000000000)) << 11;
5117 if ( expDiff
< 0 ) {
5118 if ( expDiff
< -1 ) return a
;
5121 q
= ( bSig
<= aSig
);
5122 if ( q
) aSig
-= bSig
;
5124 while ( 0 < expDiff
) {
5125 q
= estimateDiv128To64( aSig
, 0, bSig
);
5126 q
= ( 2 < q
) ? q
- 2 : 0;
5127 aSig
= - ( ( bSig
>>2 ) * q
);
5131 if ( 0 < expDiff
) {
5132 q
= estimateDiv128To64( aSig
, 0, bSig
);
5133 q
= ( 2 < q
) ? q
- 2 : 0;
5136 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5143 alternateASig
= aSig
;
5146 } while ( 0 <= (int64_t) aSig
);
5147 sigMean
= aSig
+ alternateASig
;
5148 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5149 aSig
= alternateASig
;
5151 zSign
= ( (int64_t) aSig
< 0 );
5152 if ( zSign
) aSig
= - aSig
;
5153 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5157 /*----------------------------------------------------------------------------
5158 | Returns the binary log of the double-precision floating-point value `a'.
5159 | The operation is performed according to the IEC/IEEE Standard for Binary
5160 | Floating-Point Arithmetic.
5161 *----------------------------------------------------------------------------*/
5162 float64
float64_log2(float64 a
, float_status
*status
)
5166 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5167 a
= float64_squash_input_denormal(a
, status
);
5169 aSig
= extractFloat64Frac( a
);
5170 aExp
= extractFloat64Exp( a
);
5171 aSign
= extractFloat64Sign( a
);
5174 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5175 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5178 float_raise(float_flag_invalid
, status
);
5179 return float64_default_nan(status
);
5181 if ( aExp
== 0x7FF ) {
5183 return propagateFloat64NaN(a
, float64_zero
, status
);
5189 aSig
|= UINT64_C(0x0010000000000000);
5191 zSig
= (uint64_t)aExp
<< 52;
5192 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5193 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5194 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5195 if ( aSig
& UINT64_C(0x0020000000000000) ) {
5203 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5206 /*----------------------------------------------------------------------------
5207 | Returns 1 if the double-precision floating-point value `a' is equal to the
5208 | corresponding value `b', and 0 otherwise. The invalid exception is raised
5209 | if either operand is a NaN. Otherwise, the comparison is performed
5210 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5211 *----------------------------------------------------------------------------*/
5213 int float64_eq(float64 a
, float64 b
, float_status
*status
)
5216 a
= float64_squash_input_denormal(a
, status
);
5217 b
= float64_squash_input_denormal(b
, status
);
5219 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5220 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5222 float_raise(float_flag_invalid
, status
);
5225 av
= float64_val(a
);
5226 bv
= float64_val(b
);
5227 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5231 /*----------------------------------------------------------------------------
5232 | Returns 1 if the double-precision floating-point value `a' is less than or
5233 | equal to the corresponding value `b', and 0 otherwise. The invalid
5234 | exception is raised if either operand is a NaN. The comparison is performed
5235 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5236 *----------------------------------------------------------------------------*/
5238 int float64_le(float64 a
, float64 b
, float_status
*status
)
5242 a
= float64_squash_input_denormal(a
, status
);
5243 b
= float64_squash_input_denormal(b
, status
);
5245 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5246 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5248 float_raise(float_flag_invalid
, status
);
5251 aSign
= extractFloat64Sign( a
);
5252 bSign
= extractFloat64Sign( b
);
5253 av
= float64_val(a
);
5254 bv
= float64_val(b
);
5255 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5256 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
5260 /*----------------------------------------------------------------------------
5261 | Returns 1 if the double-precision floating-point value `a' is less than
5262 | the corresponding value `b', and 0 otherwise. The invalid exception is
5263 | raised if either operand is a NaN. The comparison is performed according
5264 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5265 *----------------------------------------------------------------------------*/
5267 int float64_lt(float64 a
, float64 b
, float_status
*status
)
5272 a
= float64_squash_input_denormal(a
, status
);
5273 b
= float64_squash_input_denormal(b
, status
);
5274 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5275 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5277 float_raise(float_flag_invalid
, status
);
5280 aSign
= extractFloat64Sign( a
);
5281 bSign
= extractFloat64Sign( b
);
5282 av
= float64_val(a
);
5283 bv
= float64_val(b
);
5284 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
5285 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
5289 /*----------------------------------------------------------------------------
5290 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5291 | be compared, and 0 otherwise. The invalid exception is raised if either
5292 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5293 | Standard for Binary Floating-Point Arithmetic.
5294 *----------------------------------------------------------------------------*/
5296 int float64_unordered(float64 a
, float64 b
, float_status
*status
)
5298 a
= float64_squash_input_denormal(a
, status
);
5299 b
= float64_squash_input_denormal(b
, status
);
5301 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5302 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5304 float_raise(float_flag_invalid
, status
);
5310 /*----------------------------------------------------------------------------
5311 | Returns 1 if the double-precision floating-point value `a' is equal to the
5312 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5313 | exception.The comparison is performed according to the IEC/IEEE Standard
5314 | for Binary Floating-Point Arithmetic.
5315 *----------------------------------------------------------------------------*/
5317 int float64_eq_quiet(float64 a
, float64 b
, float_status
*status
)
5320 a
= float64_squash_input_denormal(a
, status
);
5321 b
= float64_squash_input_denormal(b
, status
);
5323 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5324 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5326 if (float64_is_signaling_nan(a
, status
)
5327 || float64_is_signaling_nan(b
, status
)) {
5328 float_raise(float_flag_invalid
, status
);
5332 av
= float64_val(a
);
5333 bv
= float64_val(b
);
5334 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5338 /*----------------------------------------------------------------------------
5339 | Returns 1 if the double-precision floating-point value `a' is less than or
5340 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5341 | cause an exception. Otherwise, the comparison is performed according to the
5342 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5343 *----------------------------------------------------------------------------*/
5345 int float64_le_quiet(float64 a
, float64 b
, float_status
*status
)
5349 a
= float64_squash_input_denormal(a
, status
);
5350 b
= float64_squash_input_denormal(b
, status
);
5352 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5353 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5355 if (float64_is_signaling_nan(a
, status
)
5356 || float64_is_signaling_nan(b
, status
)) {
5357 float_raise(float_flag_invalid
, status
);
5361 aSign
= extractFloat64Sign( a
);
5362 bSign
= extractFloat64Sign( b
);
5363 av
= float64_val(a
);
5364 bv
= float64_val(b
);
5365 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5366 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
5370 /*----------------------------------------------------------------------------
5371 | Returns 1 if the double-precision floating-point value `a' is less than
5372 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5373 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5374 | Standard for Binary Floating-Point Arithmetic.
5375 *----------------------------------------------------------------------------*/
5377 int float64_lt_quiet(float64 a
, float64 b
, float_status
*status
)
5381 a
= float64_squash_input_denormal(a
, status
);
5382 b
= float64_squash_input_denormal(b
, status
);
5384 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5385 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5387 if (float64_is_signaling_nan(a
, status
)
5388 || float64_is_signaling_nan(b
, status
)) {
5389 float_raise(float_flag_invalid
, status
);
5393 aSign
= extractFloat64Sign( a
);
5394 bSign
= extractFloat64Sign( b
);
5395 av
= float64_val(a
);
5396 bv
= float64_val(b
);
5397 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
5398 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
5402 /*----------------------------------------------------------------------------
5403 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5404 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5405 | comparison is performed according to the IEC/IEEE Standard for Binary
5406 | Floating-Point Arithmetic.
5407 *----------------------------------------------------------------------------*/
5409 int float64_unordered_quiet(float64 a
, float64 b
, float_status
*status
)
5411 a
= float64_squash_input_denormal(a
, status
);
5412 b
= float64_squash_input_denormal(b
, status
);
5414 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5415 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5417 if (float64_is_signaling_nan(a
, status
)
5418 || float64_is_signaling_nan(b
, status
)) {
5419 float_raise(float_flag_invalid
, status
);
5426 /*----------------------------------------------------------------------------
5427 | Returns the result of converting the extended double-precision floating-
5428 | point value `a' to the 32-bit two's complement integer format. The
5429 | conversion is performed according to the IEC/IEEE Standard for Binary
5430 | Floating-Point Arithmetic---which means in particular that the conversion
5431 | is rounded according to the current rounding mode. If `a' is a NaN, the
5432 | largest positive integer is returned. Otherwise, if the conversion
5433 | overflows, the largest integer with the same sign as `a' is returned.
5434 *----------------------------------------------------------------------------*/
5436 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5439 int32_t aExp
, shiftCount
;
5442 if (floatx80_invalid_encoding(a
)) {
5443 float_raise(float_flag_invalid
, status
);
5446 aSig
= extractFloatx80Frac( a
);
5447 aExp
= extractFloatx80Exp( a
);
5448 aSign
= extractFloatx80Sign( a
);
5449 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5450 shiftCount
= 0x4037 - aExp
;
5451 if ( shiftCount
<= 0 ) shiftCount
= 1;
5452 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5453 return roundAndPackInt32(aSign
, aSig
, status
);
5457 /*----------------------------------------------------------------------------
5458 | Returns the result of converting the extended double-precision floating-
5459 | point value `a' to the 32-bit two's complement integer format. The
5460 | conversion is performed according to the IEC/IEEE Standard for Binary
5461 | Floating-Point Arithmetic, except that the conversion is always rounded
5462 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5463 | Otherwise, if the conversion overflows, the largest integer with the same
5464 | sign as `a' is returned.
5465 *----------------------------------------------------------------------------*/
5467 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5470 int32_t aExp
, shiftCount
;
5471 uint64_t aSig
, savedASig
;
5474 if (floatx80_invalid_encoding(a
)) {
5475 float_raise(float_flag_invalid
, status
);
5478 aSig
= extractFloatx80Frac( a
);
5479 aExp
= extractFloatx80Exp( a
);
5480 aSign
= extractFloatx80Sign( a
);
5481 if ( 0x401E < aExp
) {
5482 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5485 else if ( aExp
< 0x3FFF ) {
5487 status
->float_exception_flags
|= float_flag_inexact
;
5491 shiftCount
= 0x403E - aExp
;
5493 aSig
>>= shiftCount
;
5495 if ( aSign
) z
= - z
;
5496 if ( ( z
< 0 ) ^ aSign
) {
5498 float_raise(float_flag_invalid
, status
);
5499 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5501 if ( ( aSig
<<shiftCount
) != savedASig
) {
5502 status
->float_exception_flags
|= float_flag_inexact
;
5508 /*----------------------------------------------------------------------------
5509 | Returns the result of converting the extended double-precision floating-
5510 | point value `a' to the 64-bit two's complement integer format. The
5511 | conversion is performed according to the IEC/IEEE Standard for Binary
5512 | Floating-Point Arithmetic---which means in particular that the conversion
5513 | is rounded according to the current rounding mode. If `a' is a NaN,
5514 | the largest positive integer is returned. Otherwise, if the conversion
5515 | overflows, the largest integer with the same sign as `a' is returned.
5516 *----------------------------------------------------------------------------*/
5518 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5521 int32_t aExp
, shiftCount
;
5522 uint64_t aSig
, aSigExtra
;
5524 if (floatx80_invalid_encoding(a
)) {
5525 float_raise(float_flag_invalid
, status
);
5528 aSig
= extractFloatx80Frac( a
);
5529 aExp
= extractFloatx80Exp( a
);
5530 aSign
= extractFloatx80Sign( a
);
5531 shiftCount
= 0x403E - aExp
;
5532 if ( shiftCount
<= 0 ) {
5534 float_raise(float_flag_invalid
, status
);
5535 if (!aSign
|| floatx80_is_any_nan(a
)) {
5543 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5545 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5549 /*----------------------------------------------------------------------------
5550 | Returns the result of converting the extended double-precision floating-
5551 | point value `a' to the 64-bit two's complement integer format. The
5552 | conversion is performed according to the IEC/IEEE Standard for Binary
5553 | Floating-Point Arithmetic, except that the conversion is always rounded
5554 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5555 | Otherwise, if the conversion overflows, the largest integer with the same
5556 | sign as `a' is returned.
5557 *----------------------------------------------------------------------------*/
5559 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5562 int32_t aExp
, shiftCount
;
5566 if (floatx80_invalid_encoding(a
)) {
5567 float_raise(float_flag_invalid
, status
);
5570 aSig
= extractFloatx80Frac( a
);
5571 aExp
= extractFloatx80Exp( a
);
5572 aSign
= extractFloatx80Sign( a
);
5573 shiftCount
= aExp
- 0x403E;
5574 if ( 0 <= shiftCount
) {
5575 aSig
&= UINT64_C(0x7FFFFFFFFFFFFFFF);
5576 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5577 float_raise(float_flag_invalid
, status
);
5578 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5584 else if ( aExp
< 0x3FFF ) {
5586 status
->float_exception_flags
|= float_flag_inexact
;
5590 z
= aSig
>>( - shiftCount
);
5591 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5592 status
->float_exception_flags
|= float_flag_inexact
;
5594 if ( aSign
) z
= - z
;
5599 /*----------------------------------------------------------------------------
5600 | Returns the result of converting the extended double-precision floating-
5601 | point value `a' to the single-precision floating-point format. The
5602 | conversion is performed according to the IEC/IEEE Standard for Binary
5603 | Floating-Point Arithmetic.
5604 *----------------------------------------------------------------------------*/
5606 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5612 if (floatx80_invalid_encoding(a
)) {
5613 float_raise(float_flag_invalid
, status
);
5614 return float32_default_nan(status
);
5616 aSig
= extractFloatx80Frac( a
);
5617 aExp
= extractFloatx80Exp( a
);
5618 aSign
= extractFloatx80Sign( a
);
5619 if ( aExp
== 0x7FFF ) {
5620 if ( (uint64_t) ( aSig
<<1 ) ) {
5621 return commonNaNToFloat32(floatx80ToCommonNaN(a
, status
), status
);
5623 return packFloat32( aSign
, 0xFF, 0 );
5625 shift64RightJamming( aSig
, 33, &aSig
);
5626 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5627 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5631 /*----------------------------------------------------------------------------
5632 | Returns the result of converting the extended double-precision floating-
5633 | point value `a' to the double-precision floating-point format. The
5634 | conversion is performed according to the IEC/IEEE Standard for Binary
5635 | Floating-Point Arithmetic.
5636 *----------------------------------------------------------------------------*/
5638 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5642 uint64_t aSig
, zSig
;
5644 if (floatx80_invalid_encoding(a
)) {
5645 float_raise(float_flag_invalid
, status
);
5646 return float64_default_nan(status
);
5648 aSig
= extractFloatx80Frac( a
);
5649 aExp
= extractFloatx80Exp( a
);
5650 aSign
= extractFloatx80Sign( a
);
5651 if ( aExp
== 0x7FFF ) {
5652 if ( (uint64_t) ( aSig
<<1 ) ) {
5653 return commonNaNToFloat64(floatx80ToCommonNaN(a
, status
), status
);
5655 return packFloat64( aSign
, 0x7FF, 0 );
5657 shift64RightJamming( aSig
, 1, &zSig
);
5658 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5659 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5663 /*----------------------------------------------------------------------------
5664 | Returns the result of converting the extended double-precision floating-
5665 | point value `a' to the quadruple-precision floating-point format. The
5666 | conversion is performed according to the IEC/IEEE Standard for Binary
5667 | Floating-Point Arithmetic.
5668 *----------------------------------------------------------------------------*/
5670 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5674 uint64_t aSig
, zSig0
, zSig1
;
5676 if (floatx80_invalid_encoding(a
)) {
5677 float_raise(float_flag_invalid
, status
);
5678 return float128_default_nan(status
);
5680 aSig
= extractFloatx80Frac( a
);
5681 aExp
= extractFloatx80Exp( a
);
5682 aSign
= extractFloatx80Sign( a
);
5683 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5684 return commonNaNToFloat128(floatx80ToCommonNaN(a
, status
), status
);
5686 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5687 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5691 /*----------------------------------------------------------------------------
5692 | Rounds the extended double-precision floating-point value `a'
5693 | to the precision provided by floatx80_rounding_precision and returns the
5694 | result as an extended double-precision floating-point value.
5695 | The operation is performed according to the IEC/IEEE Standard for Binary
5696 | Floating-Point Arithmetic.
5697 *----------------------------------------------------------------------------*/
5699 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5701 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5702 extractFloatx80Sign(a
),
5703 extractFloatx80Exp(a
),
5704 extractFloatx80Frac(a
), 0, status
);
5707 /*----------------------------------------------------------------------------
5708 | Rounds the extended double-precision floating-point value `a' to an integer,
5709 | and returns the result as an extended quadruple-precision floating-point
5710 | value. The operation is performed according to the IEC/IEEE Standard for
5711 | Binary Floating-Point Arithmetic.
5712 *----------------------------------------------------------------------------*/
5714 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5718 uint64_t lastBitMask
, roundBitsMask
;
5721 if (floatx80_invalid_encoding(a
)) {
5722 float_raise(float_flag_invalid
, status
);
5723 return floatx80_default_nan(status
);
5725 aExp
= extractFloatx80Exp( a
);
5726 if ( 0x403E <= aExp
) {
5727 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5728 return propagateFloatx80NaN(a
, a
, status
);
5732 if ( aExp
< 0x3FFF ) {
5734 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
5737 status
->float_exception_flags
|= float_flag_inexact
;
5738 aSign
= extractFloatx80Sign( a
);
5739 switch (status
->float_rounding_mode
) {
5740 case float_round_nearest_even
:
5741 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5744 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5747 case float_round_ties_away
:
5748 if (aExp
== 0x3FFE) {
5749 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5752 case float_round_down
:
5755 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5756 : packFloatx80( 0, 0, 0 );
5757 case float_round_up
:
5759 aSign
? packFloatx80( 1, 0, 0 )
5760 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5762 return packFloatx80( aSign
, 0, 0 );
5765 lastBitMask
<<= 0x403E - aExp
;
5766 roundBitsMask
= lastBitMask
- 1;
5768 switch (status
->float_rounding_mode
) {
5769 case float_round_nearest_even
:
5770 z
.low
+= lastBitMask
>>1;
5771 if ((z
.low
& roundBitsMask
) == 0) {
5772 z
.low
&= ~lastBitMask
;
5775 case float_round_ties_away
:
5776 z
.low
+= lastBitMask
>> 1;
5778 case float_round_to_zero
:
5780 case float_round_up
:
5781 if (!extractFloatx80Sign(z
)) {
5782 z
.low
+= roundBitsMask
;
5785 case float_round_down
:
5786 if (extractFloatx80Sign(z
)) {
5787 z
.low
+= roundBitsMask
;
5793 z
.low
&= ~ roundBitsMask
;
5796 z
.low
= UINT64_C(0x8000000000000000);
5798 if (z
.low
!= a
.low
) {
5799 status
->float_exception_flags
|= float_flag_inexact
;
5805 /*----------------------------------------------------------------------------
5806 | Returns the result of adding the absolute values of the extended double-
5807 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5808 | negated before being returned. `zSign' is ignored if the result is a NaN.
5809 | The addition is performed according to the IEC/IEEE Standard for Binary
5810 | Floating-Point Arithmetic.
5811 *----------------------------------------------------------------------------*/
5813 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5814 float_status
*status
)
5816 int32_t aExp
, bExp
, zExp
;
5817 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5820 aSig
= extractFloatx80Frac( a
);
5821 aExp
= extractFloatx80Exp( a
);
5822 bSig
= extractFloatx80Frac( b
);
5823 bExp
= extractFloatx80Exp( b
);
5824 expDiff
= aExp
- bExp
;
5825 if ( 0 < expDiff
) {
5826 if ( aExp
== 0x7FFF ) {
5827 if ((uint64_t)(aSig
<< 1)) {
5828 return propagateFloatx80NaN(a
, b
, status
);
5832 if ( bExp
== 0 ) --expDiff
;
5833 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5836 else if ( expDiff
< 0 ) {
5837 if ( bExp
== 0x7FFF ) {
5838 if ((uint64_t)(bSig
<< 1)) {
5839 return propagateFloatx80NaN(a
, b
, status
);
5841 return packFloatx80(zSign
,
5842 floatx80_infinity_high
,
5843 floatx80_infinity_low
);
5845 if ( aExp
== 0 ) ++expDiff
;
5846 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5850 if ( aExp
== 0x7FFF ) {
5851 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5852 return propagateFloatx80NaN(a
, b
, status
);
5857 zSig0
= aSig
+ bSig
;
5860 return packFloatx80(zSign
, 0, 0);
5862 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5868 zSig0
= aSig
+ bSig
;
5869 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5871 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5872 zSig0
|= UINT64_C(0x8000000000000000);
5875 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5876 zSign
, zExp
, zSig0
, zSig1
, status
);
5879 /*----------------------------------------------------------------------------
5880 | Returns the result of subtracting the absolute values of the extended
5881 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5882 | difference is negated before being returned. `zSign' is ignored if the
5883 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5884 | Standard for Binary Floating-Point Arithmetic.
5885 *----------------------------------------------------------------------------*/
5887 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5888 float_status
*status
)
5890 int32_t aExp
, bExp
, zExp
;
5891 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5894 aSig
= extractFloatx80Frac( a
);
5895 aExp
= extractFloatx80Exp( a
);
5896 bSig
= extractFloatx80Frac( b
);
5897 bExp
= extractFloatx80Exp( b
);
5898 expDiff
= aExp
- bExp
;
5899 if ( 0 < expDiff
) goto aExpBigger
;
5900 if ( expDiff
< 0 ) goto bExpBigger
;
5901 if ( aExp
== 0x7FFF ) {
5902 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5903 return propagateFloatx80NaN(a
, b
, status
);
5905 float_raise(float_flag_invalid
, status
);
5906 return floatx80_default_nan(status
);
5913 if ( bSig
< aSig
) goto aBigger
;
5914 if ( aSig
< bSig
) goto bBigger
;
5915 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5917 if ( bExp
== 0x7FFF ) {
5918 if ((uint64_t)(bSig
<< 1)) {
5919 return propagateFloatx80NaN(a
, b
, status
);
5921 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
5922 floatx80_infinity_low
);
5924 if ( aExp
== 0 ) ++expDiff
;
5925 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5927 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5930 goto normalizeRoundAndPack
;
5932 if ( aExp
== 0x7FFF ) {
5933 if ((uint64_t)(aSig
<< 1)) {
5934 return propagateFloatx80NaN(a
, b
, status
);
5938 if ( bExp
== 0 ) --expDiff
;
5939 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5941 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5943 normalizeRoundAndPack
:
5944 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5945 zSign
, zExp
, zSig0
, zSig1
, status
);
5948 /*----------------------------------------------------------------------------
5949 | Returns the result of adding the extended double-precision floating-point
5950 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5951 | Standard for Binary Floating-Point Arithmetic.
5952 *----------------------------------------------------------------------------*/
5954 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5958 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5959 float_raise(float_flag_invalid
, status
);
5960 return floatx80_default_nan(status
);
5962 aSign
= extractFloatx80Sign( a
);
5963 bSign
= extractFloatx80Sign( b
);
5964 if ( aSign
== bSign
) {
5965 return addFloatx80Sigs(a
, b
, aSign
, status
);
5968 return subFloatx80Sigs(a
, b
, aSign
, status
);
5973 /*----------------------------------------------------------------------------
5974 | Returns the result of subtracting the extended double-precision floating-
5975 | point values `a' and `b'. The operation is performed according to the
5976 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5977 *----------------------------------------------------------------------------*/
5979 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5983 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5984 float_raise(float_flag_invalid
, status
);
5985 return floatx80_default_nan(status
);
5987 aSign
= extractFloatx80Sign( a
);
5988 bSign
= extractFloatx80Sign( b
);
5989 if ( aSign
== bSign
) {
5990 return subFloatx80Sigs(a
, b
, aSign
, status
);
5993 return addFloatx80Sigs(a
, b
, aSign
, status
);
5998 /*----------------------------------------------------------------------------
5999 | Returns the result of multiplying the extended double-precision floating-
6000 | point values `a' and `b'. The operation is performed according to the
6001 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6002 *----------------------------------------------------------------------------*/
6004 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
6006 flag aSign
, bSign
, zSign
;
6007 int32_t aExp
, bExp
, zExp
;
6008 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6010 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6011 float_raise(float_flag_invalid
, status
);
6012 return floatx80_default_nan(status
);
6014 aSig
= extractFloatx80Frac( a
);
6015 aExp
= extractFloatx80Exp( a
);
6016 aSign
= extractFloatx80Sign( a
);
6017 bSig
= extractFloatx80Frac( b
);
6018 bExp
= extractFloatx80Exp( b
);
6019 bSign
= extractFloatx80Sign( b
);
6020 zSign
= aSign
^ bSign
;
6021 if ( aExp
== 0x7FFF ) {
6022 if ( (uint64_t) ( aSig
<<1 )
6023 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6024 return propagateFloatx80NaN(a
, b
, status
);
6026 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
6027 return packFloatx80(zSign
, floatx80_infinity_high
,
6028 floatx80_infinity_low
);
6030 if ( bExp
== 0x7FFF ) {
6031 if ((uint64_t)(bSig
<< 1)) {
6032 return propagateFloatx80NaN(a
, b
, status
);
6034 if ( ( aExp
| aSig
) == 0 ) {
6036 float_raise(float_flag_invalid
, status
);
6037 return floatx80_default_nan(status
);
6039 return packFloatx80(zSign
, floatx80_infinity_high
,
6040 floatx80_infinity_low
);
6043 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6044 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6047 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6048 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6050 zExp
= aExp
+ bExp
- 0x3FFE;
6051 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
6052 if ( 0 < (int64_t) zSig0
) {
6053 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6056 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6057 zSign
, zExp
, zSig0
, zSig1
, status
);
6060 /*----------------------------------------------------------------------------
6061 | Returns the result of dividing the extended double-precision floating-point
6062 | value `a' by the corresponding value `b'. The operation is performed
6063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6064 *----------------------------------------------------------------------------*/
6066 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6068 flag aSign
, bSign
, zSign
;
6069 int32_t aExp
, bExp
, zExp
;
6070 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6071 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6073 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6074 float_raise(float_flag_invalid
, status
);
6075 return floatx80_default_nan(status
);
6077 aSig
= extractFloatx80Frac( a
);
6078 aExp
= extractFloatx80Exp( a
);
6079 aSign
= extractFloatx80Sign( a
);
6080 bSig
= extractFloatx80Frac( b
);
6081 bExp
= extractFloatx80Exp( b
);
6082 bSign
= extractFloatx80Sign( b
);
6083 zSign
= aSign
^ bSign
;
6084 if ( aExp
== 0x7FFF ) {
6085 if ((uint64_t)(aSig
<< 1)) {
6086 return propagateFloatx80NaN(a
, b
, status
);
6088 if ( bExp
== 0x7FFF ) {
6089 if ((uint64_t)(bSig
<< 1)) {
6090 return propagateFloatx80NaN(a
, b
, status
);
6094 return packFloatx80(zSign
, floatx80_infinity_high
,
6095 floatx80_infinity_low
);
6097 if ( bExp
== 0x7FFF ) {
6098 if ((uint64_t)(bSig
<< 1)) {
6099 return propagateFloatx80NaN(a
, b
, status
);
6101 return packFloatx80( zSign
, 0, 0 );
6105 if ( ( aExp
| aSig
) == 0 ) {
6107 float_raise(float_flag_invalid
, status
);
6108 return floatx80_default_nan(status
);
6110 float_raise(float_flag_divbyzero
, status
);
6111 return packFloatx80(zSign
, floatx80_infinity_high
,
6112 floatx80_infinity_low
);
6114 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6117 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6118 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6120 zExp
= aExp
- bExp
+ 0x3FFE;
6122 if ( bSig
<= aSig
) {
6123 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6126 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6127 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6128 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6129 while ( (int64_t) rem0
< 0 ) {
6131 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6133 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6134 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6135 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6136 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6137 while ( (int64_t) rem1
< 0 ) {
6139 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6141 zSig1
|= ( ( rem1
| rem2
) != 0 );
6143 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6144 zSign
, zExp
, zSig0
, zSig1
, status
);
6147 /*----------------------------------------------------------------------------
6148 | Returns the remainder of the extended double-precision floating-point value
6149 | `a' with respect to the corresponding value `b'. The operation is performed
6150 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6151 *----------------------------------------------------------------------------*/
6153 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6156 int32_t aExp
, bExp
, expDiff
;
6157 uint64_t aSig0
, aSig1
, bSig
;
6158 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6160 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6161 float_raise(float_flag_invalid
, status
);
6162 return floatx80_default_nan(status
);
6164 aSig0
= extractFloatx80Frac( a
);
6165 aExp
= extractFloatx80Exp( a
);
6166 aSign
= extractFloatx80Sign( a
);
6167 bSig
= extractFloatx80Frac( b
);
6168 bExp
= extractFloatx80Exp( b
);
6169 if ( aExp
== 0x7FFF ) {
6170 if ( (uint64_t) ( aSig0
<<1 )
6171 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6172 return propagateFloatx80NaN(a
, b
, status
);
6176 if ( bExp
== 0x7FFF ) {
6177 if ((uint64_t)(bSig
<< 1)) {
6178 return propagateFloatx80NaN(a
, b
, status
);
6185 float_raise(float_flag_invalid
, status
);
6186 return floatx80_default_nan(status
);
6188 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6191 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
6192 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6194 bSig
|= UINT64_C(0x8000000000000000);
6196 expDiff
= aExp
- bExp
;
6198 if ( expDiff
< 0 ) {
6199 if ( expDiff
< -1 ) return a
;
6200 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6203 q
= ( bSig
<= aSig0
);
6204 if ( q
) aSig0
-= bSig
;
6206 while ( 0 < expDiff
) {
6207 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6208 q
= ( 2 < q
) ? q
- 2 : 0;
6209 mul64To128( bSig
, q
, &term0
, &term1
);
6210 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6211 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6215 if ( 0 < expDiff
) {
6216 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6217 q
= ( 2 < q
) ? q
- 2 : 0;
6219 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6220 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6221 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6222 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6224 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6231 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6232 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6233 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6236 aSig0
= alternateASig0
;
6237 aSig1
= alternateASig1
;
6241 normalizeRoundAndPackFloatx80(
6242 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6246 /*----------------------------------------------------------------------------
6247 | Returns the square root of the extended double-precision floating-point
6248 | value `a'. The operation is performed according to the IEC/IEEE Standard
6249 | for Binary Floating-Point Arithmetic.
6250 *----------------------------------------------------------------------------*/
6252 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6256 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6257 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6259 if (floatx80_invalid_encoding(a
)) {
6260 float_raise(float_flag_invalid
, status
);
6261 return floatx80_default_nan(status
);
6263 aSig0
= extractFloatx80Frac( a
);
6264 aExp
= extractFloatx80Exp( a
);
6265 aSign
= extractFloatx80Sign( a
);
6266 if ( aExp
== 0x7FFF ) {
6267 if ((uint64_t)(aSig0
<< 1)) {
6268 return propagateFloatx80NaN(a
, a
, status
);
6270 if ( ! aSign
) return a
;
6274 if ( ( aExp
| aSig0
) == 0 ) return a
;
6276 float_raise(float_flag_invalid
, status
);
6277 return floatx80_default_nan(status
);
6280 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6281 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6283 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6284 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6285 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6286 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6287 doubleZSig0
= zSig0
<<1;
6288 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6289 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6290 while ( (int64_t) rem0
< 0 ) {
6293 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6295 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6296 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6297 if ( zSig1
== 0 ) zSig1
= 1;
6298 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6299 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6300 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6301 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6302 while ( (int64_t) rem1
< 0 ) {
6304 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6306 term2
|= doubleZSig0
;
6307 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6309 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6311 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6312 zSig0
|= doubleZSig0
;
6313 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6314 0, zExp
, zSig0
, zSig1
, status
);
6317 /*----------------------------------------------------------------------------
6318 | Returns 1 if the extended double-precision floating-point value `a' is equal
6319 | to the corresponding value `b', and 0 otherwise. The invalid exception is
6320 | raised if either operand is a NaN. Otherwise, the comparison is performed
6321 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6322 *----------------------------------------------------------------------------*/
6324 int floatx80_eq(floatx80 a
, floatx80 b
, float_status
*status
)
6327 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6328 || (extractFloatx80Exp(a
) == 0x7FFF
6329 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6330 || (extractFloatx80Exp(b
) == 0x7FFF
6331 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6333 float_raise(float_flag_invalid
, status
);
6338 && ( ( a
.high
== b
.high
)
6340 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6345 /*----------------------------------------------------------------------------
6346 | Returns 1 if the extended double-precision floating-point value `a' is
6347 | less than or equal to the corresponding value `b', and 0 otherwise. The
6348 | invalid exception is raised if either operand is a NaN. The comparison is
6349 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6351 *----------------------------------------------------------------------------*/
6353 int floatx80_le(floatx80 a
, floatx80 b
, float_status
*status
)
6357 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6358 || (extractFloatx80Exp(a
) == 0x7FFF
6359 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6360 || (extractFloatx80Exp(b
) == 0x7FFF
6361 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6363 float_raise(float_flag_invalid
, status
);
6366 aSign
= extractFloatx80Sign( a
);
6367 bSign
= extractFloatx80Sign( b
);
6368 if ( aSign
!= bSign
) {
6371 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6375 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6376 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6380 /*----------------------------------------------------------------------------
6381 | Returns 1 if the extended double-precision floating-point value `a' is
6382 | less than the corresponding value `b', and 0 otherwise. The invalid
6383 | exception is raised if either operand is a NaN. The comparison is performed
6384 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6385 *----------------------------------------------------------------------------*/
6387 int floatx80_lt(floatx80 a
, floatx80 b
, float_status
*status
)
6391 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6392 || (extractFloatx80Exp(a
) == 0x7FFF
6393 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6394 || (extractFloatx80Exp(b
) == 0x7FFF
6395 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6397 float_raise(float_flag_invalid
, status
);
6400 aSign
= extractFloatx80Sign( a
);
6401 bSign
= extractFloatx80Sign( b
);
6402 if ( aSign
!= bSign
) {
6405 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6409 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6410 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6414 /*----------------------------------------------------------------------------
6415 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6416 | cannot be compared, and 0 otherwise. The invalid exception is raised if
6417 | either operand is a NaN. The comparison is performed according to the
6418 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6419 *----------------------------------------------------------------------------*/
6420 int floatx80_unordered(floatx80 a
, floatx80 b
, float_status
*status
)
6422 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6423 || (extractFloatx80Exp(a
) == 0x7FFF
6424 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6425 || (extractFloatx80Exp(b
) == 0x7FFF
6426 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6428 float_raise(float_flag_invalid
, status
);
6434 /*----------------------------------------------------------------------------
6435 | Returns 1 if the extended double-precision floating-point value `a' is
6436 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6437 | cause an exception. The comparison is performed according to the IEC/IEEE
6438 | Standard for Binary Floating-Point Arithmetic.
6439 *----------------------------------------------------------------------------*/
6441 int floatx80_eq_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6444 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6445 float_raise(float_flag_invalid
, status
);
6448 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6449 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6450 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6451 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6453 if (floatx80_is_signaling_nan(a
, status
)
6454 || floatx80_is_signaling_nan(b
, status
)) {
6455 float_raise(float_flag_invalid
, status
);
6461 && ( ( a
.high
== b
.high
)
6463 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6468 /*----------------------------------------------------------------------------
6469 | Returns 1 if the extended double-precision floating-point value `a' is less
6470 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
6471 | do not cause an exception. Otherwise, the comparison is performed according
6472 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6473 *----------------------------------------------------------------------------*/
6475 int floatx80_le_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6479 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6480 float_raise(float_flag_invalid
, status
);
6483 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6484 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6485 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6486 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6488 if (floatx80_is_signaling_nan(a
, status
)
6489 || floatx80_is_signaling_nan(b
, status
)) {
6490 float_raise(float_flag_invalid
, status
);
6494 aSign
= extractFloatx80Sign( a
);
6495 bSign
= extractFloatx80Sign( b
);
6496 if ( aSign
!= bSign
) {
6499 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6503 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6504 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6508 /*----------------------------------------------------------------------------
6509 | Returns 1 if the extended double-precision floating-point value `a' is less
6510 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
6511 | an exception. Otherwise, the comparison is performed according to the
6512 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6513 *----------------------------------------------------------------------------*/
6515 int floatx80_lt_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6519 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6520 float_raise(float_flag_invalid
, status
);
6523 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6524 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6525 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6526 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6528 if (floatx80_is_signaling_nan(a
, status
)
6529 || floatx80_is_signaling_nan(b
, status
)) {
6530 float_raise(float_flag_invalid
, status
);
6534 aSign
= extractFloatx80Sign( a
);
6535 bSign
= extractFloatx80Sign( b
);
6536 if ( aSign
!= bSign
) {
6539 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6543 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6544 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6548 /*----------------------------------------------------------------------------
6549 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6550 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
6551 | The comparison is performed according to the IEC/IEEE Standard for Binary
6552 | Floating-Point Arithmetic.
6553 *----------------------------------------------------------------------------*/
6554 int floatx80_unordered_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6556 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6557 float_raise(float_flag_invalid
, status
);
6560 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6561 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6562 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6563 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6565 if (floatx80_is_signaling_nan(a
, status
)
6566 || floatx80_is_signaling_nan(b
, status
)) {
6567 float_raise(float_flag_invalid
, status
);
6574 /*----------------------------------------------------------------------------
6575 | Returns the result of converting the quadruple-precision floating-point
6576 | value `a' to the 32-bit two's complement integer format. The conversion
6577 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6578 | Arithmetic---which means in particular that the conversion is rounded
6579 | according to the current rounding mode. If `a' is a NaN, the largest
6580 | positive integer is returned. Otherwise, if the conversion overflows, the
6581 | largest integer with the same sign as `a' is returned.
6582 *----------------------------------------------------------------------------*/
6584 int32_t float128_to_int32(float128 a
, float_status
*status
)
6587 int32_t aExp
, shiftCount
;
6588 uint64_t aSig0
, aSig1
;
6590 aSig1
= extractFloat128Frac1( a
);
6591 aSig0
= extractFloat128Frac0( a
);
6592 aExp
= extractFloat128Exp( a
);
6593 aSign
= extractFloat128Sign( a
);
6594 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6595 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6596 aSig0
|= ( aSig1
!= 0 );
6597 shiftCount
= 0x4028 - aExp
;
6598 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6599 return roundAndPackInt32(aSign
, aSig0
, status
);
6603 /*----------------------------------------------------------------------------
6604 | Returns the result of converting the quadruple-precision floating-point
6605 | value `a' to the 32-bit two's complement integer format. The conversion
6606 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6607 | Arithmetic, except that the conversion is always rounded toward zero. If
6608 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6609 | conversion overflows, the largest integer with the same sign as `a' is
6611 *----------------------------------------------------------------------------*/
6613 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6616 int32_t aExp
, shiftCount
;
6617 uint64_t aSig0
, aSig1
, savedASig
;
6620 aSig1
= extractFloat128Frac1( a
);
6621 aSig0
= extractFloat128Frac0( a
);
6622 aExp
= extractFloat128Exp( a
);
6623 aSign
= extractFloat128Sign( a
);
6624 aSig0
|= ( aSig1
!= 0 );
6625 if ( 0x401E < aExp
) {
6626 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6629 else if ( aExp
< 0x3FFF ) {
6630 if (aExp
|| aSig0
) {
6631 status
->float_exception_flags
|= float_flag_inexact
;
6635 aSig0
|= UINT64_C(0x0001000000000000);
6636 shiftCount
= 0x402F - aExp
;
6638 aSig0
>>= shiftCount
;
6640 if ( aSign
) z
= - z
;
6641 if ( ( z
< 0 ) ^ aSign
) {
6643 float_raise(float_flag_invalid
, status
);
6644 return aSign
? INT32_MIN
: INT32_MAX
;
6646 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6647 status
->float_exception_flags
|= float_flag_inexact
;
6653 /*----------------------------------------------------------------------------
6654 | Returns the result of converting the quadruple-precision floating-point
6655 | value `a' to the 64-bit two's complement integer format. The conversion
6656 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6657 | Arithmetic---which means in particular that the conversion is rounded
6658 | according to the current rounding mode. If `a' is a NaN, the largest
6659 | positive integer is returned. Otherwise, if the conversion overflows, the
6660 | largest integer with the same sign as `a' is returned.
6661 *----------------------------------------------------------------------------*/
6663 int64_t float128_to_int64(float128 a
, float_status
*status
)
6666 int32_t aExp
, shiftCount
;
6667 uint64_t aSig0
, aSig1
;
6669 aSig1
= extractFloat128Frac1( a
);
6670 aSig0
= extractFloat128Frac0( a
);
6671 aExp
= extractFloat128Exp( a
);
6672 aSign
= extractFloat128Sign( a
);
6673 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6674 shiftCount
= 0x402F - aExp
;
6675 if ( shiftCount
<= 0 ) {
6676 if ( 0x403E < aExp
) {
6677 float_raise(float_flag_invalid
, status
);
6679 || ( ( aExp
== 0x7FFF )
6680 && ( aSig1
|| ( aSig0
!= UINT64_C(0x0001000000000000) ) )
6687 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6690 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6692 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6696 /*----------------------------------------------------------------------------
6697 | Returns the result of converting the quadruple-precision floating-point
6698 | value `a' to the 64-bit two's complement integer format. The conversion
6699 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6700 | Arithmetic, except that the conversion is always rounded toward zero.
6701 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6702 | the conversion overflows, the largest integer with the same sign as `a' is
6704 *----------------------------------------------------------------------------*/
6706 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6709 int32_t aExp
, shiftCount
;
6710 uint64_t aSig0
, aSig1
;
6713 aSig1
= extractFloat128Frac1( a
);
6714 aSig0
= extractFloat128Frac0( a
);
6715 aExp
= extractFloat128Exp( a
);
6716 aSign
= extractFloat128Sign( a
);
6717 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6718 shiftCount
= aExp
- 0x402F;
6719 if ( 0 < shiftCount
) {
6720 if ( 0x403E <= aExp
) {
6721 aSig0
&= UINT64_C(0x0000FFFFFFFFFFFF);
6722 if ( ( a
.high
== UINT64_C(0xC03E000000000000) )
6723 && ( aSig1
< UINT64_C(0x0002000000000000) ) ) {
6725 status
->float_exception_flags
|= float_flag_inexact
;
6729 float_raise(float_flag_invalid
, status
);
6730 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6736 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6737 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6738 status
->float_exception_flags
|= float_flag_inexact
;
6742 if ( aExp
< 0x3FFF ) {
6743 if ( aExp
| aSig0
| aSig1
) {
6744 status
->float_exception_flags
|= float_flag_inexact
;
6748 z
= aSig0
>>( - shiftCount
);
6750 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6751 status
->float_exception_flags
|= float_flag_inexact
;
6754 if ( aSign
) z
= - z
;
6759 /*----------------------------------------------------------------------------
6760 | Returns the result of converting the quadruple-precision floating-point value
6761 | `a' to the 64-bit unsigned integer format. The conversion is
6762 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6763 | Arithmetic---which means in particular that the conversion is rounded
6764 | according to the current rounding mode. If `a' is a NaN, the largest
6765 | positive integer is returned. If the conversion overflows, the
6766 | largest unsigned integer is returned. If 'a' is negative, the value is
6767 | rounded and zero is returned; negative values that do not round to zero
6768 | will raise the inexact exception.
6769 *----------------------------------------------------------------------------*/
6771 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6776 uint64_t aSig0
, aSig1
;
6778 aSig0
= extractFloat128Frac0(a
);
6779 aSig1
= extractFloat128Frac1(a
);
6780 aExp
= extractFloat128Exp(a
);
6781 aSign
= extractFloat128Sign(a
);
6782 if (aSign
&& (aExp
> 0x3FFE)) {
6783 float_raise(float_flag_invalid
, status
);
6784 if (float128_is_any_nan(a
)) {
6791 aSig0
|= UINT64_C(0x0001000000000000);
6793 shiftCount
= 0x402F - aExp
;
6794 if (shiftCount
<= 0) {
6795 if (0x403E < aExp
) {
6796 float_raise(float_flag_invalid
, status
);
6799 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6801 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6803 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6806 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6809 signed char current_rounding_mode
= status
->float_rounding_mode
;
6811 set_float_rounding_mode(float_round_to_zero
, status
);
6812 v
= float128_to_uint64(a
, status
);
6813 set_float_rounding_mode(current_rounding_mode
, status
);
6818 /*----------------------------------------------------------------------------
6819 | Returns the result of converting the quadruple-precision floating-point
6820 | value `a' to the 32-bit unsigned integer format. The conversion
6821 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6822 | Arithmetic except that the conversion is always rounded toward zero.
6823 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6824 | if the conversion overflows, the largest unsigned integer is returned.
6825 | If 'a' is negative, the value is rounded and zero is returned; negative
6826 | values that do not round to zero will raise the inexact exception.
6827 *----------------------------------------------------------------------------*/
6829 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6833 int old_exc_flags
= get_float_exception_flags(status
);
6835 v
= float128_to_uint64_round_to_zero(a
, status
);
6836 if (v
> 0xffffffff) {
6841 set_float_exception_flags(old_exc_flags
, status
);
6842 float_raise(float_flag_invalid
, status
);
6846 /*----------------------------------------------------------------------------
6847 | Returns the result of converting the quadruple-precision floating-point value
6848 | `a' to the 32-bit unsigned integer format. The conversion is
6849 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6850 | Arithmetic---which means in particular that the conversion is rounded
6851 | according to the current rounding mode. If `a' is a NaN, the largest
6852 | positive integer is returned. If the conversion overflows, the
6853 | largest unsigned integer is returned. If 'a' is negative, the value is
6854 | rounded and zero is returned; negative values that do not round to zero
6855 | will raise the inexact exception.
6856 *----------------------------------------------------------------------------*/
6858 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6862 int old_exc_flags
= get_float_exception_flags(status
);
6864 v
= float128_to_uint64(a
, status
);
6865 if (v
> 0xffffffff) {
6870 set_float_exception_flags(old_exc_flags
, status
);
6871 float_raise(float_flag_invalid
, status
);
6875 /*----------------------------------------------------------------------------
6876 | Returns the result of converting the quadruple-precision floating-point
6877 | value `a' to the single-precision floating-point format. The conversion
6878 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6880 *----------------------------------------------------------------------------*/
6882 float32
float128_to_float32(float128 a
, float_status
*status
)
6886 uint64_t aSig0
, aSig1
;
6889 aSig1
= extractFloat128Frac1( a
);
6890 aSig0
= extractFloat128Frac0( a
);
6891 aExp
= extractFloat128Exp( a
);
6892 aSign
= extractFloat128Sign( a
);
6893 if ( aExp
== 0x7FFF ) {
6894 if ( aSig0
| aSig1
) {
6895 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6897 return packFloat32( aSign
, 0xFF, 0 );
6899 aSig0
|= ( aSig1
!= 0 );
6900 shift64RightJamming( aSig0
, 18, &aSig0
);
6902 if ( aExp
|| zSig
) {
6906 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6910 /*----------------------------------------------------------------------------
6911 | Returns the result of converting the quadruple-precision floating-point
6912 | value `a' to the double-precision floating-point format. The conversion
6913 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6915 *----------------------------------------------------------------------------*/
6917 float64
float128_to_float64(float128 a
, float_status
*status
)
6921 uint64_t aSig0
, aSig1
;
6923 aSig1
= extractFloat128Frac1( a
);
6924 aSig0
= extractFloat128Frac0( a
);
6925 aExp
= extractFloat128Exp( a
);
6926 aSign
= extractFloat128Sign( a
);
6927 if ( aExp
== 0x7FFF ) {
6928 if ( aSig0
| aSig1
) {
6929 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6931 return packFloat64( aSign
, 0x7FF, 0 );
6933 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6934 aSig0
|= ( aSig1
!= 0 );
6935 if ( aExp
|| aSig0
) {
6936 aSig0
|= UINT64_C(0x4000000000000000);
6939 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6943 /*----------------------------------------------------------------------------
6944 | Returns the result of converting the quadruple-precision floating-point
6945 | value `a' to the extended double-precision floating-point format. The
6946 | conversion is performed according to the IEC/IEEE Standard for Binary
6947 | Floating-Point Arithmetic.
6948 *----------------------------------------------------------------------------*/
6950 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6954 uint64_t aSig0
, aSig1
;
6956 aSig1
= extractFloat128Frac1( a
);
6957 aSig0
= extractFloat128Frac0( a
);
6958 aExp
= extractFloat128Exp( a
);
6959 aSign
= extractFloat128Sign( a
);
6960 if ( aExp
== 0x7FFF ) {
6961 if ( aSig0
| aSig1
) {
6962 return commonNaNToFloatx80(float128ToCommonNaN(a
, status
), status
);
6964 return packFloatx80(aSign
, floatx80_infinity_high
,
6965 floatx80_infinity_low
);
6968 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6969 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6972 aSig0
|= UINT64_C(0x0001000000000000);
6974 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6975 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6979 /*----------------------------------------------------------------------------
6980 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6981 | returns the result as a quadruple-precision floating-point value. The
6982 | operation is performed according to the IEC/IEEE Standard for Binary
6983 | Floating-Point Arithmetic.
6984 *----------------------------------------------------------------------------*/
6986 float128
float128_round_to_int(float128 a
, float_status
*status
)
6990 uint64_t lastBitMask
, roundBitsMask
;
6993 aExp
= extractFloat128Exp( a
);
6994 if ( 0x402F <= aExp
) {
6995 if ( 0x406F <= aExp
) {
6996 if ( ( aExp
== 0x7FFF )
6997 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6999 return propagateFloat128NaN(a
, a
, status
);
7004 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
7005 roundBitsMask
= lastBitMask
- 1;
7007 switch (status
->float_rounding_mode
) {
7008 case float_round_nearest_even
:
7009 if ( lastBitMask
) {
7010 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
7011 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
7014 if ( (int64_t) z
.low
< 0 ) {
7016 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
7020 case float_round_ties_away
:
7022 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
7024 if ((int64_t) z
.low
< 0) {
7029 case float_round_to_zero
:
7031 case float_round_up
:
7032 if (!extractFloat128Sign(z
)) {
7033 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
7036 case float_round_down
:
7037 if (extractFloat128Sign(z
)) {
7038 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
7041 case float_round_to_odd
:
7043 * Note that if lastBitMask == 0, the last bit is the lsb
7044 * of high, and roundBitsMask == -1.
7046 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
7047 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
7053 z
.low
&= ~ roundBitsMask
;
7056 if ( aExp
< 0x3FFF ) {
7057 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
7058 status
->float_exception_flags
|= float_flag_inexact
;
7059 aSign
= extractFloat128Sign( a
);
7060 switch (status
->float_rounding_mode
) {
7061 case float_round_nearest_even
:
7062 if ( ( aExp
== 0x3FFE )
7063 && ( extractFloat128Frac0( a
)
7064 | extractFloat128Frac1( a
) )
7066 return packFloat128( aSign
, 0x3FFF, 0, 0 );
7069 case float_round_ties_away
:
7070 if (aExp
== 0x3FFE) {
7071 return packFloat128(aSign
, 0x3FFF, 0, 0);
7074 case float_round_down
:
7076 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
7077 : packFloat128( 0, 0, 0, 0 );
7078 case float_round_up
:
7080 aSign
? packFloat128( 1, 0, 0, 0 )
7081 : packFloat128( 0, 0x3FFF, 0, 0 );
7083 case float_round_to_odd
:
7084 return packFloat128(aSign
, 0x3FFF, 0, 0);
7086 return packFloat128( aSign
, 0, 0, 0 );
7089 lastBitMask
<<= 0x402F - aExp
;
7090 roundBitsMask
= lastBitMask
- 1;
7093 switch (status
->float_rounding_mode
) {
7094 case float_round_nearest_even
:
7095 z
.high
+= lastBitMask
>>1;
7096 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
7097 z
.high
&= ~ lastBitMask
;
7100 case float_round_ties_away
:
7101 z
.high
+= lastBitMask
>>1;
7103 case float_round_to_zero
:
7105 case float_round_up
:
7106 if (!extractFloat128Sign(z
)) {
7107 z
.high
|= ( a
.low
!= 0 );
7108 z
.high
+= roundBitsMask
;
7111 case float_round_down
:
7112 if (extractFloat128Sign(z
)) {
7113 z
.high
|= (a
.low
!= 0);
7114 z
.high
+= roundBitsMask
;
7117 case float_round_to_odd
:
7118 if ((z
.high
& lastBitMask
) == 0) {
7119 z
.high
|= (a
.low
!= 0);
7120 z
.high
+= roundBitsMask
;
7126 z
.high
&= ~ roundBitsMask
;
7128 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
7129 status
->float_exception_flags
|= float_flag_inexact
;
7135 /*----------------------------------------------------------------------------
7136 | Returns the result of adding the absolute values of the quadruple-precision
7137 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
7138 | before being returned. `zSign' is ignored if the result is a NaN.
7139 | The addition is performed according to the IEC/IEEE Standard for Binary
7140 | Floating-Point Arithmetic.
7141 *----------------------------------------------------------------------------*/
7143 static float128
addFloat128Sigs(float128 a
, float128 b
, flag zSign
,
7144 float_status
*status
)
7146 int32_t aExp
, bExp
, zExp
;
7147 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7150 aSig1
= extractFloat128Frac1( a
);
7151 aSig0
= extractFloat128Frac0( a
);
7152 aExp
= extractFloat128Exp( a
);
7153 bSig1
= extractFloat128Frac1( b
);
7154 bSig0
= extractFloat128Frac0( b
);
7155 bExp
= extractFloat128Exp( b
);
7156 expDiff
= aExp
- bExp
;
7157 if ( 0 < expDiff
) {
7158 if ( aExp
== 0x7FFF ) {
7159 if (aSig0
| aSig1
) {
7160 return propagateFloat128NaN(a
, b
, status
);
7168 bSig0
|= UINT64_C(0x0001000000000000);
7170 shift128ExtraRightJamming(
7171 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
7174 else if ( expDiff
< 0 ) {
7175 if ( bExp
== 0x7FFF ) {
7176 if (bSig0
| bSig1
) {
7177 return propagateFloat128NaN(a
, b
, status
);
7179 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7185 aSig0
|= UINT64_C(0x0001000000000000);
7187 shift128ExtraRightJamming(
7188 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
7192 if ( aExp
== 0x7FFF ) {
7193 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7194 return propagateFloat128NaN(a
, b
, status
);
7198 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7200 if (status
->flush_to_zero
) {
7201 if (zSig0
| zSig1
) {
7202 float_raise(float_flag_output_denormal
, status
);
7204 return packFloat128(zSign
, 0, 0, 0);
7206 return packFloat128( zSign
, 0, zSig0
, zSig1
);
7209 zSig0
|= UINT64_C(0x0002000000000000);
7213 aSig0
|= UINT64_C(0x0001000000000000);
7214 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7216 if ( zSig0
< UINT64_C(0x0002000000000000) ) goto roundAndPack
;
7219 shift128ExtraRightJamming(
7220 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7222 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7226 /*----------------------------------------------------------------------------
7227 | Returns the result of subtracting the absolute values of the quadruple-
7228 | precision floating-point values `a' and `b'. If `zSign' is 1, the
7229 | difference is negated before being returned. `zSign' is ignored if the
7230 | result is a NaN. The subtraction is performed according to the IEC/IEEE
7231 | Standard for Binary Floating-Point Arithmetic.
7232 *----------------------------------------------------------------------------*/
7234 static float128
subFloat128Sigs(float128 a
, float128 b
, flag zSign
,
7235 float_status
*status
)
7237 int32_t aExp
, bExp
, zExp
;
7238 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
7241 aSig1
= extractFloat128Frac1( a
);
7242 aSig0
= extractFloat128Frac0( a
);
7243 aExp
= extractFloat128Exp( a
);
7244 bSig1
= extractFloat128Frac1( b
);
7245 bSig0
= extractFloat128Frac0( b
);
7246 bExp
= extractFloat128Exp( b
);
7247 expDiff
= aExp
- bExp
;
7248 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
7249 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
7250 if ( 0 < expDiff
) goto aExpBigger
;
7251 if ( expDiff
< 0 ) goto bExpBigger
;
7252 if ( aExp
== 0x7FFF ) {
7253 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7254 return propagateFloat128NaN(a
, b
, status
);
7256 float_raise(float_flag_invalid
, status
);
7257 return float128_default_nan(status
);
7263 if ( bSig0
< aSig0
) goto aBigger
;
7264 if ( aSig0
< bSig0
) goto bBigger
;
7265 if ( bSig1
< aSig1
) goto aBigger
;
7266 if ( aSig1
< bSig1
) goto bBigger
;
7267 return packFloat128(status
->float_rounding_mode
== float_round_down
,
7270 if ( bExp
== 0x7FFF ) {
7271 if (bSig0
| bSig1
) {
7272 return propagateFloat128NaN(a
, b
, status
);
7274 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
7280 aSig0
|= UINT64_C(0x4000000000000000);
7282 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7283 bSig0
|= UINT64_C(0x4000000000000000);
7285 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7288 goto normalizeRoundAndPack
;
7290 if ( aExp
== 0x7FFF ) {
7291 if (aSig0
| aSig1
) {
7292 return propagateFloat128NaN(a
, b
, status
);
7300 bSig0
|= UINT64_C(0x4000000000000000);
7302 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
7303 aSig0
|= UINT64_C(0x4000000000000000);
7305 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7307 normalizeRoundAndPack
:
7309 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
7314 /*----------------------------------------------------------------------------
7315 | Returns the result of adding the quadruple-precision floating-point values
7316 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7317 | for Binary Floating-Point Arithmetic.
7318 *----------------------------------------------------------------------------*/
7320 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
7324 aSign
= extractFloat128Sign( a
);
7325 bSign
= extractFloat128Sign( b
);
7326 if ( aSign
== bSign
) {
7327 return addFloat128Sigs(a
, b
, aSign
, status
);
7330 return subFloat128Sigs(a
, b
, aSign
, status
);
7335 /*----------------------------------------------------------------------------
7336 | Returns the result of subtracting the quadruple-precision floating-point
7337 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7338 | Standard for Binary Floating-Point Arithmetic.
7339 *----------------------------------------------------------------------------*/
7341 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
7345 aSign
= extractFloat128Sign( a
);
7346 bSign
= extractFloat128Sign( b
);
7347 if ( aSign
== bSign
) {
7348 return subFloat128Sigs(a
, b
, aSign
, status
);
7351 return addFloat128Sigs(a
, b
, aSign
, status
);
7356 /*----------------------------------------------------------------------------
7357 | Returns the result of multiplying the quadruple-precision floating-point
7358 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7359 | Standard for Binary Floating-Point Arithmetic.
7360 *----------------------------------------------------------------------------*/
7362 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
7364 flag aSign
, bSign
, zSign
;
7365 int32_t aExp
, bExp
, zExp
;
7366 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
7368 aSig1
= extractFloat128Frac1( a
);
7369 aSig0
= extractFloat128Frac0( a
);
7370 aExp
= extractFloat128Exp( a
);
7371 aSign
= extractFloat128Sign( a
);
7372 bSig1
= extractFloat128Frac1( b
);
7373 bSig0
= extractFloat128Frac0( b
);
7374 bExp
= extractFloat128Exp( b
);
7375 bSign
= extractFloat128Sign( b
);
7376 zSign
= aSign
^ bSign
;
7377 if ( aExp
== 0x7FFF ) {
7378 if ( ( aSig0
| aSig1
)
7379 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7380 return propagateFloat128NaN(a
, b
, status
);
7382 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
7383 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7385 if ( bExp
== 0x7FFF ) {
7386 if (bSig0
| bSig1
) {
7387 return propagateFloat128NaN(a
, b
, status
);
7389 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7391 float_raise(float_flag_invalid
, status
);
7392 return float128_default_nan(status
);
7394 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7397 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7398 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7401 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7402 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7404 zExp
= aExp
+ bExp
- 0x4000;
7405 aSig0
|= UINT64_C(0x0001000000000000);
7406 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
7407 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
7408 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7409 zSig2
|= ( zSig3
!= 0 );
7410 if (UINT64_C( 0x0002000000000000) <= zSig0
) {
7411 shift128ExtraRightJamming(
7412 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7415 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7419 /*----------------------------------------------------------------------------
7420 | Returns the result of dividing the quadruple-precision floating-point value
7421 | `a' by the corresponding value `b'. The operation is performed according to
7422 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7423 *----------------------------------------------------------------------------*/
7425 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7427 flag aSign
, bSign
, zSign
;
7428 int32_t aExp
, bExp
, zExp
;
7429 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7430 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7432 aSig1
= extractFloat128Frac1( a
);
7433 aSig0
= extractFloat128Frac0( a
);
7434 aExp
= extractFloat128Exp( a
);
7435 aSign
= extractFloat128Sign( a
);
7436 bSig1
= extractFloat128Frac1( b
);
7437 bSig0
= extractFloat128Frac0( b
);
7438 bExp
= extractFloat128Exp( b
);
7439 bSign
= extractFloat128Sign( b
);
7440 zSign
= aSign
^ bSign
;
7441 if ( aExp
== 0x7FFF ) {
7442 if (aSig0
| aSig1
) {
7443 return propagateFloat128NaN(a
, b
, status
);
7445 if ( bExp
== 0x7FFF ) {
7446 if (bSig0
| bSig1
) {
7447 return propagateFloat128NaN(a
, b
, status
);
7451 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7453 if ( bExp
== 0x7FFF ) {
7454 if (bSig0
| bSig1
) {
7455 return propagateFloat128NaN(a
, b
, status
);
7457 return packFloat128( zSign
, 0, 0, 0 );
7460 if ( ( bSig0
| bSig1
) == 0 ) {
7461 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7463 float_raise(float_flag_invalid
, status
);
7464 return float128_default_nan(status
);
7466 float_raise(float_flag_divbyzero
, status
);
7467 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7469 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7472 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7473 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7475 zExp
= aExp
- bExp
+ 0x3FFD;
7477 aSig0
| UINT64_C(0x0001000000000000), aSig1
, 15, &aSig0
, &aSig1
);
7479 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7480 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7481 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7484 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7485 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7486 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7487 while ( (int64_t) rem0
< 0 ) {
7489 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7491 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7492 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7493 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7494 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7495 while ( (int64_t) rem1
< 0 ) {
7497 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7499 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7501 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7502 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7506 /*----------------------------------------------------------------------------
7507 | Returns the remainder of the quadruple-precision floating-point value `a'
7508 | with respect to the corresponding value `b'. The operation is performed
7509 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7510 *----------------------------------------------------------------------------*/
7512 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7515 int32_t aExp
, bExp
, expDiff
;
7516 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7517 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7520 aSig1
= extractFloat128Frac1( a
);
7521 aSig0
= extractFloat128Frac0( a
);
7522 aExp
= extractFloat128Exp( a
);
7523 aSign
= extractFloat128Sign( a
);
7524 bSig1
= extractFloat128Frac1( b
);
7525 bSig0
= extractFloat128Frac0( b
);
7526 bExp
= extractFloat128Exp( b
);
7527 if ( aExp
== 0x7FFF ) {
7528 if ( ( aSig0
| aSig1
)
7529 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7530 return propagateFloat128NaN(a
, b
, status
);
7534 if ( bExp
== 0x7FFF ) {
7535 if (bSig0
| bSig1
) {
7536 return propagateFloat128NaN(a
, b
, status
);
7541 if ( ( bSig0
| bSig1
) == 0 ) {
7543 float_raise(float_flag_invalid
, status
);
7544 return float128_default_nan(status
);
7546 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7549 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7550 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7552 expDiff
= aExp
- bExp
;
7553 if ( expDiff
< -1 ) return a
;
7555 aSig0
| UINT64_C(0x0001000000000000),
7557 15 - ( expDiff
< 0 ),
7562 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7563 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7564 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7566 while ( 0 < expDiff
) {
7567 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7568 q
= ( 4 < q
) ? q
- 4 : 0;
7569 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7570 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7571 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7572 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7575 if ( -64 < expDiff
) {
7576 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7577 q
= ( 4 < q
) ? q
- 4 : 0;
7579 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7581 if ( expDiff
< 0 ) {
7582 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7585 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7587 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7588 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7591 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7592 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7595 alternateASig0
= aSig0
;
7596 alternateASig1
= aSig1
;
7598 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7599 } while ( 0 <= (int64_t) aSig0
);
7601 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7602 if ( ( sigMean0
< 0 )
7603 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7604 aSig0
= alternateASig0
;
7605 aSig1
= alternateASig1
;
7607 zSign
= ( (int64_t) aSig0
< 0 );
7608 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7609 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7613 /*----------------------------------------------------------------------------
7614 | Returns the square root of the quadruple-precision floating-point value `a'.
7615 | The operation is performed according to the IEC/IEEE Standard for Binary
7616 | Floating-Point Arithmetic.
7617 *----------------------------------------------------------------------------*/
7619 float128
float128_sqrt(float128 a
, float_status
*status
)
7623 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7624 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7626 aSig1
= extractFloat128Frac1( a
);
7627 aSig0
= extractFloat128Frac0( a
);
7628 aExp
= extractFloat128Exp( a
);
7629 aSign
= extractFloat128Sign( a
);
7630 if ( aExp
== 0x7FFF ) {
7631 if (aSig0
| aSig1
) {
7632 return propagateFloat128NaN(a
, a
, status
);
7634 if ( ! aSign
) return a
;
7638 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7640 float_raise(float_flag_invalid
, status
);
7641 return float128_default_nan(status
);
7644 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7645 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7647 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7648 aSig0
|= UINT64_C(0x0001000000000000);
7649 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7650 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7651 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7652 doubleZSig0
= zSig0
<<1;
7653 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7654 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7655 while ( (int64_t) rem0
< 0 ) {
7658 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7660 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7661 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7662 if ( zSig1
== 0 ) zSig1
= 1;
7663 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7664 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7665 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7666 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7667 while ( (int64_t) rem1
< 0 ) {
7669 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7671 term2
|= doubleZSig0
;
7672 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7674 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7676 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7677 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7681 /*----------------------------------------------------------------------------
7682 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7683 | the corresponding value `b', and 0 otherwise. The invalid exception is
7684 | raised if either operand is a NaN. Otherwise, the comparison is performed
7685 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7686 *----------------------------------------------------------------------------*/
7688 int float128_eq(float128 a
, float128 b
, float_status
*status
)
7691 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7692 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7693 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7694 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7696 float_raise(float_flag_invalid
, status
);
7701 && ( ( a
.high
== b
.high
)
7703 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
7708 /*----------------------------------------------------------------------------
7709 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7710 | or equal to the corresponding value `b', and 0 otherwise. The invalid
7711 | exception is raised if either operand is a NaN. The comparison is performed
7712 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7713 *----------------------------------------------------------------------------*/
7715 int float128_le(float128 a
, float128 b
, float_status
*status
)
7719 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7720 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7721 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7722 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7724 float_raise(float_flag_invalid
, status
);
7727 aSign
= extractFloat128Sign( a
);
7728 bSign
= extractFloat128Sign( b
);
7729 if ( aSign
!= bSign
) {
7732 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7736 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
7737 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
7741 /*----------------------------------------------------------------------------
7742 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7743 | the corresponding value `b', and 0 otherwise. The invalid exception is
7744 | raised if either operand is a NaN. The comparison is performed according
7745 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7746 *----------------------------------------------------------------------------*/
7748 int float128_lt(float128 a
, float128 b
, float_status
*status
)
7752 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7753 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7754 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7755 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7757 float_raise(float_flag_invalid
, status
);
7760 aSign
= extractFloat128Sign( a
);
7761 bSign
= extractFloat128Sign( b
);
7762 if ( aSign
!= bSign
) {
7765 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7769 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
7770 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
7774 /*----------------------------------------------------------------------------
7775 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7776 | be compared, and 0 otherwise. The invalid exception is raised if either
7777 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7778 | Standard for Binary Floating-Point Arithmetic.
7779 *----------------------------------------------------------------------------*/
7781 int float128_unordered(float128 a
, float128 b
, float_status
*status
)
7783 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7784 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7785 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7786 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7788 float_raise(float_flag_invalid
, status
);
7794 /*----------------------------------------------------------------------------
7795 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7796 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7797 | exception. The comparison is performed according to the IEC/IEEE Standard
7798 | for Binary Floating-Point Arithmetic.
7799 *----------------------------------------------------------------------------*/
7801 int float128_eq_quiet(float128 a
, float128 b
, float_status
*status
)
7804 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7805 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7806 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7807 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7809 if (float128_is_signaling_nan(a
, status
)
7810 || float128_is_signaling_nan(b
, status
)) {
7811 float_raise(float_flag_invalid
, status
);
7817 && ( ( a
.high
== b
.high
)
7819 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
7824 /*----------------------------------------------------------------------------
7825 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7826 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
7827 | cause an exception. Otherwise, the comparison is performed according to the
7828 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7829 *----------------------------------------------------------------------------*/
7831 int float128_le_quiet(float128 a
, float128 b
, float_status
*status
)
7835 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7836 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7837 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7838 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7840 if (float128_is_signaling_nan(a
, status
)
7841 || float128_is_signaling_nan(b
, status
)) {
7842 float_raise(float_flag_invalid
, status
);
7846 aSign
= extractFloat128Sign( a
);
7847 bSign
= extractFloat128Sign( b
);
7848 if ( aSign
!= bSign
) {
7851 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7855 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
7856 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
7860 /*----------------------------------------------------------------------------
7861 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7862 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7863 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
7864 | Standard for Binary Floating-Point Arithmetic.
7865 *----------------------------------------------------------------------------*/
7867 int float128_lt_quiet(float128 a
, float128 b
, float_status
*status
)
7871 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7872 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7873 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7874 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7876 if (float128_is_signaling_nan(a
, status
)
7877 || float128_is_signaling_nan(b
, status
)) {
7878 float_raise(float_flag_invalid
, status
);
7882 aSign
= extractFloat128Sign( a
);
7883 bSign
= extractFloat128Sign( b
);
7884 if ( aSign
!= bSign
) {
7887 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7891 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
7892 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
7896 /*----------------------------------------------------------------------------
7897 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7898 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7899 | comparison is performed according to the IEC/IEEE Standard for Binary
7900 | Floating-Point Arithmetic.
7901 *----------------------------------------------------------------------------*/
7903 int float128_unordered_quiet(float128 a
, float128 b
, float_status
*status
)
7905 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7906 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7907 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7908 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7910 if (float128_is_signaling_nan(a
, status
)
7911 || float128_is_signaling_nan(b
, status
)) {
7912 float_raise(float_flag_invalid
, status
);
7919 static inline int floatx80_compare_internal(floatx80 a
, floatx80 b
,
7920 int is_quiet
, float_status
*status
)
7924 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7925 float_raise(float_flag_invalid
, status
);
7926 return float_relation_unordered
;
7928 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7929 ( extractFloatx80Frac( a
)<<1 ) ) ||
7930 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7931 ( extractFloatx80Frac( b
)<<1 ) )) {
7933 floatx80_is_signaling_nan(a
, status
) ||
7934 floatx80_is_signaling_nan(b
, status
)) {
7935 float_raise(float_flag_invalid
, status
);
7937 return float_relation_unordered
;
7939 aSign
= extractFloatx80Sign( a
);
7940 bSign
= extractFloatx80Sign( b
);
7941 if ( aSign
!= bSign
) {
7943 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7944 ( ( a
.low
| b
.low
) == 0 ) ) {
7946 return float_relation_equal
;
7948 return 1 - (2 * aSign
);
7951 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7952 return float_relation_equal
;
7954 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7959 int floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7961 return floatx80_compare_internal(a
, b
, 0, status
);
7964 int floatx80_compare_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
7966 return floatx80_compare_internal(a
, b
, 1, status
);
7969 static inline int float128_compare_internal(float128 a
, float128 b
,
7970 int is_quiet
, float_status
*status
)
7974 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7975 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7976 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7977 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7979 float128_is_signaling_nan(a
, status
) ||
7980 float128_is_signaling_nan(b
, status
)) {
7981 float_raise(float_flag_invalid
, status
);
7983 return float_relation_unordered
;
7985 aSign
= extractFloat128Sign( a
);
7986 bSign
= extractFloat128Sign( b
);
7987 if ( aSign
!= bSign
) {
7988 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7990 return float_relation_equal
;
7992 return 1 - (2 * aSign
);
7995 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7996 return float_relation_equal
;
7998 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
8003 int float128_compare(float128 a
, float128 b
, float_status
*status
)
8005 return float128_compare_internal(a
, b
, 0, status
);
8008 int float128_compare_quiet(float128 a
, float128 b
, float_status
*status
)
8010 return float128_compare_internal(a
, b
, 1, status
);
8013 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
8019 if (floatx80_invalid_encoding(a
)) {
8020 float_raise(float_flag_invalid
, status
);
8021 return floatx80_default_nan(status
);
8023 aSig
= extractFloatx80Frac( a
);
8024 aExp
= extractFloatx80Exp( a
);
8025 aSign
= extractFloatx80Sign( a
);
8027 if ( aExp
== 0x7FFF ) {
8029 return propagateFloatx80NaN(a
, a
, status
);
8043 } else if (n
< -0x10000) {
8048 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
8049 aSign
, aExp
, aSig
, 0, status
);
8052 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
8056 uint64_t aSig0
, aSig1
;
8058 aSig1
= extractFloat128Frac1( a
);
8059 aSig0
= extractFloat128Frac0( a
);
8060 aExp
= extractFloat128Exp( a
);
8061 aSign
= extractFloat128Sign( a
);
8062 if ( aExp
== 0x7FFF ) {
8063 if ( aSig0
| aSig1
) {
8064 return propagateFloat128NaN(a
, a
, status
);
8069 aSig0
|= UINT64_C(0x0001000000000000);
8070 } else if (aSig0
== 0 && aSig1
== 0) {
8078 } else if (n
< -0x10000) {
8083 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
8088 static void __attribute__((constructor
)) softfloat_init(void)
8090 union_float64 ua
, ub
, uc
, ur
;
8092 if (QEMU_NO_HARDFLOAT
) {
8096 * Test that the host's FMA is not obviously broken. For example,
8097 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
8098 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
8100 ua
.s
= 0x0020000000000001ULL
;
8101 ub
.s
= 0x3ca0000000000000ULL
;
8102 uc
.s
= 0x0020000000000000ULL
;
8103 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
8104 if (ur
.s
!= 0x0020000000000001ULL
) {
8105 force_soft_fma
= true;