4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 s->float_exception_flags |= float_flag_input_denormal; \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
215 # define QEMU_HARDFLOAT_USE_ISINF 0
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
345 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
346 f32_check_fn pre
, f32_check_fn post
,
347 f32_check_fn fast_test
, soft_f32_op2_fn fast_op
)
349 union_float32 ua
, ub
, ur
;
354 if (unlikely(!can_use_fpu(s
))) {
358 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
359 if (unlikely(!pre(ua
, ub
))) {
362 if (fast_test
&& fast_test(ua
, ub
)) {
363 return fast_op(ua
.s
, ub
.s
, s
);
366 ur
.h
= hard(ua
.h
, ub
.h
);
367 if (unlikely(f32_is_inf(ur
))) {
368 s
->float_exception_flags
|= float_flag_overflow
;
369 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
370 if (post
== NULL
|| post(ua
, ub
)) {
377 return soft(ua
.s
, ub
.s
, s
);
380 static inline float64
381 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
382 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
383 f64_check_fn pre
, f64_check_fn post
,
384 f64_check_fn fast_test
, soft_f64_op2_fn fast_op
)
386 union_float64 ua
, ub
, ur
;
391 if (unlikely(!can_use_fpu(s
))) {
395 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
396 if (unlikely(!pre(ua
, ub
))) {
399 if (fast_test
&& fast_test(ua
, ub
)) {
400 return fast_op(ua
.s
, ub
.s
, s
);
403 ur
.h
= hard(ua
.h
, ub
.h
);
404 if (unlikely(f64_is_inf(ur
))) {
405 s
->float_exception_flags
|= float_flag_overflow
;
406 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
)) {
407 if (post
== NULL
|| post(ua
, ub
)) {
414 return soft(ua
.s
, ub
.s
, s
);
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the single-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
421 static inline uint32_t extractFloat32Frac(float32 a
)
423 return float32_val(a
) & 0x007FFFFF;
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the single-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
430 static inline int extractFloat32Exp(float32 a
)
432 return (float32_val(a
) >> 23) & 0xFF;
435 /*----------------------------------------------------------------------------
436 | Returns the sign bit of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
439 static inline flag
extractFloat32Sign(float32 a
)
441 return float32_val(a
) >> 31;
444 /*----------------------------------------------------------------------------
445 | Returns the fraction bits of the double-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
448 static inline uint64_t extractFloat64Frac(float64 a
)
450 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
453 /*----------------------------------------------------------------------------
454 | Returns the exponent bits of the double-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
457 static inline int extractFloat64Exp(float64 a
)
459 return (float64_val(a
) >> 52) & 0x7FF;
462 /*----------------------------------------------------------------------------
463 | Returns the sign bit of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
466 static inline flag
extractFloat64Sign(float64 a
)
468 return float64_val(a
) >> 63;
472 * Classify a floating point number. Everything above float_class_qnan
473 * is a NaN so cls >= float_class_qnan is any NaN.
476 typedef enum __attribute__ ((__packed__
)) {
477 float_class_unclassified
,
481 float_class_qnan
, /* all NaNs from here */
485 /* Simple helpers for checking if, or what kind of, NaN we have */
486 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
488 return unlikely(c
>= float_class_qnan
);
491 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
493 return c
== float_class_snan
;
496 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
498 return c
== float_class_qnan
;
502 * Structure holding all of the decomposed parts of a float. The
503 * exponent is unbiased and the fraction is normalized. All
504 * calculations are done with a 64 bit fraction and then rounded as
505 * appropriate for the final format.
507 * Thanks to the packed FloatClass a decent compiler should be able to
508 * fit the whole structure into registers and avoid using the stack
509 * for parameter passing.
519 #define DECOMPOSED_BINARY_POINT (64 - 2)
520 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
521 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
523 /* Structure holding all of the relevant parameters for a format.
524 * exp_size: the size of the exponent field
525 * exp_bias: the offset applied to the exponent field
526 * exp_max: the maximum normalised exponent
527 * frac_size: the size of the fraction field
528 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
529 * The following are computed based the size of fraction
530 * frac_lsb: least significant bit of fraction
531 * frac_lsbm1: the bit below the least significant bit (for rounding)
532 * round_mask/roundeven_mask: masks used for rounding
533 * The following optional modifiers are available:
534 * arm_althp: handle ARM Alternative Half Precision
545 uint64_t roundeven_mask
;
549 /* Expand fields based on the size of exponent and fraction */
550 #define FLOAT_PARAMS(E, F) \
552 .exp_bias = ((1 << E) - 1) >> 1, \
553 .exp_max = (1 << E) - 1, \
555 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
556 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
557 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
558 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
559 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
561 static const FloatFmt float16_params
= {
565 static const FloatFmt float16_params_ahp
= {
570 static const FloatFmt float32_params
= {
574 static const FloatFmt float64_params
= {
578 /* Unpack a float to parts, but do not canonicalize. */
579 static inline FloatParts
unpack_raw(FloatFmt fmt
, uint64_t raw
)
581 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
583 return (FloatParts
) {
584 .cls
= float_class_unclassified
,
585 .sign
= extract64(raw
, sign_pos
, 1),
586 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
587 .frac
= extract64(raw
, 0, fmt
.frac_size
),
591 static inline FloatParts
float16_unpack_raw(float16 f
)
593 return unpack_raw(float16_params
, f
);
596 static inline FloatParts
float32_unpack_raw(float32 f
)
598 return unpack_raw(float32_params
, f
);
601 static inline FloatParts
float64_unpack_raw(float64 f
)
603 return unpack_raw(float64_params
, f
);
606 /* Pack a float from parts, but do not canonicalize. */
607 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts p
)
609 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
610 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
611 return deposit64(ret
, sign_pos
, 1, p
.sign
);
614 static inline float16
float16_pack_raw(FloatParts p
)
616 return make_float16(pack_raw(float16_params
, p
));
619 static inline float32
float32_pack_raw(FloatParts p
)
621 return make_float32(pack_raw(float32_params
, p
));
624 static inline float64
float64_pack_raw(FloatParts p
)
626 return make_float64(pack_raw(float64_params
, p
));
629 /*----------------------------------------------------------------------------
630 | Functions and definitions to determine: (1) whether tininess for underflow
631 | is detected before or after rounding by default, (2) what (if anything)
632 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
633 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
634 | are propagated from function inputs to output. These details are target-
636 *----------------------------------------------------------------------------*/
637 #include "softfloat-specialize.inc.c"
639 /* Canonicalize EXP and FRAC, setting CLS. */
640 static FloatParts
sf_canonicalize(FloatParts part
, const FloatFmt
*parm
,
641 float_status
*status
)
643 if (part
.exp
== parm
->exp_max
&& !parm
->arm_althp
) {
644 if (part
.frac
== 0) {
645 part
.cls
= float_class_inf
;
647 part
.frac
<<= parm
->frac_shift
;
648 part
.cls
= (parts_is_snan_frac(part
.frac
, status
)
649 ? float_class_snan
: float_class_qnan
);
651 } else if (part
.exp
== 0) {
652 if (likely(part
.frac
== 0)) {
653 part
.cls
= float_class_zero
;
654 } else if (status
->flush_inputs_to_zero
) {
655 float_raise(float_flag_input_denormal
, status
);
656 part
.cls
= float_class_zero
;
659 int shift
= clz64(part
.frac
) - 1;
660 part
.cls
= float_class_normal
;
661 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
665 part
.cls
= float_class_normal
;
666 part
.exp
-= parm
->exp_bias
;
667 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
672 /* Round and uncanonicalize a floating-point number by parts. There
673 * are FRAC_SHIFT bits that may require rounding at the bottom of the
674 * fraction; these bits will be removed. The exponent will be biased
675 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
678 static FloatParts
round_canonical(FloatParts p
, float_status
*s
,
679 const FloatFmt
*parm
)
681 const uint64_t frac_lsb
= parm
->frac_lsb
;
682 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
683 const uint64_t round_mask
= parm
->round_mask
;
684 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
685 const int exp_max
= parm
->exp_max
;
686 const int frac_shift
= parm
->frac_shift
;
695 case float_class_normal
:
696 switch (s
->float_rounding_mode
) {
697 case float_round_nearest_even
:
698 overflow_norm
= false;
699 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
701 case float_round_ties_away
:
702 overflow_norm
= false;
705 case float_round_to_zero
:
706 overflow_norm
= true;
710 inc
= p
.sign
? 0 : round_mask
;
711 overflow_norm
= p
.sign
;
713 case float_round_down
:
714 inc
= p
.sign
? round_mask
: 0;
715 overflow_norm
= !p
.sign
;
717 case float_round_to_odd
:
718 overflow_norm
= true;
719 inc
= frac
& frac_lsb
? 0 : round_mask
;
722 g_assert_not_reached();
725 exp
+= parm
->exp_bias
;
726 if (likely(exp
> 0)) {
727 if (frac
& round_mask
) {
728 flags
|= float_flag_inexact
;
730 if (frac
& DECOMPOSED_OVERFLOW_BIT
) {
737 if (parm
->arm_althp
) {
738 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
739 if (unlikely(exp
> exp_max
)) {
740 /* Overflow. Return the maximum normal. */
741 flags
= float_flag_invalid
;
745 } else if (unlikely(exp
>= exp_max
)) {
746 flags
|= float_flag_overflow
| float_flag_inexact
;
751 p
.cls
= float_class_inf
;
755 } else if (s
->flush_to_zero
) {
756 flags
|= float_flag_output_denormal
;
757 p
.cls
= float_class_zero
;
760 bool is_tiny
= (s
->float_detect_tininess
761 == float_tininess_before_rounding
)
763 || !((frac
+ inc
) & DECOMPOSED_OVERFLOW_BIT
);
765 shift64RightJamming(frac
, 1 - exp
, &frac
);
766 if (frac
& round_mask
) {
767 /* Need to recompute round-to-even. */
768 switch (s
->float_rounding_mode
) {
769 case float_round_nearest_even
:
770 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
773 case float_round_to_odd
:
774 inc
= frac
& frac_lsb
? 0 : round_mask
;
777 flags
|= float_flag_inexact
;
781 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
784 if (is_tiny
&& (flags
& float_flag_inexact
)) {
785 flags
|= float_flag_underflow
;
787 if (exp
== 0 && frac
== 0) {
788 p
.cls
= float_class_zero
;
793 case float_class_zero
:
799 case float_class_inf
:
801 assert(!parm
->arm_althp
);
806 case float_class_qnan
:
807 case float_class_snan
:
808 assert(!parm
->arm_althp
);
810 frac
>>= parm
->frac_shift
;
814 g_assert_not_reached();
817 float_raise(flags
, s
);
823 /* Explicit FloatFmt version */
824 static FloatParts
float16a_unpack_canonical(float16 f
, float_status
*s
,
825 const FloatFmt
*params
)
827 return sf_canonicalize(float16_unpack_raw(f
), params
, s
);
830 static FloatParts
float16_unpack_canonical(float16 f
, float_status
*s
)
832 return float16a_unpack_canonical(f
, s
, &float16_params
);
835 static float16
float16a_round_pack_canonical(FloatParts p
, float_status
*s
,
836 const FloatFmt
*params
)
838 return float16_pack_raw(round_canonical(p
, s
, params
));
841 static float16
float16_round_pack_canonical(FloatParts p
, float_status
*s
)
843 return float16a_round_pack_canonical(p
, s
, &float16_params
);
846 static FloatParts
float32_unpack_canonical(float32 f
, float_status
*s
)
848 return sf_canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
851 static float32
float32_round_pack_canonical(FloatParts p
, float_status
*s
)
853 return float32_pack_raw(round_canonical(p
, s
, &float32_params
));
856 static FloatParts
float64_unpack_canonical(float64 f
, float_status
*s
)
858 return sf_canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
861 static float64
float64_round_pack_canonical(FloatParts p
, float_status
*s
)
863 return float64_pack_raw(round_canonical(p
, s
, &float64_params
));
866 static FloatParts
return_nan(FloatParts a
, float_status
*s
)
869 case float_class_snan
:
870 s
->float_exception_flags
|= float_flag_invalid
;
871 a
= parts_silence_nan(a
, s
);
873 case float_class_qnan
:
874 if (s
->default_nan_mode
) {
875 return parts_default_nan(s
);
880 g_assert_not_reached();
885 static FloatParts
pick_nan(FloatParts a
, FloatParts b
, float_status
*s
)
887 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
888 s
->float_exception_flags
|= float_flag_invalid
;
891 if (s
->default_nan_mode
) {
892 return parts_default_nan(s
);
894 if (pickNaN(a
.cls
, b
.cls
,
896 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
))) {
899 if (is_snan(a
.cls
)) {
900 return parts_silence_nan(a
, s
);
906 static FloatParts
pick_nan_muladd(FloatParts a
, FloatParts b
, FloatParts c
,
907 bool inf_zero
, float_status
*s
)
911 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
912 s
->float_exception_flags
|= float_flag_invalid
;
915 which
= pickNaNMulAdd(a
.cls
, b
.cls
, c
.cls
, inf_zero
, s
);
917 if (s
->default_nan_mode
) {
918 /* Note that this check is after pickNaNMulAdd so that function
919 * has an opportunity to set the Invalid flag.
934 return parts_default_nan(s
);
936 g_assert_not_reached();
939 if (is_snan(a
.cls
)) {
940 return parts_silence_nan(a
, s
);
946 * Returns the result of adding or subtracting the values of the
947 * floating-point values `a' and `b'. The operation is performed
948 * according to the IEC/IEEE Standard for Binary Floating-Point
952 static FloatParts
addsub_floats(FloatParts a
, FloatParts b
, bool subtract
,
955 bool a_sign
= a
.sign
;
956 bool b_sign
= b
.sign
^ subtract
;
958 if (a_sign
!= b_sign
) {
961 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
962 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
963 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
964 a
.frac
= a
.frac
- b
.frac
;
966 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
967 a
.frac
= b
.frac
- a
.frac
;
973 a
.cls
= float_class_zero
;
974 a
.sign
= s
->float_rounding_mode
== float_round_down
;
976 int shift
= clz64(a
.frac
) - 1;
977 a
.frac
= a
.frac
<< shift
;
978 a
.exp
= a
.exp
- shift
;
983 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
984 return pick_nan(a
, b
, s
);
986 if (a
.cls
== float_class_inf
) {
987 if (b
.cls
== float_class_inf
) {
988 float_raise(float_flag_invalid
, s
);
989 return parts_default_nan(s
);
993 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
994 a
.sign
= s
->float_rounding_mode
== float_round_down
;
997 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
1001 if (b
.cls
== float_class_zero
) {
1006 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1007 if (a
.exp
> b
.exp
) {
1008 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
1009 } else if (a
.exp
< b
.exp
) {
1010 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
1014 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
1015 shift64RightJamming(a
.frac
, 1, &a
.frac
);
1020 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1021 return pick_nan(a
, b
, s
);
1023 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1026 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1031 g_assert_not_reached();
1035 * Returns the result of adding or subtracting the floating-point
1036 * values `a' and `b'. The operation is performed according to the
1037 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1040 float16 QEMU_FLATTEN
float16_add(float16 a
, float16 b
, float_status
*status
)
1042 FloatParts pa
= float16_unpack_canonical(a
, status
);
1043 FloatParts pb
= float16_unpack_canonical(b
, status
);
1044 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
1046 return float16_round_pack_canonical(pr
, status
);
1049 float16 QEMU_FLATTEN
float16_sub(float16 a
, float16 b
, float_status
*status
)
1051 FloatParts pa
= float16_unpack_canonical(a
, status
);
1052 FloatParts pb
= float16_unpack_canonical(b
, status
);
1053 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
1055 return float16_round_pack_canonical(pr
, status
);
1058 static float32 QEMU_SOFTFLOAT_ATTR
1059 soft_f32_addsub(float32 a
, float32 b
, bool subtract
, float_status
*status
)
1061 FloatParts pa
= float32_unpack_canonical(a
, status
);
1062 FloatParts pb
= float32_unpack_canonical(b
, status
);
1063 FloatParts pr
= addsub_floats(pa
, pb
, subtract
, status
);
1065 return float32_round_pack_canonical(pr
, status
);
1068 static inline float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1070 return soft_f32_addsub(a
, b
, false, status
);
1073 static inline float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1075 return soft_f32_addsub(a
, b
, true, status
);
1078 static float64 QEMU_SOFTFLOAT_ATTR
1079 soft_f64_addsub(float64 a
, float64 b
, bool subtract
, float_status
*status
)
1081 FloatParts pa
= float64_unpack_canonical(a
, status
);
1082 FloatParts pb
= float64_unpack_canonical(b
, status
);
1083 FloatParts pr
= addsub_floats(pa
, pb
, subtract
, status
);
1085 return float64_round_pack_canonical(pr
, status
);
1088 static inline float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1090 return soft_f64_addsub(a
, b
, false, status
);
1093 static inline float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1095 return soft_f64_addsub(a
, b
, true, status
);
1098 static float hard_f32_add(float a
, float b
)
1103 static float hard_f32_sub(float a
, float b
)
1108 static double hard_f64_add(double a
, double b
)
1113 static double hard_f64_sub(double a
, double b
)
1118 static bool f32_addsub_post(union_float32 a
, union_float32 b
)
1120 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1121 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1123 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1126 static bool f64_addsub_post(union_float64 a
, union_float64 b
)
1128 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1129 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1131 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1135 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1136 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1138 return float32_gen2(a
, b
, s
, hard
, soft
,
1139 f32_is_zon2
, f32_addsub_post
, NULL
, NULL
);
1142 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1143 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1145 return float64_gen2(a
, b
, s
, hard
, soft
,
1146 f64_is_zon2
, f64_addsub_post
, NULL
, NULL
);
1149 float32 QEMU_FLATTEN
1150 float32_add(float32 a
, float32 b
, float_status
*s
)
1152 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1155 float32 QEMU_FLATTEN
1156 float32_sub(float32 a
, float32 b
, float_status
*s
)
1158 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1161 float64 QEMU_FLATTEN
1162 float64_add(float64 a
, float64 b
, float_status
*s
)
1164 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1167 float64 QEMU_FLATTEN
1168 float64_sub(float64 a
, float64 b
, float_status
*s
)
1170 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1174 * Returns the result of multiplying the floating-point values `a' and
1175 * `b'. The operation is performed according to the IEC/IEEE Standard
1176 * for Binary Floating-Point Arithmetic.
1179 static FloatParts
mul_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1181 bool sign
= a
.sign
^ b
.sign
;
1183 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1185 int exp
= a
.exp
+ b
.exp
;
1187 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1188 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1189 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1190 shift64RightJamming(lo
, 1, &lo
);
1200 /* handle all the NaN cases */
1201 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1202 return pick_nan(a
, b
, s
);
1204 /* Inf * Zero == NaN */
1205 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
1206 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
1207 s
->float_exception_flags
|= float_flag_invalid
;
1208 return parts_default_nan(s
);
1210 /* Multiply by 0 or Inf */
1211 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1215 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1219 g_assert_not_reached();
1222 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1224 FloatParts pa
= float16_unpack_canonical(a
, status
);
1225 FloatParts pb
= float16_unpack_canonical(b
, status
);
1226 FloatParts pr
= mul_floats(pa
, pb
, status
);
1228 return float16_round_pack_canonical(pr
, status
);
1231 static float32 QEMU_SOFTFLOAT_ATTR
1232 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1234 FloatParts pa
= float32_unpack_canonical(a
, status
);
1235 FloatParts pb
= float32_unpack_canonical(b
, status
);
1236 FloatParts pr
= mul_floats(pa
, pb
, status
);
1238 return float32_round_pack_canonical(pr
, status
);
1241 static float64 QEMU_SOFTFLOAT_ATTR
1242 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1244 FloatParts pa
= float64_unpack_canonical(a
, status
);
1245 FloatParts pb
= float64_unpack_canonical(b
, status
);
1246 FloatParts pr
= mul_floats(pa
, pb
, status
);
1248 return float64_round_pack_canonical(pr
, status
);
1251 static float hard_f32_mul(float a
, float b
)
1256 static double hard_f64_mul(double a
, double b
)
1261 static bool f32_mul_fast_test(union_float32 a
, union_float32 b
)
1263 return float32_is_zero(a
.s
) || float32_is_zero(b
.s
);
1266 static bool f64_mul_fast_test(union_float64 a
, union_float64 b
)
1268 return float64_is_zero(a
.s
) || float64_is_zero(b
.s
);
1271 static float32
f32_mul_fast_op(float32 a
, float32 b
, float_status
*s
)
1273 bool signbit
= float32_is_neg(a
) ^ float32_is_neg(b
);
1275 return float32_set_sign(float32_zero
, signbit
);
1278 static float64
f64_mul_fast_op(float64 a
, float64 b
, float_status
*s
)
1280 bool signbit
= float64_is_neg(a
) ^ float64_is_neg(b
);
1282 return float64_set_sign(float64_zero
, signbit
);
1285 float32 QEMU_FLATTEN
1286 float32_mul(float32 a
, float32 b
, float_status
*s
)
1288 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1289 f32_is_zon2
, NULL
, f32_mul_fast_test
, f32_mul_fast_op
);
1292 float64 QEMU_FLATTEN
1293 float64_mul(float64 a
, float64 b
, float_status
*s
)
1295 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1296 f64_is_zon2
, NULL
, f64_mul_fast_test
, f64_mul_fast_op
);
1300 * Returns the result of multiplying the floating-point values `a' and
1301 * `b' then adding 'c', with no intermediate rounding step after the
1302 * multiplication. The operation is performed according to the
1303 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1304 * The flags argument allows the caller to select negation of the
1305 * addend, the intermediate product, or the final result. (The
1306 * difference between this and having the caller do a separate
1307 * negation is that negating externally will flip the sign bit on
1311 static FloatParts
muladd_floats(FloatParts a
, FloatParts b
, FloatParts c
,
1312 int flags
, float_status
*s
)
1314 bool inf_zero
= ((1 << a
.cls
) | (1 << b
.cls
)) ==
1315 ((1 << float_class_inf
) | (1 << float_class_zero
));
1317 bool sign_flip
= flags
& float_muladd_negate_result
;
1322 /* It is implementation-defined whether the cases of (0,inf,qnan)
1323 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1324 * they return if they do), so we have to hand this information
1325 * off to the target-specific pick-a-NaN routine.
1327 if (is_nan(a
.cls
) || is_nan(b
.cls
) || is_nan(c
.cls
)) {
1328 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
1332 s
->float_exception_flags
|= float_flag_invalid
;
1333 return parts_default_nan(s
);
1336 if (flags
& float_muladd_negate_c
) {
1340 p_sign
= a
.sign
^ b
.sign
;
1342 if (flags
& float_muladd_negate_product
) {
1346 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_inf
) {
1347 p_class
= float_class_inf
;
1348 } else if (a
.cls
== float_class_zero
|| b
.cls
== float_class_zero
) {
1349 p_class
= float_class_zero
;
1351 p_class
= float_class_normal
;
1354 if (c
.cls
== float_class_inf
) {
1355 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
1356 s
->float_exception_flags
|= float_flag_invalid
;
1357 return parts_default_nan(s
);
1359 a
.cls
= float_class_inf
;
1360 a
.sign
= c
.sign
^ sign_flip
;
1365 if (p_class
== float_class_inf
) {
1366 a
.cls
= float_class_inf
;
1367 a
.sign
= p_sign
^ sign_flip
;
1371 if (p_class
== float_class_zero
) {
1372 if (c
.cls
== float_class_zero
) {
1373 if (p_sign
!= c
.sign
) {
1374 p_sign
= s
->float_rounding_mode
== float_round_down
;
1377 } else if (flags
& float_muladd_halve_result
) {
1380 c
.sign
^= sign_flip
;
1384 /* a & b should be normals now... */
1385 assert(a
.cls
== float_class_normal
&&
1386 b
.cls
== float_class_normal
);
1388 p_exp
= a
.exp
+ b
.exp
;
1390 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1393 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1394 /* binary point now at bit 124 */
1396 /* check for overflow */
1397 if (hi
& (1ULL << (DECOMPOSED_BINARY_POINT
* 2 + 1 - 64))) {
1398 shift128RightJamming(hi
, lo
, 1, &hi
, &lo
);
1403 if (c
.cls
== float_class_zero
) {
1404 /* move binary point back to 62 */
1405 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1407 int exp_diff
= p_exp
- c
.exp
;
1408 if (p_sign
== c
.sign
) {
1410 if (exp_diff
<= 0) {
1411 shift128RightJamming(hi
, lo
,
1412 DECOMPOSED_BINARY_POINT
- exp_diff
,
1417 uint64_t c_hi
, c_lo
;
1418 /* shift c to the same binary point as the product (124) */
1421 shift128RightJamming(c_hi
, c_lo
,
1424 add128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1425 /* move binary point back to 62 */
1426 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1429 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1430 shift64RightJamming(lo
, 1, &lo
);
1436 uint64_t c_hi
, c_lo
;
1437 /* make C binary point match product at bit 124 */
1441 if (exp_diff
<= 0) {
1442 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1445 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1446 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1448 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1453 shift128RightJamming(c_hi
, c_lo
,
1456 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1459 if (hi
== 0 && lo
== 0) {
1460 a
.cls
= float_class_zero
;
1461 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1462 a
.sign
^= sign_flip
;
1469 shift
= clz64(lo
) + 64;
1471 /* Normalizing to a binary point of 124 is the
1472 correct adjust for the exponent. However since we're
1473 shifting, we might as well put the binary point back
1474 at 62 where we really want it. Therefore shift as
1475 if we're leaving 1 bit at the top of the word, but
1476 adjust the exponent as if we're leaving 3 bits. */
1479 lo
= lo
<< (shift
- 64);
1481 hi
= (hi
<< shift
) | (lo
>> (64 - shift
));
1482 lo
= hi
| ((lo
<< shift
) != 0);
1489 if (flags
& float_muladd_halve_result
) {
1493 /* finally prepare our result */
1494 a
.cls
= float_class_normal
;
1495 a
.sign
= p_sign
^ sign_flip
;
1502 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1503 int flags
, float_status
*status
)
1505 FloatParts pa
= float16_unpack_canonical(a
, status
);
1506 FloatParts pb
= float16_unpack_canonical(b
, status
);
1507 FloatParts pc
= float16_unpack_canonical(c
, status
);
1508 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1510 return float16_round_pack_canonical(pr
, status
);
1513 static float32 QEMU_SOFTFLOAT_ATTR
1514 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1515 float_status
*status
)
1517 FloatParts pa
= float32_unpack_canonical(a
, status
);
1518 FloatParts pb
= float32_unpack_canonical(b
, status
);
1519 FloatParts pc
= float32_unpack_canonical(c
, status
);
1520 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1522 return float32_round_pack_canonical(pr
, status
);
1525 static float64 QEMU_SOFTFLOAT_ATTR
1526 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1527 float_status
*status
)
1529 FloatParts pa
= float64_unpack_canonical(a
, status
);
1530 FloatParts pb
= float64_unpack_canonical(b
, status
);
1531 FloatParts pc
= float64_unpack_canonical(c
, status
);
1532 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1534 return float64_round_pack_canonical(pr
, status
);
1537 static bool force_soft_fma
;
1539 float32 QEMU_FLATTEN
1540 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1542 union_float32 ua
, ub
, uc
, ur
;
1548 if (unlikely(!can_use_fpu(s
))) {
1551 if (unlikely(flags
& float_muladd_halve_result
)) {
1555 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1556 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1560 if (unlikely(force_soft_fma
)) {
1565 * When (a || b) == 0, there's no need to check for under/over flow,
1566 * since we know the addend is (normal || 0) and the product is 0.
1568 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1572 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1573 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1574 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1576 if (flags
& float_muladd_negate_c
) {
1581 union_float32 ua_orig
= ua
;
1582 union_float32 uc_orig
= uc
;
1584 if (flags
& float_muladd_negate_product
) {
1587 if (flags
& float_muladd_negate_c
) {
1591 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1593 if (unlikely(f32_is_inf(ur
))) {
1594 s
->float_exception_flags
|= float_flag_overflow
;
1595 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1601 if (flags
& float_muladd_negate_result
) {
1602 return float32_chs(ur
.s
);
1607 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1610 float64 QEMU_FLATTEN
1611 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1613 union_float64 ua
, ub
, uc
, ur
;
1619 if (unlikely(!can_use_fpu(s
))) {
1622 if (unlikely(flags
& float_muladd_halve_result
)) {
1626 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1627 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1631 if (unlikely(force_soft_fma
)) {
1636 * When (a || b) == 0, there's no need to check for under/over flow,
1637 * since we know the addend is (normal || 0) and the product is 0.
1639 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1643 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1644 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1645 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1647 if (flags
& float_muladd_negate_c
) {
1652 union_float64 ua_orig
= ua
;
1653 union_float64 uc_orig
= uc
;
1655 if (flags
& float_muladd_negate_product
) {
1658 if (flags
& float_muladd_negate_c
) {
1662 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1664 if (unlikely(f64_is_inf(ur
))) {
1665 s
->float_exception_flags
|= float_flag_overflow
;
1666 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1672 if (flags
& float_muladd_negate_result
) {
1673 return float64_chs(ur
.s
);
1678 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1682 * Returns the result of dividing the floating-point value `a' by the
1683 * corresponding value `b'. The operation is performed according to
1684 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1687 static FloatParts
div_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1689 bool sign
= a
.sign
^ b
.sign
;
1691 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1692 uint64_t n0
, n1
, q
, r
;
1693 int exp
= a
.exp
- b
.exp
;
1696 * We want a 2*N / N-bit division to produce exactly an N-bit
1697 * result, so that we do not lose any precision and so that we
1698 * do not have to renormalize afterward. If A.frac < B.frac,
1699 * then division would produce an (N-1)-bit result; shift A left
1700 * by one to produce the an N-bit result, and decrement the
1701 * exponent to match.
1703 * The udiv_qrnnd algorithm that we're using requires normalization,
1704 * i.e. the msb of the denominator must be set. Since we know that
1705 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1706 * by one (more), and the remainder must be shifted right by one.
1708 if (a
.frac
< b
.frac
) {
1710 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 2, &n1
, &n0
);
1712 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1714 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
<< 1);
1717 * Set lsb if there is a remainder, to set inexact.
1718 * As mentioned above, to find the actual value of the remainder we
1719 * would need to shift right, but (1) we are only concerned about
1720 * non-zero-ness, and (2) the remainder will always be even because
1721 * both inputs to the division primitive are even.
1723 a
.frac
= q
| (r
!= 0);
1728 /* handle all the NaN cases */
1729 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1730 return pick_nan(a
, b
, s
);
1732 /* 0/0 or Inf/Inf */
1735 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1736 s
->float_exception_flags
|= float_flag_invalid
;
1737 return parts_default_nan(s
);
1739 /* Inf / x or 0 / x */
1740 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1745 if (b
.cls
== float_class_zero
) {
1746 s
->float_exception_flags
|= float_flag_divbyzero
;
1747 a
.cls
= float_class_inf
;
1752 if (b
.cls
== float_class_inf
) {
1753 a
.cls
= float_class_zero
;
1757 g_assert_not_reached();
1760 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1762 FloatParts pa
= float16_unpack_canonical(a
, status
);
1763 FloatParts pb
= float16_unpack_canonical(b
, status
);
1764 FloatParts pr
= div_floats(pa
, pb
, status
);
1766 return float16_round_pack_canonical(pr
, status
);
1769 static float32 QEMU_SOFTFLOAT_ATTR
1770 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1772 FloatParts pa
= float32_unpack_canonical(a
, status
);
1773 FloatParts pb
= float32_unpack_canonical(b
, status
);
1774 FloatParts pr
= div_floats(pa
, pb
, status
);
1776 return float32_round_pack_canonical(pr
, status
);
1779 static float64 QEMU_SOFTFLOAT_ATTR
1780 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1782 FloatParts pa
= float64_unpack_canonical(a
, status
);
1783 FloatParts pb
= float64_unpack_canonical(b
, status
);
1784 FloatParts pr
= div_floats(pa
, pb
, status
);
1786 return float64_round_pack_canonical(pr
, status
);
1789 static float hard_f32_div(float a
, float b
)
1794 static double hard_f64_div(double a
, double b
)
1799 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1801 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1802 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1803 fpclassify(b
.h
) == FP_NORMAL
;
1805 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1808 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1810 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1811 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1812 fpclassify(b
.h
) == FP_NORMAL
;
1814 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1817 static bool f32_div_post(union_float32 a
, union_float32 b
)
1819 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1820 return fpclassify(a
.h
) != FP_ZERO
;
1822 return !float32_is_zero(a
.s
);
1825 static bool f64_div_post(union_float64 a
, union_float64 b
)
1827 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1828 return fpclassify(a
.h
) != FP_ZERO
;
1830 return !float64_is_zero(a
.s
);
1833 float32 QEMU_FLATTEN
1834 float32_div(float32 a
, float32 b
, float_status
*s
)
1836 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
1837 f32_div_pre
, f32_div_post
, NULL
, NULL
);
1840 float64 QEMU_FLATTEN
1841 float64_div(float64 a
, float64 b
, float_status
*s
)
1843 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
1844 f64_div_pre
, f64_div_post
, NULL
, NULL
);
1848 * Float to Float conversions
1850 * Returns the result of converting one float format to another. The
1851 * conversion is performed according to the IEC/IEEE Standard for
1852 * Binary Floating-Point Arithmetic.
1854 * The float_to_float helper only needs to take care of raising
1855 * invalid exceptions and handling the conversion on NaNs.
1858 static FloatParts
float_to_float(FloatParts a
, const FloatFmt
*dstf
,
1861 if (dstf
->arm_althp
) {
1863 case float_class_qnan
:
1864 case float_class_snan
:
1865 /* There is no NaN in the destination format. Raise Invalid
1866 * and return a zero with the sign of the input NaN.
1868 s
->float_exception_flags
|= float_flag_invalid
;
1869 a
.cls
= float_class_zero
;
1874 case float_class_inf
:
1875 /* There is no Inf in the destination format. Raise Invalid
1876 * and return the maximum normal with the correct sign.
1878 s
->float_exception_flags
|= float_flag_invalid
;
1879 a
.cls
= float_class_normal
;
1880 a
.exp
= dstf
->exp_max
;
1881 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1887 } else if (is_nan(a
.cls
)) {
1888 if (is_snan(a
.cls
)) {
1889 s
->float_exception_flags
|= float_flag_invalid
;
1890 a
= parts_silence_nan(a
, s
);
1892 if (s
->default_nan_mode
) {
1893 return parts_default_nan(s
);
1899 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1901 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1902 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1903 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1904 return float32_round_pack_canonical(pr
, s
);
1907 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
1909 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1910 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1911 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1912 return float64_round_pack_canonical(pr
, s
);
1915 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
1917 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1918 FloatParts p
= float32_unpack_canonical(a
, s
);
1919 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1920 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1923 float64
float32_to_float64(float32 a
, float_status
*s
)
1925 FloatParts p
= float32_unpack_canonical(a
, s
);
1926 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1927 return float64_round_pack_canonical(pr
, s
);
1930 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
1932 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1933 FloatParts p
= float64_unpack_canonical(a
, s
);
1934 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1935 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1938 float32
float64_to_float32(float64 a
, float_status
*s
)
1940 FloatParts p
= float64_unpack_canonical(a
, s
);
1941 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1942 return float32_round_pack_canonical(pr
, s
);
1946 * Rounds the floating-point value `a' to an integer, and returns the
1947 * result as a floating-point value. The operation is performed
1948 * according to the IEC/IEEE Standard for Binary Floating-Point
1952 static FloatParts
round_to_int(FloatParts a
, int rmode
,
1953 int scale
, float_status
*s
)
1956 case float_class_qnan
:
1957 case float_class_snan
:
1958 return return_nan(a
, s
);
1960 case float_class_zero
:
1961 case float_class_inf
:
1962 /* already "integral" */
1965 case float_class_normal
:
1966 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
1969 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
1970 /* already integral */
1975 /* all fractional */
1976 s
->float_exception_flags
|= float_flag_inexact
;
1978 case float_round_nearest_even
:
1979 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
1981 case float_round_ties_away
:
1982 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
1984 case float_round_to_zero
:
1987 case float_round_up
:
1990 case float_round_down
:
1993 case float_round_to_odd
:
1997 g_assert_not_reached();
2001 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
2004 a
.cls
= float_class_zero
;
2007 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
2008 uint64_t frac_lsbm1
= frac_lsb
>> 1;
2009 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
2010 uint64_t rnd_mask
= rnd_even_mask
>> 1;
2014 case float_round_nearest_even
:
2015 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2017 case float_round_ties_away
:
2020 case float_round_to_zero
:
2023 case float_round_up
:
2024 inc
= a
.sign
? 0 : rnd_mask
;
2026 case float_round_down
:
2027 inc
= a
.sign
? rnd_mask
: 0;
2029 case float_round_to_odd
:
2030 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2033 g_assert_not_reached();
2036 if (a
.frac
& rnd_mask
) {
2037 s
->float_exception_flags
|= float_flag_inexact
;
2039 a
.frac
&= ~rnd_mask
;
2040 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
2048 g_assert_not_reached();
2053 float16
float16_round_to_int(float16 a
, float_status
*s
)
2055 FloatParts pa
= float16_unpack_canonical(a
, s
);
2056 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2057 return float16_round_pack_canonical(pr
, s
);
2060 float32
float32_round_to_int(float32 a
, float_status
*s
)
2062 FloatParts pa
= float32_unpack_canonical(a
, s
);
2063 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2064 return float32_round_pack_canonical(pr
, s
);
2067 float64
float64_round_to_int(float64 a
, float_status
*s
)
2069 FloatParts pa
= float64_unpack_canonical(a
, s
);
2070 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2071 return float64_round_pack_canonical(pr
, s
);
2075 * Returns the result of converting the floating-point value `a' to
2076 * the two's complement integer format. The conversion is performed
2077 * according to the IEC/IEEE Standard for Binary Floating-Point
2078 * Arithmetic---which means in particular that the conversion is
2079 * rounded according to the current rounding mode. If `a' is a NaN,
2080 * the largest positive integer is returned. Otherwise, if the
2081 * conversion overflows, the largest integer with the same sign as `a'
2085 static int64_t round_to_int_and_pack(FloatParts in
, int rmode
, int scale
,
2086 int64_t min
, int64_t max
,
2090 int orig_flags
= get_float_exception_flags(s
);
2091 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
2094 case float_class_snan
:
2095 case float_class_qnan
:
2096 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2098 case float_class_inf
:
2099 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2100 return p
.sign
? min
: max
;
2101 case float_class_zero
:
2103 case float_class_normal
:
2104 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
2105 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2106 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
2107 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
2112 if (r
<= -(uint64_t) min
) {
2115 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2122 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2127 g_assert_not_reached();
2131 int16_t float16_to_int16_scalbn(float16 a
, int rmode
, int scale
,
2134 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2135 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2138 int32_t float16_to_int32_scalbn(float16 a
, int rmode
, int scale
,
2141 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2142 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2145 int64_t float16_to_int64_scalbn(float16 a
, int rmode
, int scale
,
2148 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2149 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2152 int16_t float32_to_int16_scalbn(float32 a
, int rmode
, int scale
,
2155 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2156 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2159 int32_t float32_to_int32_scalbn(float32 a
, int rmode
, int scale
,
2162 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2163 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2166 int64_t float32_to_int64_scalbn(float32 a
, int rmode
, int scale
,
2169 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2170 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2173 int16_t float64_to_int16_scalbn(float64 a
, int rmode
, int scale
,
2176 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2177 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2180 int32_t float64_to_int32_scalbn(float64 a
, int rmode
, int scale
,
2183 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2184 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2187 int64_t float64_to_int64_scalbn(float64 a
, int rmode
, int scale
,
2190 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2191 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2194 int16_t float16_to_int16(float16 a
, float_status
*s
)
2196 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2199 int32_t float16_to_int32(float16 a
, float_status
*s
)
2201 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2204 int64_t float16_to_int64(float16 a
, float_status
*s
)
2206 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2209 int16_t float32_to_int16(float32 a
, float_status
*s
)
2211 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2214 int32_t float32_to_int32(float32 a
, float_status
*s
)
2216 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2219 int64_t float32_to_int64(float32 a
, float_status
*s
)
2221 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2224 int16_t float64_to_int16(float64 a
, float_status
*s
)
2226 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2229 int32_t float64_to_int32(float64 a
, float_status
*s
)
2231 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2234 int64_t float64_to_int64(float64 a
, float_status
*s
)
2236 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2239 int16_t float16_to_int16_round_to_zero(float16 a
, float_status
*s
)
2241 return float16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2244 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
2246 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2249 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
2251 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2254 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
2256 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2259 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
2261 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2264 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
2266 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2269 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
2271 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2274 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
2276 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2279 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
2281 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2285 * Returns the result of converting the floating-point value `a' to
2286 * the unsigned integer format. The conversion is performed according
2287 * to the IEC/IEEE Standard for Binary Floating-Point
2288 * Arithmetic---which means in particular that the conversion is
2289 * rounded according to the current rounding mode. If `a' is a NaN,
2290 * the largest unsigned integer is returned. Otherwise, if the
2291 * conversion overflows, the largest unsigned integer is returned. If
2292 * the 'a' is negative, the result is rounded and zero is returned;
2293 * values that do not round to zero will raise the inexact exception
2297 static uint64_t round_to_uint_and_pack(FloatParts in
, int rmode
, int scale
,
2298 uint64_t max
, float_status
*s
)
2300 int orig_flags
= get_float_exception_flags(s
);
2301 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
2305 case float_class_snan
:
2306 case float_class_qnan
:
2307 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2309 case float_class_inf
:
2310 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2311 return p
.sign
? 0 : max
;
2312 case float_class_zero
:
2314 case float_class_normal
:
2316 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2320 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
2321 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2322 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
2323 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
2325 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2329 /* For uint64 this will never trip, but if p.exp is too large
2330 * to shift a decomposed fraction we shall have exited via the
2334 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2339 g_assert_not_reached();
2343 uint16_t float16_to_uint16_scalbn(float16 a
, int rmode
, int scale
,
2346 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2347 rmode
, scale
, UINT16_MAX
, s
);
2350 uint32_t float16_to_uint32_scalbn(float16 a
, int rmode
, int scale
,
2353 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2354 rmode
, scale
, UINT32_MAX
, s
);
2357 uint64_t float16_to_uint64_scalbn(float16 a
, int rmode
, int scale
,
2360 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2361 rmode
, scale
, UINT64_MAX
, s
);
2364 uint16_t float32_to_uint16_scalbn(float32 a
, int rmode
, int scale
,
2367 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2368 rmode
, scale
, UINT16_MAX
, s
);
2371 uint32_t float32_to_uint32_scalbn(float32 a
, int rmode
, int scale
,
2374 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2375 rmode
, scale
, UINT32_MAX
, s
);
2378 uint64_t float32_to_uint64_scalbn(float32 a
, int rmode
, int scale
,
2381 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2382 rmode
, scale
, UINT64_MAX
, s
);
2385 uint16_t float64_to_uint16_scalbn(float64 a
, int rmode
, int scale
,
2388 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2389 rmode
, scale
, UINT16_MAX
, s
);
2392 uint32_t float64_to_uint32_scalbn(float64 a
, int rmode
, int scale
,
2395 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2396 rmode
, scale
, UINT32_MAX
, s
);
2399 uint64_t float64_to_uint64_scalbn(float64 a
, int rmode
, int scale
,
2402 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2403 rmode
, scale
, UINT64_MAX
, s
);
2406 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2408 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2411 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2413 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2416 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2418 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2421 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2423 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2426 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2428 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2431 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2433 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2436 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2438 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2441 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2443 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2446 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2448 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2451 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
2453 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2456 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
2458 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2461 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
2463 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2466 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
2468 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2471 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
2473 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2476 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
2478 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2481 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
2483 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2486 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
2488 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2491 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
2493 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2497 * Integer to float conversions
2499 * Returns the result of converting the two's complement integer `a'
2500 * to the floating-point format. The conversion is performed according
2501 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2504 static FloatParts
int_to_float(int64_t a
, int scale
, float_status
*status
)
2506 FloatParts r
= { .sign
= false };
2509 r
.cls
= float_class_zero
;
2514 r
.cls
= float_class_normal
;
2519 shift
= clz64(f
) - 1;
2520 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2522 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2523 r
.frac
= (shift
< 0 ? DECOMPOSED_IMPLICIT_BIT
: f
<< shift
);
2529 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
2531 FloatParts pa
= int_to_float(a
, scale
, status
);
2532 return float16_round_pack_canonical(pa
, status
);
2535 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
2537 return int64_to_float16_scalbn(a
, scale
, status
);
2540 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
2542 return int64_to_float16_scalbn(a
, scale
, status
);
2545 float16
int64_to_float16(int64_t a
, float_status
*status
)
2547 return int64_to_float16_scalbn(a
, 0, status
);
2550 float16
int32_to_float16(int32_t a
, float_status
*status
)
2552 return int64_to_float16_scalbn(a
, 0, status
);
2555 float16
int16_to_float16(int16_t a
, float_status
*status
)
2557 return int64_to_float16_scalbn(a
, 0, status
);
2560 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
2562 FloatParts pa
= int_to_float(a
, scale
, status
);
2563 return float32_round_pack_canonical(pa
, status
);
2566 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
2568 return int64_to_float32_scalbn(a
, scale
, status
);
2571 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
2573 return int64_to_float32_scalbn(a
, scale
, status
);
2576 float32
int64_to_float32(int64_t a
, float_status
*status
)
2578 return int64_to_float32_scalbn(a
, 0, status
);
2581 float32
int32_to_float32(int32_t a
, float_status
*status
)
2583 return int64_to_float32_scalbn(a
, 0, status
);
2586 float32
int16_to_float32(int16_t a
, float_status
*status
)
2588 return int64_to_float32_scalbn(a
, 0, status
);
2591 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
2593 FloatParts pa
= int_to_float(a
, scale
, status
);
2594 return float64_round_pack_canonical(pa
, status
);
2597 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
2599 return int64_to_float64_scalbn(a
, scale
, status
);
2602 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
2604 return int64_to_float64_scalbn(a
, scale
, status
);
2607 float64
int64_to_float64(int64_t a
, float_status
*status
)
2609 return int64_to_float64_scalbn(a
, 0, status
);
2612 float64
int32_to_float64(int32_t a
, float_status
*status
)
2614 return int64_to_float64_scalbn(a
, 0, status
);
2617 float64
int16_to_float64(int16_t a
, float_status
*status
)
2619 return int64_to_float64_scalbn(a
, 0, status
);
2624 * Unsigned Integer to float conversions
2626 * Returns the result of converting the unsigned integer `a' to the
2627 * floating-point format. The conversion is performed according to the
2628 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2631 static FloatParts
uint_to_float(uint64_t a
, int scale
, float_status
*status
)
2633 FloatParts r
= { .sign
= false };
2636 r
.cls
= float_class_zero
;
2638 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2639 r
.cls
= float_class_normal
;
2640 if ((int64_t)a
< 0) {
2641 r
.exp
= DECOMPOSED_BINARY_POINT
+ 1 + scale
;
2642 shift64RightJamming(a
, 1, &a
);
2645 int shift
= clz64(a
) - 1;
2646 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2647 r
.frac
= a
<< shift
;
2654 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
2656 FloatParts pa
= uint_to_float(a
, scale
, status
);
2657 return float16_round_pack_canonical(pa
, status
);
2660 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
2662 return uint64_to_float16_scalbn(a
, scale
, status
);
2665 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
2667 return uint64_to_float16_scalbn(a
, scale
, status
);
2670 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
2672 return uint64_to_float16_scalbn(a
, 0, status
);
2675 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
2677 return uint64_to_float16_scalbn(a
, 0, status
);
2680 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
2682 return uint64_to_float16_scalbn(a
, 0, status
);
2685 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
2687 FloatParts pa
= uint_to_float(a
, scale
, status
);
2688 return float32_round_pack_canonical(pa
, status
);
2691 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
2693 return uint64_to_float32_scalbn(a
, scale
, status
);
2696 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
2698 return uint64_to_float32_scalbn(a
, scale
, status
);
2701 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
2703 return uint64_to_float32_scalbn(a
, 0, status
);
2706 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
2708 return uint64_to_float32_scalbn(a
, 0, status
);
2711 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
2713 return uint64_to_float32_scalbn(a
, 0, status
);
2716 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
2718 FloatParts pa
= uint_to_float(a
, scale
, status
);
2719 return float64_round_pack_canonical(pa
, status
);
2722 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
2724 return uint64_to_float64_scalbn(a
, scale
, status
);
2727 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
2729 return uint64_to_float64_scalbn(a
, scale
, status
);
2732 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
2734 return uint64_to_float64_scalbn(a
, 0, status
);
2737 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
2739 return uint64_to_float64_scalbn(a
, 0, status
);
2742 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
2744 return uint64_to_float64_scalbn(a
, 0, status
);
2748 /* min() and max() functions. These can't be implemented as
2749 * 'compare and pick one input' because that would mishandle
2750 * NaNs and +0 vs -0.
2752 * minnum() and maxnum() functions. These are similar to the min()
2753 * and max() functions but if one of the arguments is a QNaN and
2754 * the other is numerical then the numerical argument is returned.
2755 * SNaNs will get quietened before being returned.
2756 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2757 * and maxNum() operations. min() and max() are the typical min/max
2758 * semantics provided by many CPUs which predate that specification.
2760 * minnummag() and maxnummag() functions correspond to minNumMag()
2761 * and minNumMag() from the IEEE-754 2008.
2763 static FloatParts
minmax_floats(FloatParts a
, FloatParts b
, bool ismin
,
2764 bool ieee
, bool ismag
, float_status
*s
)
2766 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
2768 /* Takes two floating-point values `a' and `b', one of
2769 * which is a NaN, and returns the appropriate NaN
2770 * result. If either `a' or `b' is a signaling NaN,
2771 * the invalid exception is raised.
2773 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
2774 return pick_nan(a
, b
, s
);
2775 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
2777 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
2781 return pick_nan(a
, b
, s
);
2786 case float_class_normal
:
2789 case float_class_inf
:
2792 case float_class_zero
:
2796 g_assert_not_reached();
2800 case float_class_normal
:
2803 case float_class_inf
:
2806 case float_class_zero
:
2810 g_assert_not_reached();
2814 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
2815 bool a_less
= a_exp
< b_exp
;
2816 if (a_exp
== b_exp
) {
2817 a_less
= a
.frac
< b
.frac
;
2819 return a_less
^ ismin
? b
: a
;
2822 if (a
.sign
== b
.sign
) {
2823 bool a_less
= a_exp
< b_exp
;
2824 if (a_exp
== b_exp
) {
2825 a_less
= a
.frac
< b
.frac
;
2827 return a
.sign
^ a_less
^ ismin
? b
: a
;
2829 return a
.sign
^ ismin
? b
: a
;
2834 #define MINMAX(sz, name, ismin, isiee, ismag) \
2835 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
2838 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2839 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2840 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
2842 return float ## sz ## _round_pack_canonical(pr, s); \
2845 MINMAX(16, min
, true, false, false)
2846 MINMAX(16, minnum
, true, true, false)
2847 MINMAX(16, minnummag
, true, true, true)
2848 MINMAX(16, max
, false, false, false)
2849 MINMAX(16, maxnum
, false, true, false)
2850 MINMAX(16, maxnummag
, false, true, true)
2852 MINMAX(32, min
, true, false, false)
2853 MINMAX(32, minnum
, true, true, false)
2854 MINMAX(32, minnummag
, true, true, true)
2855 MINMAX(32, max
, false, false, false)
2856 MINMAX(32, maxnum
, false, true, false)
2857 MINMAX(32, maxnummag
, false, true, true)
2859 MINMAX(64, min
, true, false, false)
2860 MINMAX(64, minnum
, true, true, false)
2861 MINMAX(64, minnummag
, true, true, true)
2862 MINMAX(64, max
, false, false, false)
2863 MINMAX(64, maxnum
, false, true, false)
2864 MINMAX(64, maxnummag
, false, true, true)
2868 /* Floating point compare */
2869 static int compare_floats(FloatParts a
, FloatParts b
, bool is_quiet
,
2872 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
2874 a
.cls
== float_class_snan
||
2875 b
.cls
== float_class_snan
) {
2876 s
->float_exception_flags
|= float_flag_invalid
;
2878 return float_relation_unordered
;
2881 if (a
.cls
== float_class_zero
) {
2882 if (b
.cls
== float_class_zero
) {
2883 return float_relation_equal
;
2885 return b
.sign
? float_relation_greater
: float_relation_less
;
2886 } else if (b
.cls
== float_class_zero
) {
2887 return a
.sign
? float_relation_less
: float_relation_greater
;
2890 /* The only really important thing about infinity is its sign. If
2891 * both are infinities the sign marks the smallest of the two.
2893 if (a
.cls
== float_class_inf
) {
2894 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
2895 return float_relation_equal
;
2897 return a
.sign
? float_relation_less
: float_relation_greater
;
2898 } else if (b
.cls
== float_class_inf
) {
2899 return b
.sign
? float_relation_greater
: float_relation_less
;
2902 if (a
.sign
!= b
.sign
) {
2903 return a
.sign
? float_relation_less
: float_relation_greater
;
2906 if (a
.exp
== b
.exp
) {
2907 if (a
.frac
== b
.frac
) {
2908 return float_relation_equal
;
2911 return a
.frac
> b
.frac
?
2912 float_relation_less
: float_relation_greater
;
2914 return a
.frac
> b
.frac
?
2915 float_relation_greater
: float_relation_less
;
2919 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
2921 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
2926 #define COMPARE(name, attr, sz) \
2928 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
2930 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2931 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2932 return compare_floats(pa, pb, is_quiet, s); \
2935 COMPARE(soft_f16_compare
, QEMU_FLATTEN
, 16)
2936 COMPARE(soft_f32_compare
, QEMU_SOFTFLOAT_ATTR
, 32)
2937 COMPARE(soft_f64_compare
, QEMU_SOFTFLOAT_ATTR
, 64)
2941 int float16_compare(float16 a
, float16 b
, float_status
*s
)
2943 return soft_f16_compare(a
, b
, false, s
);
2946 int float16_compare_quiet(float16 a
, float16 b
, float_status
*s
)
2948 return soft_f16_compare(a
, b
, true, s
);
2951 static int QEMU_FLATTEN
2952 f32_compare(float32 xa
, float32 xb
, bool is_quiet
, float_status
*s
)
2954 union_float32 ua
, ub
;
2959 if (QEMU_NO_HARDFLOAT
) {
2963 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
2964 if (isgreaterequal(ua
.h
, ub
.h
)) {
2965 if (isgreater(ua
.h
, ub
.h
)) {
2966 return float_relation_greater
;
2968 return float_relation_equal
;
2970 if (likely(isless(ua
.h
, ub
.h
))) {
2971 return float_relation_less
;
2973 /* The only condition remaining is unordered.
2974 * Fall through to set flags.
2977 return soft_f32_compare(ua
.s
, ub
.s
, is_quiet
, s
);
2980 int float32_compare(float32 a
, float32 b
, float_status
*s
)
2982 return f32_compare(a
, b
, false, s
);
2985 int float32_compare_quiet(float32 a
, float32 b
, float_status
*s
)
2987 return f32_compare(a
, b
, true, s
);
2990 static int QEMU_FLATTEN
2991 f64_compare(float64 xa
, float64 xb
, bool is_quiet
, float_status
*s
)
2993 union_float64 ua
, ub
;
2998 if (QEMU_NO_HARDFLOAT
) {
3002 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
3003 if (isgreaterequal(ua
.h
, ub
.h
)) {
3004 if (isgreater(ua
.h
, ub
.h
)) {
3005 return float_relation_greater
;
3007 return float_relation_equal
;
3009 if (likely(isless(ua
.h
, ub
.h
))) {
3010 return float_relation_less
;
3012 /* The only condition remaining is unordered.
3013 * Fall through to set flags.
3016 return soft_f64_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3019 int float64_compare(float64 a
, float64 b
, float_status
*s
)
3021 return f64_compare(a
, b
, false, s
);
3024 int float64_compare_quiet(float64 a
, float64 b
, float_status
*s
)
3026 return f64_compare(a
, b
, true, s
);
3029 /* Multiply A by 2 raised to the power N. */
3030 static FloatParts
scalbn_decomposed(FloatParts a
, int n
, float_status
*s
)
3032 if (unlikely(is_nan(a
.cls
))) {
3033 return return_nan(a
, s
);
3035 if (a
.cls
== float_class_normal
) {
3036 /* The largest float type (even though not supported by FloatParts)
3037 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3038 * still allows rounding to infinity, without allowing overflow
3039 * within the int32_t that backs FloatParts.exp.
3041 n
= MIN(MAX(n
, -0x10000), 0x10000);
3047 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
3049 FloatParts pa
= float16_unpack_canonical(a
, status
);
3050 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3051 return float16_round_pack_canonical(pr
, status
);
3054 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
3056 FloatParts pa
= float32_unpack_canonical(a
, status
);
3057 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3058 return float32_round_pack_canonical(pr
, status
);
3061 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
3063 FloatParts pa
= float64_unpack_canonical(a
, status
);
3064 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3065 return float64_round_pack_canonical(pr
, status
);
3071 * The old softfloat code did an approximation step before zeroing in
3072 * on the final result. However for simpleness we just compute the
3073 * square root by iterating down from the implicit bit to enough extra
3074 * bits to ensure we get a correctly rounded result.
3076 * This does mean however the calculation is slower than before,
3077 * especially for 64 bit floats.
3080 static FloatParts
sqrt_float(FloatParts a
, float_status
*s
, const FloatFmt
*p
)
3082 uint64_t a_frac
, r_frac
, s_frac
;
3085 if (is_nan(a
.cls
)) {
3086 return return_nan(a
, s
);
3088 if (a
.cls
== float_class_zero
) {
3089 return a
; /* sqrt(+-0) = +-0 */
3092 s
->float_exception_flags
|= float_flag_invalid
;
3093 return parts_default_nan(s
);
3095 if (a
.cls
== float_class_inf
) {
3096 return a
; /* sqrt(+inf) = +inf */
3099 assert(a
.cls
== float_class_normal
);
3101 /* We need two overflow bits at the top. Adding room for that is a
3102 * right shift. If the exponent is odd, we can discard the low bit
3103 * by multiplying the fraction by 2; that's a left shift. Combine
3104 * those and we shift right if the exponent is even.
3112 /* Bit-by-bit computation of sqrt. */
3116 /* Iterate from implicit bit down to the 3 extra bits to compute a
3117 * properly rounded result. Remember we've inserted one more bit
3118 * at the top, so these positions are one less.
3120 bit
= DECOMPOSED_BINARY_POINT
- 1;
3121 last_bit
= MAX(p
->frac_shift
- 4, 0);
3123 uint64_t q
= 1ULL << bit
;
3124 uint64_t t_frac
= s_frac
+ q
;
3125 if (t_frac
<= a_frac
) {
3126 s_frac
= t_frac
+ q
;
3131 } while (--bit
>= last_bit
);
3133 /* Undo the right shift done above. If there is any remaining
3134 * fraction, the result is inexact. Set the sticky bit.
3136 a
.frac
= (r_frac
<< 1) + (a_frac
!= 0);
3141 float16 QEMU_FLATTEN
float16_sqrt(float16 a
, float_status
*status
)
3143 FloatParts pa
= float16_unpack_canonical(a
, status
);
3144 FloatParts pr
= sqrt_float(pa
, status
, &float16_params
);
3145 return float16_round_pack_canonical(pr
, status
);
3148 static float32 QEMU_SOFTFLOAT_ATTR
3149 soft_f32_sqrt(float32 a
, float_status
*status
)
3151 FloatParts pa
= float32_unpack_canonical(a
, status
);
3152 FloatParts pr
= sqrt_float(pa
, status
, &float32_params
);
3153 return float32_round_pack_canonical(pr
, status
);
3156 static float64 QEMU_SOFTFLOAT_ATTR
3157 soft_f64_sqrt(float64 a
, float_status
*status
)
3159 FloatParts pa
= float64_unpack_canonical(a
, status
);
3160 FloatParts pr
= sqrt_float(pa
, status
, &float64_params
);
3161 return float64_round_pack_canonical(pr
, status
);
3164 float32 QEMU_FLATTEN
float32_sqrt(float32 xa
, float_status
*s
)
3166 union_float32 ua
, ur
;
3169 if (unlikely(!can_use_fpu(s
))) {
3173 float32_input_flush1(&ua
.s
, s
);
3174 if (QEMU_HARDFLOAT_1F32_USE_FP
) {
3175 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3176 fpclassify(ua
.h
) == FP_ZERO
) ||
3180 } else if (unlikely(!float32_is_zero_or_normal(ua
.s
) ||
3181 float32_is_neg(ua
.s
))) {
3188 return soft_f32_sqrt(ua
.s
, s
);
3191 float64 QEMU_FLATTEN
float64_sqrt(float64 xa
, float_status
*s
)
3193 union_float64 ua
, ur
;
3196 if (unlikely(!can_use_fpu(s
))) {
3200 float64_input_flush1(&ua
.s
, s
);
3201 if (QEMU_HARDFLOAT_1F64_USE_FP
) {
3202 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3203 fpclassify(ua
.h
) == FP_ZERO
) ||
3207 } else if (unlikely(!float64_is_zero_or_normal(ua
.s
) ||
3208 float64_is_neg(ua
.s
))) {
3215 return soft_f64_sqrt(ua
.s
, s
);
3218 /*----------------------------------------------------------------------------
3219 | The pattern for a default generated NaN.
3220 *----------------------------------------------------------------------------*/
3222 float16
float16_default_nan(float_status
*status
)
3224 FloatParts p
= parts_default_nan(status
);
3225 p
.frac
>>= float16_params
.frac_shift
;
3226 return float16_pack_raw(p
);
3229 float32
float32_default_nan(float_status
*status
)
3231 FloatParts p
= parts_default_nan(status
);
3232 p
.frac
>>= float32_params
.frac_shift
;
3233 return float32_pack_raw(p
);
3236 float64
float64_default_nan(float_status
*status
)
3238 FloatParts p
= parts_default_nan(status
);
3239 p
.frac
>>= float64_params
.frac_shift
;
3240 return float64_pack_raw(p
);
3243 float128
float128_default_nan(float_status
*status
)
3245 FloatParts p
= parts_default_nan(status
);
3248 /* Extrapolate from the choices made by parts_default_nan to fill
3249 * in the quad-floating format. If the low bit is set, assume we
3250 * want to set all non-snan bits.
3252 r
.low
= -(p
.frac
& 1);
3253 r
.high
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- 48);
3254 r
.high
|= UINT64_C(0x7FFF000000000000);
3255 r
.high
|= (uint64_t)p
.sign
<< 63;
3260 /*----------------------------------------------------------------------------
3261 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3262 *----------------------------------------------------------------------------*/
3264 float16
float16_silence_nan(float16 a
, float_status
*status
)
3266 FloatParts p
= float16_unpack_raw(a
);
3267 p
.frac
<<= float16_params
.frac_shift
;
3268 p
= parts_silence_nan(p
, status
);
3269 p
.frac
>>= float16_params
.frac_shift
;
3270 return float16_pack_raw(p
);
3273 float32
float32_silence_nan(float32 a
, float_status
*status
)
3275 FloatParts p
= float32_unpack_raw(a
);
3276 p
.frac
<<= float32_params
.frac_shift
;
3277 p
= parts_silence_nan(p
, status
);
3278 p
.frac
>>= float32_params
.frac_shift
;
3279 return float32_pack_raw(p
);
3282 float64
float64_silence_nan(float64 a
, float_status
*status
)
3284 FloatParts p
= float64_unpack_raw(a
);
3285 p
.frac
<<= float64_params
.frac_shift
;
3286 p
= parts_silence_nan(p
, status
);
3287 p
.frac
>>= float64_params
.frac_shift
;
3288 return float64_pack_raw(p
);
3292 /*----------------------------------------------------------------------------
3293 | If `a' is denormal and we are in flush-to-zero mode then set the
3294 | input-denormal exception and return zero. Otherwise just return the value.
3295 *----------------------------------------------------------------------------*/
3297 static bool parts_squash_denormal(FloatParts p
, float_status
*status
)
3299 if (p
.exp
== 0 && p
.frac
!= 0) {
3300 float_raise(float_flag_input_denormal
, status
);
3307 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3309 if (status
->flush_inputs_to_zero
) {
3310 FloatParts p
= float16_unpack_raw(a
);
3311 if (parts_squash_denormal(p
, status
)) {
3312 return float16_set_sign(float16_zero
, p
.sign
);
3318 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3320 if (status
->flush_inputs_to_zero
) {
3321 FloatParts p
= float32_unpack_raw(a
);
3322 if (parts_squash_denormal(p
, status
)) {
3323 return float32_set_sign(float32_zero
, p
.sign
);
3329 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3331 if (status
->flush_inputs_to_zero
) {
3332 FloatParts p
= float64_unpack_raw(a
);
3333 if (parts_squash_denormal(p
, status
)) {
3334 return float64_set_sign(float64_zero
, p
.sign
);
3340 /*----------------------------------------------------------------------------
3341 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3342 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3343 | input. If `zSign' is 1, the input is negated before being converted to an
3344 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3345 | is simply rounded to an integer, with the inexact exception raised if the
3346 | input cannot be represented exactly as an integer. However, if the fixed-
3347 | point input is too large, the invalid exception is raised and the largest
3348 | positive or negative integer is returned.
3349 *----------------------------------------------------------------------------*/
3351 static int32_t roundAndPackInt32(flag zSign
, uint64_t absZ
, float_status
*status
)
3353 int8_t roundingMode
;
3354 flag roundNearestEven
;
3355 int8_t roundIncrement
, roundBits
;
3358 roundingMode
= status
->float_rounding_mode
;
3359 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3360 switch (roundingMode
) {
3361 case float_round_nearest_even
:
3362 case float_round_ties_away
:
3363 roundIncrement
= 0x40;
3365 case float_round_to_zero
:
3368 case float_round_up
:
3369 roundIncrement
= zSign
? 0 : 0x7f;
3371 case float_round_down
:
3372 roundIncrement
= zSign
? 0x7f : 0;
3374 case float_round_to_odd
:
3375 roundIncrement
= absZ
& 0x80 ? 0 : 0x7f;
3380 roundBits
= absZ
& 0x7F;
3381 absZ
= ( absZ
+ roundIncrement
)>>7;
3382 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
3384 if ( zSign
) z
= - z
;
3385 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
3386 float_raise(float_flag_invalid
, status
);
3387 return zSign
? INT32_MIN
: INT32_MAX
;
3390 status
->float_exception_flags
|= float_flag_inexact
;
3396 /*----------------------------------------------------------------------------
3397 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3398 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3399 | and returns the properly rounded 64-bit integer corresponding to the input.
3400 | If `zSign' is 1, the input is negated before being converted to an integer.
3401 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3402 | the inexact exception raised if the input cannot be represented exactly as
3403 | an integer. However, if the fixed-point input is too large, the invalid
3404 | exception is raised and the largest positive or negative integer is
3406 *----------------------------------------------------------------------------*/
3408 static int64_t roundAndPackInt64(flag zSign
, uint64_t absZ0
, uint64_t absZ1
,
3409 float_status
*status
)
3411 int8_t roundingMode
;
3412 flag roundNearestEven
, increment
;
3415 roundingMode
= status
->float_rounding_mode
;
3416 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3417 switch (roundingMode
) {
3418 case float_round_nearest_even
:
3419 case float_round_ties_away
:
3420 increment
= ((int64_t) absZ1
< 0);
3422 case float_round_to_zero
:
3425 case float_round_up
:
3426 increment
= !zSign
&& absZ1
;
3428 case float_round_down
:
3429 increment
= zSign
&& absZ1
;
3431 case float_round_to_odd
:
3432 increment
= !(absZ0
& 1) && absZ1
;
3439 if ( absZ0
== 0 ) goto overflow
;
3440 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
3443 if ( zSign
) z
= - z
;
3444 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
3446 float_raise(float_flag_invalid
, status
);
3447 return zSign
? INT64_MIN
: INT64_MAX
;
3450 status
->float_exception_flags
|= float_flag_inexact
;
3456 /*----------------------------------------------------------------------------
3457 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3458 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3459 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3460 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3461 | with the inexact exception raised if the input cannot be represented exactly
3462 | as an integer. However, if the fixed-point input is too large, the invalid
3463 | exception is raised and the largest unsigned integer is returned.
3464 *----------------------------------------------------------------------------*/
3466 static int64_t roundAndPackUint64(flag zSign
, uint64_t absZ0
,
3467 uint64_t absZ1
, float_status
*status
)
3469 int8_t roundingMode
;
3470 flag roundNearestEven
, increment
;
3472 roundingMode
= status
->float_rounding_mode
;
3473 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
3474 switch (roundingMode
) {
3475 case float_round_nearest_even
:
3476 case float_round_ties_away
:
3477 increment
= ((int64_t)absZ1
< 0);
3479 case float_round_to_zero
:
3482 case float_round_up
:
3483 increment
= !zSign
&& absZ1
;
3485 case float_round_down
:
3486 increment
= zSign
&& absZ1
;
3488 case float_round_to_odd
:
3489 increment
= !(absZ0
& 1) && absZ1
;
3497 float_raise(float_flag_invalid
, status
);
3500 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
3503 if (zSign
&& absZ0
) {
3504 float_raise(float_flag_invalid
, status
);
3509 status
->float_exception_flags
|= float_flag_inexact
;
3514 /*----------------------------------------------------------------------------
3515 | Normalizes the subnormal single-precision floating-point value represented
3516 | by the denormalized significand `aSig'. The normalized exponent and
3517 | significand are stored at the locations pointed to by `zExpPtr' and
3518 | `zSigPtr', respectively.
3519 *----------------------------------------------------------------------------*/
3522 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
3526 shiftCount
= clz32(aSig
) - 8;
3527 *zSigPtr
= aSig
<<shiftCount
;
3528 *zExpPtr
= 1 - shiftCount
;
3532 /*----------------------------------------------------------------------------
3533 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3534 | and significand `zSig', and returns the proper single-precision floating-
3535 | point value corresponding to the abstract input. Ordinarily, the abstract
3536 | value is simply rounded and packed into the single-precision format, with
3537 | the inexact exception raised if the abstract input cannot be represented
3538 | exactly. However, if the abstract value is too large, the overflow and
3539 | inexact exceptions are raised and an infinity or maximal finite value is
3540 | returned. If the abstract value is too small, the input value is rounded to
3541 | a subnormal number, and the underflow and inexact exceptions are raised if
3542 | the abstract input cannot be represented exactly as a subnormal single-
3543 | precision floating-point number.
3544 | The input significand `zSig' has its binary point between bits 30
3545 | and 29, which is 7 bits to the left of the usual location. This shifted
3546 | significand must be normalized or smaller. If `zSig' is not normalized,
3547 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3548 | and it must not require rounding. In the usual case that `zSig' is
3549 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3550 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3551 | Binary Floating-Point Arithmetic.
3552 *----------------------------------------------------------------------------*/
3554 static float32
roundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
3555 float_status
*status
)
3557 int8_t roundingMode
;
3558 flag roundNearestEven
;
3559 int8_t roundIncrement
, roundBits
;
3562 roundingMode
= status
->float_rounding_mode
;
3563 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3564 switch (roundingMode
) {
3565 case float_round_nearest_even
:
3566 case float_round_ties_away
:
3567 roundIncrement
= 0x40;
3569 case float_round_to_zero
:
3572 case float_round_up
:
3573 roundIncrement
= zSign
? 0 : 0x7f;
3575 case float_round_down
:
3576 roundIncrement
= zSign
? 0x7f : 0;
3578 case float_round_to_odd
:
3579 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
3585 roundBits
= zSig
& 0x7F;
3586 if ( 0xFD <= (uint16_t) zExp
) {
3587 if ( ( 0xFD < zExp
)
3588 || ( ( zExp
== 0xFD )
3589 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
3591 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
3592 roundIncrement
!= 0;
3593 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3594 return packFloat32(zSign
, 0xFF, -!overflow_to_inf
);
3597 if (status
->flush_to_zero
) {
3598 float_raise(float_flag_output_denormal
, status
);
3599 return packFloat32(zSign
, 0, 0);
3602 (status
->float_detect_tininess
3603 == float_tininess_before_rounding
)
3605 || ( zSig
+ roundIncrement
< 0x80000000 );
3606 shift32RightJamming( zSig
, - zExp
, &zSig
);
3608 roundBits
= zSig
& 0x7F;
3609 if (isTiny
&& roundBits
) {
3610 float_raise(float_flag_underflow
, status
);
3612 if (roundingMode
== float_round_to_odd
) {
3614 * For round-to-odd case, the roundIncrement depends on
3615 * zSig which just changed.
3617 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
3622 status
->float_exception_flags
|= float_flag_inexact
;
3624 zSig
= ( zSig
+ roundIncrement
)>>7;
3625 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
3626 if ( zSig
== 0 ) zExp
= 0;
3627 return packFloat32( zSign
, zExp
, zSig
);
3631 /*----------------------------------------------------------------------------
3632 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3633 | and significand `zSig', and returns the proper single-precision floating-
3634 | point value corresponding to the abstract input. This routine is just like
3635 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3636 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3637 | floating-point exponent.
3638 *----------------------------------------------------------------------------*/
3641 normalizeRoundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
3642 float_status
*status
)
3646 shiftCount
= clz32(zSig
) - 1;
3647 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
3652 /*----------------------------------------------------------------------------
3653 | Normalizes the subnormal double-precision floating-point value represented
3654 | by the denormalized significand `aSig'. The normalized exponent and
3655 | significand are stored at the locations pointed to by `zExpPtr' and
3656 | `zSigPtr', respectively.
3657 *----------------------------------------------------------------------------*/
3660 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
3664 shiftCount
= clz64(aSig
) - 11;
3665 *zSigPtr
= aSig
<<shiftCount
;
3666 *zExpPtr
= 1 - shiftCount
;
3670 /*----------------------------------------------------------------------------
3671 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3672 | double-precision floating-point value, returning the result. After being
3673 | shifted into the proper positions, the three fields are simply added
3674 | together to form the result. This means that any integer portion of `zSig'
3675 | will be added into the exponent. Since a properly normalized significand
3676 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3677 | than the desired result exponent whenever `zSig' is a complete, normalized
3679 *----------------------------------------------------------------------------*/
3681 static inline float64
packFloat64(flag zSign
, int zExp
, uint64_t zSig
)
3684 return make_float64(
3685 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
3689 /*----------------------------------------------------------------------------
3690 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3691 | and significand `zSig', and returns the proper double-precision floating-
3692 | point value corresponding to the abstract input. Ordinarily, the abstract
3693 | value is simply rounded and packed into the double-precision format, with
3694 | the inexact exception raised if the abstract input cannot be represented
3695 | exactly. However, if the abstract value is too large, the overflow and
3696 | inexact exceptions are raised and an infinity or maximal finite value is
3697 | returned. If the abstract value is too small, the input value is rounded to
3698 | a subnormal number, and the underflow and inexact exceptions are raised if
3699 | the abstract input cannot be represented exactly as a subnormal double-
3700 | precision floating-point number.
3701 | The input significand `zSig' has its binary point between bits 62
3702 | and 61, which is 10 bits to the left of the usual location. This shifted
3703 | significand must be normalized or smaller. If `zSig' is not normalized,
3704 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3705 | and it must not require rounding. In the usual case that `zSig' is
3706 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3707 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3708 | Binary Floating-Point Arithmetic.
3709 *----------------------------------------------------------------------------*/
3711 static float64
roundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
3712 float_status
*status
)
3714 int8_t roundingMode
;
3715 flag roundNearestEven
;
3716 int roundIncrement
, roundBits
;
3719 roundingMode
= status
->float_rounding_mode
;
3720 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3721 switch (roundingMode
) {
3722 case float_round_nearest_even
:
3723 case float_round_ties_away
:
3724 roundIncrement
= 0x200;
3726 case float_round_to_zero
:
3729 case float_round_up
:
3730 roundIncrement
= zSign
? 0 : 0x3ff;
3732 case float_round_down
:
3733 roundIncrement
= zSign
? 0x3ff : 0;
3735 case float_round_to_odd
:
3736 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
3741 roundBits
= zSig
& 0x3FF;
3742 if ( 0x7FD <= (uint16_t) zExp
) {
3743 if ( ( 0x7FD < zExp
)
3744 || ( ( zExp
== 0x7FD )
3745 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
3747 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
3748 roundIncrement
!= 0;
3749 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3750 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
3753 if (status
->flush_to_zero
) {
3754 float_raise(float_flag_output_denormal
, status
);
3755 return packFloat64(zSign
, 0, 0);
3758 (status
->float_detect_tininess
3759 == float_tininess_before_rounding
)
3761 || ( zSig
+ roundIncrement
< UINT64_C(0x8000000000000000) );
3762 shift64RightJamming( zSig
, - zExp
, &zSig
);
3764 roundBits
= zSig
& 0x3FF;
3765 if (isTiny
&& roundBits
) {
3766 float_raise(float_flag_underflow
, status
);
3768 if (roundingMode
== float_round_to_odd
) {
3770 * For round-to-odd case, the roundIncrement depends on
3771 * zSig which just changed.
3773 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
3778 status
->float_exception_flags
|= float_flag_inexact
;
3780 zSig
= ( zSig
+ roundIncrement
)>>10;
3781 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
3782 if ( zSig
== 0 ) zExp
= 0;
3783 return packFloat64( zSign
, zExp
, zSig
);
3787 /*----------------------------------------------------------------------------
3788 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3789 | and significand `zSig', and returns the proper double-precision floating-
3790 | point value corresponding to the abstract input. This routine is just like
3791 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3792 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3793 | floating-point exponent.
3794 *----------------------------------------------------------------------------*/
3797 normalizeRoundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
3798 float_status
*status
)
3802 shiftCount
= clz64(zSig
) - 1;
3803 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
3808 /*----------------------------------------------------------------------------
3809 | Normalizes the subnormal extended double-precision floating-point value
3810 | represented by the denormalized significand `aSig'. The normalized exponent
3811 | and significand are stored at the locations pointed to by `zExpPtr' and
3812 | `zSigPtr', respectively.
3813 *----------------------------------------------------------------------------*/
3815 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
3820 shiftCount
= clz64(aSig
);
3821 *zSigPtr
= aSig
<<shiftCount
;
3822 *zExpPtr
= 1 - shiftCount
;
3825 /*----------------------------------------------------------------------------
3826 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3827 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3828 | and returns the proper extended double-precision floating-point value
3829 | corresponding to the abstract input. Ordinarily, the abstract value is
3830 | rounded and packed into the extended double-precision format, with the
3831 | inexact exception raised if the abstract input cannot be represented
3832 | exactly. However, if the abstract value is too large, the overflow and
3833 | inexact exceptions are raised and an infinity or maximal finite value is
3834 | returned. If the abstract value is too small, the input value is rounded to
3835 | a subnormal number, and the underflow and inexact exceptions are raised if
3836 | the abstract input cannot be represented exactly as a subnormal extended
3837 | double-precision floating-point number.
3838 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
3839 | number of bits as single or double precision, respectively. Otherwise, the
3840 | result is rounded to the full precision of the extended double-precision
3842 | The input significand must be normalized or smaller. If the input
3843 | significand is not normalized, `zExp' must be 0; in that case, the result
3844 | returned is a subnormal number, and it must not require rounding. The
3845 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3846 | Floating-Point Arithmetic.
3847 *----------------------------------------------------------------------------*/
3849 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, flag zSign
,
3850 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
3851 float_status
*status
)
3853 int8_t roundingMode
;
3854 flag roundNearestEven
, increment
, isTiny
;
3855 int64_t roundIncrement
, roundMask
, roundBits
;
3857 roundingMode
= status
->float_rounding_mode
;
3858 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3859 if ( roundingPrecision
== 80 ) goto precision80
;
3860 if ( roundingPrecision
== 64 ) {
3861 roundIncrement
= UINT64_C(0x0000000000000400);
3862 roundMask
= UINT64_C(0x00000000000007FF);
3864 else if ( roundingPrecision
== 32 ) {
3865 roundIncrement
= UINT64_C(0x0000008000000000);
3866 roundMask
= UINT64_C(0x000000FFFFFFFFFF);
3871 zSig0
|= ( zSig1
!= 0 );
3872 switch (roundingMode
) {
3873 case float_round_nearest_even
:
3874 case float_round_ties_away
:
3876 case float_round_to_zero
:
3879 case float_round_up
:
3880 roundIncrement
= zSign
? 0 : roundMask
;
3882 case float_round_down
:
3883 roundIncrement
= zSign
? roundMask
: 0;
3888 roundBits
= zSig0
& roundMask
;
3889 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
3890 if ( ( 0x7FFE < zExp
)
3891 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
3896 if (status
->flush_to_zero
) {
3897 float_raise(float_flag_output_denormal
, status
);
3898 return packFloatx80(zSign
, 0, 0);
3901 (status
->float_detect_tininess
3902 == float_tininess_before_rounding
)
3904 || ( zSig0
<= zSig0
+ roundIncrement
);
3905 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
3907 roundBits
= zSig0
& roundMask
;
3908 if (isTiny
&& roundBits
) {
3909 float_raise(float_flag_underflow
, status
);
3912 status
->float_exception_flags
|= float_flag_inexact
;
3914 zSig0
+= roundIncrement
;
3915 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
3916 roundIncrement
= roundMask
+ 1;
3917 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
3918 roundMask
|= roundIncrement
;
3920 zSig0
&= ~ roundMask
;
3921 return packFloatx80( zSign
, zExp
, zSig0
);
3925 status
->float_exception_flags
|= float_flag_inexact
;
3927 zSig0
+= roundIncrement
;
3928 if ( zSig0
< roundIncrement
) {
3930 zSig0
= UINT64_C(0x8000000000000000);
3932 roundIncrement
= roundMask
+ 1;
3933 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
3934 roundMask
|= roundIncrement
;
3936 zSig0
&= ~ roundMask
;
3937 if ( zSig0
== 0 ) zExp
= 0;
3938 return packFloatx80( zSign
, zExp
, zSig0
);
3940 switch (roundingMode
) {
3941 case float_round_nearest_even
:
3942 case float_round_ties_away
:
3943 increment
= ((int64_t)zSig1
< 0);
3945 case float_round_to_zero
:
3948 case float_round_up
:
3949 increment
= !zSign
&& zSig1
;
3951 case float_round_down
:
3952 increment
= zSign
&& zSig1
;
3957 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
3958 if ( ( 0x7FFE < zExp
)
3959 || ( ( zExp
== 0x7FFE )
3960 && ( zSig0
== UINT64_C(0xFFFFFFFFFFFFFFFF) )
3966 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3967 if ( ( roundingMode
== float_round_to_zero
)
3968 || ( zSign
&& ( roundingMode
== float_round_up
) )
3969 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
3971 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
3973 return packFloatx80(zSign
,
3974 floatx80_infinity_high
,
3975 floatx80_infinity_low
);
3979 (status
->float_detect_tininess
3980 == float_tininess_before_rounding
)
3983 || ( zSig0
< UINT64_C(0xFFFFFFFFFFFFFFFF) );
3984 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
3986 if (isTiny
&& zSig1
) {
3987 float_raise(float_flag_underflow
, status
);
3990 status
->float_exception_flags
|= float_flag_inexact
;
3992 switch (roundingMode
) {
3993 case float_round_nearest_even
:
3994 case float_round_ties_away
:
3995 increment
= ((int64_t)zSig1
< 0);
3997 case float_round_to_zero
:
4000 case float_round_up
:
4001 increment
= !zSign
&& zSig1
;
4003 case float_round_down
:
4004 increment
= zSign
&& zSig1
;
4012 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
4013 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4015 return packFloatx80( zSign
, zExp
, zSig0
);
4019 status
->float_exception_flags
|= float_flag_inexact
;
4025 zSig0
= UINT64_C(0x8000000000000000);
4028 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
4032 if ( zSig0
== 0 ) zExp
= 0;
4034 return packFloatx80( zSign
, zExp
, zSig0
);
4038 /*----------------------------------------------------------------------------
4039 | Takes an abstract floating-point value having sign `zSign', exponent
4040 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4041 | and returns the proper extended double-precision floating-point value
4042 | corresponding to the abstract input. This routine is just like
4043 | `roundAndPackFloatx80' except that the input significand does not have to be
4045 *----------------------------------------------------------------------------*/
4047 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
4048 flag zSign
, int32_t zExp
,
4049 uint64_t zSig0
, uint64_t zSig1
,
4050 float_status
*status
)
4059 shiftCount
= clz64(zSig0
);
4060 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4062 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4063 zSig0
, zSig1
, status
);
4067 /*----------------------------------------------------------------------------
4068 | Returns the least-significant 64 fraction bits of the quadruple-precision
4069 | floating-point value `a'.
4070 *----------------------------------------------------------------------------*/
4072 static inline uint64_t extractFloat128Frac1( float128 a
)
4079 /*----------------------------------------------------------------------------
4080 | Returns the most-significant 48 fraction bits of the quadruple-precision
4081 | floating-point value `a'.
4082 *----------------------------------------------------------------------------*/
4084 static inline uint64_t extractFloat128Frac0( float128 a
)
4087 return a
.high
& UINT64_C(0x0000FFFFFFFFFFFF);
4091 /*----------------------------------------------------------------------------
4092 | Returns the exponent bits of the quadruple-precision floating-point value
4094 *----------------------------------------------------------------------------*/
4096 static inline int32_t extractFloat128Exp( float128 a
)
4099 return ( a
.high
>>48 ) & 0x7FFF;
4103 /*----------------------------------------------------------------------------
4104 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4105 *----------------------------------------------------------------------------*/
4107 static inline flag
extractFloat128Sign( float128 a
)
4114 /*----------------------------------------------------------------------------
4115 | Normalizes the subnormal quadruple-precision floating-point value
4116 | represented by the denormalized significand formed by the concatenation of
4117 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4118 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4119 | significand are stored at the location pointed to by `zSig0Ptr', and the
4120 | least significant 64 bits of the normalized significand are stored at the
4121 | location pointed to by `zSig1Ptr'.
4122 *----------------------------------------------------------------------------*/
4125 normalizeFloat128Subnormal(
4136 shiftCount
= clz64(aSig1
) - 15;
4137 if ( shiftCount
< 0 ) {
4138 *zSig0Ptr
= aSig1
>>( - shiftCount
);
4139 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
4142 *zSig0Ptr
= aSig1
<<shiftCount
;
4145 *zExpPtr
= - shiftCount
- 63;
4148 shiftCount
= clz64(aSig0
) - 15;
4149 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
4150 *zExpPtr
= 1 - shiftCount
;
4155 /*----------------------------------------------------------------------------
4156 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4157 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4158 | floating-point value, returning the result. After being shifted into the
4159 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4160 | added together to form the most significant 32 bits of the result. This
4161 | means that any integer portion of `zSig0' will be added into the exponent.
4162 | Since a properly normalized significand will have an integer portion equal
4163 | to 1, the `zExp' input should be 1 less than the desired result exponent
4164 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4166 *----------------------------------------------------------------------------*/
4168 static inline float128
4169 packFloat128( flag zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4174 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
4179 /*----------------------------------------------------------------------------
4180 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4181 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4182 | and `zSig2', and returns the proper quadruple-precision floating-point value
4183 | corresponding to the abstract input. Ordinarily, the abstract value is
4184 | simply rounded and packed into the quadruple-precision format, with the
4185 | inexact exception raised if the abstract input cannot be represented
4186 | exactly. However, if the abstract value is too large, the overflow and
4187 | inexact exceptions are raised and an infinity or maximal finite value is
4188 | returned. If the abstract value is too small, the input value is rounded to
4189 | a subnormal number, and the underflow and inexact exceptions are raised if
4190 | the abstract input cannot be represented exactly as a subnormal quadruple-
4191 | precision floating-point number.
4192 | The input significand must be normalized or smaller. If the input
4193 | significand is not normalized, `zExp' must be 0; in that case, the result
4194 | returned is a subnormal number, and it must not require rounding. In the
4195 | usual case that the input significand is normalized, `zExp' must be 1 less
4196 | than the ``true'' floating-point exponent. The handling of underflow and
4197 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4198 *----------------------------------------------------------------------------*/
4200 static float128
roundAndPackFloat128(flag zSign
, int32_t zExp
,
4201 uint64_t zSig0
, uint64_t zSig1
,
4202 uint64_t zSig2
, float_status
*status
)
4204 int8_t roundingMode
;
4205 flag roundNearestEven
, increment
, isTiny
;
4207 roundingMode
= status
->float_rounding_mode
;
4208 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4209 switch (roundingMode
) {
4210 case float_round_nearest_even
:
4211 case float_round_ties_away
:
4212 increment
= ((int64_t)zSig2
< 0);
4214 case float_round_to_zero
:
4217 case float_round_up
:
4218 increment
= !zSign
&& zSig2
;
4220 case float_round_down
:
4221 increment
= zSign
&& zSig2
;
4223 case float_round_to_odd
:
4224 increment
= !(zSig1
& 0x1) && zSig2
;
4229 if ( 0x7FFD <= (uint32_t) zExp
) {
4230 if ( ( 0x7FFD < zExp
)
4231 || ( ( zExp
== 0x7FFD )
4233 UINT64_C(0x0001FFFFFFFFFFFF),
4234 UINT64_C(0xFFFFFFFFFFFFFFFF),
4241 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4242 if ( ( roundingMode
== float_round_to_zero
)
4243 || ( zSign
&& ( roundingMode
== float_round_up
) )
4244 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4245 || (roundingMode
== float_round_to_odd
)
4251 UINT64_C(0x0000FFFFFFFFFFFF),
4252 UINT64_C(0xFFFFFFFFFFFFFFFF)
4255 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4258 if (status
->flush_to_zero
) {
4259 float_raise(float_flag_output_denormal
, status
);
4260 return packFloat128(zSign
, 0, 0, 0);
4263 (status
->float_detect_tininess
4264 == float_tininess_before_rounding
)
4270 UINT64_C(0x0001FFFFFFFFFFFF),
4271 UINT64_C(0xFFFFFFFFFFFFFFFF)
4273 shift128ExtraRightJamming(
4274 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4276 if (isTiny
&& zSig2
) {
4277 float_raise(float_flag_underflow
, status
);
4279 switch (roundingMode
) {
4280 case float_round_nearest_even
:
4281 case float_round_ties_away
:
4282 increment
= ((int64_t)zSig2
< 0);
4284 case float_round_to_zero
:
4287 case float_round_up
:
4288 increment
= !zSign
&& zSig2
;
4290 case float_round_down
:
4291 increment
= zSign
&& zSig2
;
4293 case float_round_to_odd
:
4294 increment
= !(zSig1
& 0x1) && zSig2
;
4302 status
->float_exception_flags
|= float_flag_inexact
;
4305 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4306 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
4309 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4311 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4315 /*----------------------------------------------------------------------------
4316 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4317 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4318 | returns the proper quadruple-precision floating-point value corresponding
4319 | to the abstract input. This routine is just like `roundAndPackFloat128'
4320 | except that the input significand has fewer bits and does not have to be
4321 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4323 *----------------------------------------------------------------------------*/
4325 static float128
normalizeRoundAndPackFloat128(flag zSign
, int32_t zExp
,
4326 uint64_t zSig0
, uint64_t zSig1
,
4327 float_status
*status
)
4337 shiftCount
= clz64(zSig0
) - 15;
4338 if ( 0 <= shiftCount
) {
4340 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4343 shift128ExtraRightJamming(
4344 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4347 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4352 /*----------------------------------------------------------------------------
4353 | Returns the result of converting the 32-bit two's complement integer `a'
4354 | to the extended double-precision floating-point format. The conversion
4355 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4357 *----------------------------------------------------------------------------*/
4359 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4366 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4368 absA
= zSign
? - a
: a
;
4369 shiftCount
= clz32(absA
) + 32;
4371 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4375 /*----------------------------------------------------------------------------
4376 | Returns the result of converting the 32-bit two's complement integer `a' to
4377 | the quadruple-precision floating-point format. The conversion is performed
4378 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4379 *----------------------------------------------------------------------------*/
4381 float128
int32_to_float128(int32_t a
, float_status
*status
)
4388 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4390 absA
= zSign
? - a
: a
;
4391 shiftCount
= clz32(absA
) + 17;
4393 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4397 /*----------------------------------------------------------------------------
4398 | Returns the result of converting the 64-bit two's complement integer `a'
4399 | to the extended double-precision floating-point format. The conversion
4400 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4402 *----------------------------------------------------------------------------*/
4404 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4410 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4412 absA
= zSign
? - a
: a
;
4413 shiftCount
= clz64(absA
);
4414 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4418 /*----------------------------------------------------------------------------
4419 | Returns the result of converting the 64-bit two's complement integer `a' to
4420 | the quadruple-precision floating-point format. The conversion is performed
4421 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4422 *----------------------------------------------------------------------------*/
4424 float128
int64_to_float128(int64_t a
, float_status
*status
)
4430 uint64_t zSig0
, zSig1
;
4432 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4434 absA
= zSign
? - a
: a
;
4435 shiftCount
= clz64(absA
) + 49;
4436 zExp
= 0x406E - shiftCount
;
4437 if ( 64 <= shiftCount
) {
4446 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4447 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4451 /*----------------------------------------------------------------------------
4452 | Returns the result of converting the 64-bit unsigned integer `a'
4453 | to the quadruple-precision floating-point format. The conversion is performed
4454 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4455 *----------------------------------------------------------------------------*/
4457 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
4460 return float128_zero
;
4462 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
4465 /*----------------------------------------------------------------------------
4466 | Returns the result of converting the single-precision floating-point value
4467 | `a' to the extended double-precision floating-point format. The conversion
4468 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4470 *----------------------------------------------------------------------------*/
4472 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
4478 a
= float32_squash_input_denormal(a
, status
);
4479 aSig
= extractFloat32Frac( a
);
4480 aExp
= extractFloat32Exp( a
);
4481 aSign
= extractFloat32Sign( a
);
4482 if ( aExp
== 0xFF ) {
4484 return commonNaNToFloatx80(float32ToCommonNaN(a
, status
), status
);
4486 return packFloatx80(aSign
,
4487 floatx80_infinity_high
,
4488 floatx80_infinity_low
);
4491 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4492 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4495 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
4499 /*----------------------------------------------------------------------------
4500 | Returns the result of converting the single-precision floating-point value
4501 | `a' to the double-precision floating-point format. The conversion is
4502 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4504 *----------------------------------------------------------------------------*/
4506 float128
float32_to_float128(float32 a
, float_status
*status
)
4512 a
= float32_squash_input_denormal(a
, status
);
4513 aSig
= extractFloat32Frac( a
);
4514 aExp
= extractFloat32Exp( a
);
4515 aSign
= extractFloat32Sign( a
);
4516 if ( aExp
== 0xFF ) {
4518 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
4520 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4523 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4524 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4527 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
4531 /*----------------------------------------------------------------------------
4532 | Returns the remainder of the single-precision floating-point value `a'
4533 | with respect to the corresponding value `b'. The operation is performed
4534 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4535 *----------------------------------------------------------------------------*/
4537 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
4540 int aExp
, bExp
, expDiff
;
4541 uint32_t aSig
, bSig
;
4543 uint64_t aSig64
, bSig64
, q64
;
4544 uint32_t alternateASig
;
4546 a
= float32_squash_input_denormal(a
, status
);
4547 b
= float32_squash_input_denormal(b
, status
);
4549 aSig
= extractFloat32Frac( a
);
4550 aExp
= extractFloat32Exp( a
);
4551 aSign
= extractFloat32Sign( a
);
4552 bSig
= extractFloat32Frac( b
);
4553 bExp
= extractFloat32Exp( b
);
4554 if ( aExp
== 0xFF ) {
4555 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
4556 return propagateFloat32NaN(a
, b
, status
);
4558 float_raise(float_flag_invalid
, status
);
4559 return float32_default_nan(status
);
4561 if ( bExp
== 0xFF ) {
4563 return propagateFloat32NaN(a
, b
, status
);
4569 float_raise(float_flag_invalid
, status
);
4570 return float32_default_nan(status
);
4572 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
4575 if ( aSig
== 0 ) return a
;
4576 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4578 expDiff
= aExp
- bExp
;
4581 if ( expDiff
< 32 ) {
4584 if ( expDiff
< 0 ) {
4585 if ( expDiff
< -1 ) return a
;
4588 q
= ( bSig
<= aSig
);
4589 if ( q
) aSig
-= bSig
;
4590 if ( 0 < expDiff
) {
4591 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
4594 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4602 if ( bSig
<= aSig
) aSig
-= bSig
;
4603 aSig64
= ( (uint64_t) aSig
)<<40;
4604 bSig64
= ( (uint64_t) bSig
)<<40;
4606 while ( 0 < expDiff
) {
4607 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
4608 q64
= ( 2 < q64
) ? q64
- 2 : 0;
4609 aSig64
= - ( ( bSig
* q64
)<<38 );
4613 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
4614 q64
= ( 2 < q64
) ? q64
- 2 : 0;
4615 q
= q64
>>( 64 - expDiff
);
4617 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
4620 alternateASig
= aSig
;
4623 } while ( 0 <= (int32_t) aSig
);
4624 sigMean
= aSig
+ alternateASig
;
4625 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4626 aSig
= alternateASig
;
4628 zSign
= ( (int32_t) aSig
< 0 );
4629 if ( zSign
) aSig
= - aSig
;
4630 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
4635 /*----------------------------------------------------------------------------
4636 | Returns the binary exponential of the single-precision floating-point value
4637 | `a'. The operation is performed according to the IEC/IEEE Standard for
4638 | Binary Floating-Point Arithmetic.
4640 | Uses the following identities:
4642 | 1. -------------------------------------------------------------------------
4646 | 2. -------------------------------------------------------------------------
4649 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4651 *----------------------------------------------------------------------------*/
4653 static const float64 float32_exp2_coefficients
[15] =
4655 const_float64( 0x3ff0000000000000ll
), /* 1 */
4656 const_float64( 0x3fe0000000000000ll
), /* 2 */
4657 const_float64( 0x3fc5555555555555ll
), /* 3 */
4658 const_float64( 0x3fa5555555555555ll
), /* 4 */
4659 const_float64( 0x3f81111111111111ll
), /* 5 */
4660 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
4661 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
4662 const_float64( 0x3efa01a01a01a01all
), /* 8 */
4663 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
4664 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
4665 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
4666 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
4667 const_float64( 0x3de6124613a86d09ll
), /* 13 */
4668 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
4669 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
4672 float32
float32_exp2(float32 a
, float_status
*status
)
4679 a
= float32_squash_input_denormal(a
, status
);
4681 aSig
= extractFloat32Frac( a
);
4682 aExp
= extractFloat32Exp( a
);
4683 aSign
= extractFloat32Sign( a
);
4685 if ( aExp
== 0xFF) {
4687 return propagateFloat32NaN(a
, float32_zero
, status
);
4689 return (aSign
) ? float32_zero
: a
;
4692 if (aSig
== 0) return float32_one
;
4695 float_raise(float_flag_inexact
, status
);
4697 /* ******************************* */
4698 /* using float64 for approximation */
4699 /* ******************************* */
4700 x
= float32_to_float64(a
, status
);
4701 x
= float64_mul(x
, float64_ln2
, status
);
4705 for (i
= 0 ; i
< 15 ; i
++) {
4708 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
4709 r
= float64_add(r
, f
, status
);
4711 xn
= float64_mul(xn
, x
, status
);
4714 return float64_to_float32(r
, status
);
4717 /*----------------------------------------------------------------------------
4718 | Returns the binary log of the single-precision floating-point value `a'.
4719 | The operation is performed according to the IEC/IEEE Standard for Binary
4720 | Floating-Point Arithmetic.
4721 *----------------------------------------------------------------------------*/
4722 float32
float32_log2(float32 a
, float_status
*status
)
4726 uint32_t aSig
, zSig
, i
;
4728 a
= float32_squash_input_denormal(a
, status
);
4729 aSig
= extractFloat32Frac( a
);
4730 aExp
= extractFloat32Exp( a
);
4731 aSign
= extractFloat32Sign( a
);
4734 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
4735 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4738 float_raise(float_flag_invalid
, status
);
4739 return float32_default_nan(status
);
4741 if ( aExp
== 0xFF ) {
4743 return propagateFloat32NaN(a
, float32_zero
, status
);
4753 for (i
= 1 << 22; i
> 0; i
>>= 1) {
4754 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
4755 if ( aSig
& 0x01000000 ) {
4764 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
4767 /*----------------------------------------------------------------------------
4768 | Returns 1 if the single-precision floating-point value `a' is equal to
4769 | the corresponding value `b', and 0 otherwise. The invalid exception is
4770 | raised if either operand is a NaN. Otherwise, the comparison is performed
4771 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4772 *----------------------------------------------------------------------------*/
4774 int float32_eq(float32 a
, float32 b
, float_status
*status
)
4777 a
= float32_squash_input_denormal(a
, status
);
4778 b
= float32_squash_input_denormal(b
, status
);
4780 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4781 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4783 float_raise(float_flag_invalid
, status
);
4786 av
= float32_val(a
);
4787 bv
= float32_val(b
);
4788 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
4791 /*----------------------------------------------------------------------------
4792 | Returns 1 if the single-precision floating-point value `a' is less than
4793 | or equal to the corresponding value `b', and 0 otherwise. The invalid
4794 | exception is raised if either operand is a NaN. The comparison is performed
4795 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4796 *----------------------------------------------------------------------------*/
4798 int float32_le(float32 a
, float32 b
, float_status
*status
)
4802 a
= float32_squash_input_denormal(a
, status
);
4803 b
= float32_squash_input_denormal(b
, status
);
4805 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4806 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4808 float_raise(float_flag_invalid
, status
);
4811 aSign
= extractFloat32Sign( a
);
4812 bSign
= extractFloat32Sign( b
);
4813 av
= float32_val(a
);
4814 bv
= float32_val(b
);
4815 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
4816 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4820 /*----------------------------------------------------------------------------
4821 | Returns 1 if the single-precision floating-point value `a' is less than
4822 | the corresponding value `b', and 0 otherwise. The invalid exception is
4823 | raised if either operand is a NaN. The comparison is performed according
4824 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4825 *----------------------------------------------------------------------------*/
4827 int float32_lt(float32 a
, float32 b
, float_status
*status
)
4831 a
= float32_squash_input_denormal(a
, status
);
4832 b
= float32_squash_input_denormal(b
, status
);
4834 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4835 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4837 float_raise(float_flag_invalid
, status
);
4840 aSign
= extractFloat32Sign( a
);
4841 bSign
= extractFloat32Sign( b
);
4842 av
= float32_val(a
);
4843 bv
= float32_val(b
);
4844 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
4845 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4849 /*----------------------------------------------------------------------------
4850 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4851 | be compared, and 0 otherwise. The invalid exception is raised if either
4852 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4853 | Standard for Binary Floating-Point Arithmetic.
4854 *----------------------------------------------------------------------------*/
4856 int float32_unordered(float32 a
, float32 b
, float_status
*status
)
4858 a
= float32_squash_input_denormal(a
, status
);
4859 b
= float32_squash_input_denormal(b
, status
);
4861 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4862 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4864 float_raise(float_flag_invalid
, status
);
4870 /*----------------------------------------------------------------------------
4871 | Returns 1 if the single-precision floating-point value `a' is equal to
4872 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4873 | exception. The comparison is performed according to the IEC/IEEE Standard
4874 | for Binary Floating-Point Arithmetic.
4875 *----------------------------------------------------------------------------*/
4877 int float32_eq_quiet(float32 a
, float32 b
, float_status
*status
)
4879 a
= float32_squash_input_denormal(a
, status
);
4880 b
= float32_squash_input_denormal(b
, status
);
4882 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4883 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4885 if (float32_is_signaling_nan(a
, status
)
4886 || float32_is_signaling_nan(b
, status
)) {
4887 float_raise(float_flag_invalid
, status
);
4891 return ( float32_val(a
) == float32_val(b
) ) ||
4892 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
4895 /*----------------------------------------------------------------------------
4896 | Returns 1 if the single-precision floating-point value `a' is less than or
4897 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4898 | cause an exception. Otherwise, the comparison is performed according to the
4899 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4900 *----------------------------------------------------------------------------*/
4902 int float32_le_quiet(float32 a
, float32 b
, float_status
*status
)
4906 a
= float32_squash_input_denormal(a
, status
);
4907 b
= float32_squash_input_denormal(b
, status
);
4909 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4910 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4912 if (float32_is_signaling_nan(a
, status
)
4913 || float32_is_signaling_nan(b
, status
)) {
4914 float_raise(float_flag_invalid
, status
);
4918 aSign
= extractFloat32Sign( a
);
4919 bSign
= extractFloat32Sign( b
);
4920 av
= float32_val(a
);
4921 bv
= float32_val(b
);
4922 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
4923 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4927 /*----------------------------------------------------------------------------
4928 | Returns 1 if the single-precision floating-point value `a' is less than
4929 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4930 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4931 | Standard for Binary Floating-Point Arithmetic.
4932 *----------------------------------------------------------------------------*/
4934 int float32_lt_quiet(float32 a
, float32 b
, float_status
*status
)
4938 a
= float32_squash_input_denormal(a
, status
);
4939 b
= float32_squash_input_denormal(b
, status
);
4941 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4942 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4944 if (float32_is_signaling_nan(a
, status
)
4945 || float32_is_signaling_nan(b
, status
)) {
4946 float_raise(float_flag_invalid
, status
);
4950 aSign
= extractFloat32Sign( a
);
4951 bSign
= extractFloat32Sign( b
);
4952 av
= float32_val(a
);
4953 bv
= float32_val(b
);
4954 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
4955 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4959 /*----------------------------------------------------------------------------
4960 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4961 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4962 | comparison is performed according to the IEC/IEEE Standard for Binary
4963 | Floating-Point Arithmetic.
4964 *----------------------------------------------------------------------------*/
4966 int float32_unordered_quiet(float32 a
, float32 b
, float_status
*status
)
4968 a
= float32_squash_input_denormal(a
, status
);
4969 b
= float32_squash_input_denormal(b
, status
);
4971 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
4972 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
4974 if (float32_is_signaling_nan(a
, status
)
4975 || float32_is_signaling_nan(b
, status
)) {
4976 float_raise(float_flag_invalid
, status
);
4983 /*----------------------------------------------------------------------------
4984 | Returns the result of converting the double-precision floating-point value
4985 | `a' to the extended double-precision floating-point format. The conversion
4986 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4988 *----------------------------------------------------------------------------*/
4990 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
4996 a
= float64_squash_input_denormal(a
, status
);
4997 aSig
= extractFloat64Frac( a
);
4998 aExp
= extractFloat64Exp( a
);
4999 aSign
= extractFloat64Sign( a
);
5000 if ( aExp
== 0x7FF ) {
5002 return commonNaNToFloatx80(float64ToCommonNaN(a
, status
), status
);
5004 return packFloatx80(aSign
,
5005 floatx80_infinity_high
,
5006 floatx80_infinity_low
);
5009 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5010 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5014 aSign
, aExp
+ 0x3C00, (aSig
| UINT64_C(0x0010000000000000)) << 11);
5018 /*----------------------------------------------------------------------------
5019 | Returns the result of converting the double-precision floating-point value
5020 | `a' to the quadruple-precision floating-point format. The conversion is
5021 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5023 *----------------------------------------------------------------------------*/
5025 float128
float64_to_float128(float64 a
, float_status
*status
)
5029 uint64_t aSig
, zSig0
, zSig1
;
5031 a
= float64_squash_input_denormal(a
, status
);
5032 aSig
= extractFloat64Frac( a
);
5033 aExp
= extractFloat64Exp( a
);
5034 aSign
= extractFloat64Sign( a
);
5035 if ( aExp
== 0x7FF ) {
5037 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
5039 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5042 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5043 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5046 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
5047 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
5052 /*----------------------------------------------------------------------------
5053 | Returns the remainder of the double-precision floating-point value `a'
5054 | with respect to the corresponding value `b'. The operation is performed
5055 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5056 *----------------------------------------------------------------------------*/
5058 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5061 int aExp
, bExp
, expDiff
;
5062 uint64_t aSig
, bSig
;
5063 uint64_t q
, alternateASig
;
5066 a
= float64_squash_input_denormal(a
, status
);
5067 b
= float64_squash_input_denormal(b
, status
);
5068 aSig
= extractFloat64Frac( a
);
5069 aExp
= extractFloat64Exp( a
);
5070 aSign
= extractFloat64Sign( a
);
5071 bSig
= extractFloat64Frac( b
);
5072 bExp
= extractFloat64Exp( b
);
5073 if ( aExp
== 0x7FF ) {
5074 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5075 return propagateFloat64NaN(a
, b
, status
);
5077 float_raise(float_flag_invalid
, status
);
5078 return float64_default_nan(status
);
5080 if ( bExp
== 0x7FF ) {
5082 return propagateFloat64NaN(a
, b
, status
);
5088 float_raise(float_flag_invalid
, status
);
5089 return float64_default_nan(status
);
5091 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5094 if ( aSig
== 0 ) return a
;
5095 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5097 expDiff
= aExp
- bExp
;
5098 aSig
= (aSig
| UINT64_C(0x0010000000000000)) << 11;
5099 bSig
= (bSig
| UINT64_C(0x0010000000000000)) << 11;
5100 if ( expDiff
< 0 ) {
5101 if ( expDiff
< -1 ) return a
;
5104 q
= ( bSig
<= aSig
);
5105 if ( q
) aSig
-= bSig
;
5107 while ( 0 < expDiff
) {
5108 q
= estimateDiv128To64( aSig
, 0, bSig
);
5109 q
= ( 2 < q
) ? q
- 2 : 0;
5110 aSig
= - ( ( bSig
>>2 ) * q
);
5114 if ( 0 < expDiff
) {
5115 q
= estimateDiv128To64( aSig
, 0, bSig
);
5116 q
= ( 2 < q
) ? q
- 2 : 0;
5119 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5126 alternateASig
= aSig
;
5129 } while ( 0 <= (int64_t) aSig
);
5130 sigMean
= aSig
+ alternateASig
;
5131 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5132 aSig
= alternateASig
;
5134 zSign
= ( (int64_t) aSig
< 0 );
5135 if ( zSign
) aSig
= - aSig
;
5136 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5140 /*----------------------------------------------------------------------------
5141 | Returns the binary log of the double-precision floating-point value `a'.
5142 | The operation is performed according to the IEC/IEEE Standard for Binary
5143 | Floating-Point Arithmetic.
5144 *----------------------------------------------------------------------------*/
5145 float64
float64_log2(float64 a
, float_status
*status
)
5149 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5150 a
= float64_squash_input_denormal(a
, status
);
5152 aSig
= extractFloat64Frac( a
);
5153 aExp
= extractFloat64Exp( a
);
5154 aSign
= extractFloat64Sign( a
);
5157 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5158 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5161 float_raise(float_flag_invalid
, status
);
5162 return float64_default_nan(status
);
5164 if ( aExp
== 0x7FF ) {
5166 return propagateFloat64NaN(a
, float64_zero
, status
);
5172 aSig
|= UINT64_C(0x0010000000000000);
5174 zSig
= (uint64_t)aExp
<< 52;
5175 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5176 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5177 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5178 if ( aSig
& UINT64_C(0x0020000000000000) ) {
5186 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5189 /*----------------------------------------------------------------------------
5190 | Returns 1 if the double-precision floating-point value `a' is equal to the
5191 | corresponding value `b', and 0 otherwise. The invalid exception is raised
5192 | if either operand is a NaN. Otherwise, the comparison is performed
5193 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5194 *----------------------------------------------------------------------------*/
5196 int float64_eq(float64 a
, float64 b
, float_status
*status
)
5199 a
= float64_squash_input_denormal(a
, status
);
5200 b
= float64_squash_input_denormal(b
, status
);
5202 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5203 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5205 float_raise(float_flag_invalid
, status
);
5208 av
= float64_val(a
);
5209 bv
= float64_val(b
);
5210 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5214 /*----------------------------------------------------------------------------
5215 | Returns 1 if the double-precision floating-point value `a' is less than or
5216 | equal to the corresponding value `b', and 0 otherwise. The invalid
5217 | exception is raised if either operand is a NaN. The comparison is performed
5218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5219 *----------------------------------------------------------------------------*/
5221 int float64_le(float64 a
, float64 b
, float_status
*status
)
5225 a
= float64_squash_input_denormal(a
, status
);
5226 b
= float64_squash_input_denormal(b
, status
);
5228 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5229 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5231 float_raise(float_flag_invalid
, status
);
5234 aSign
= extractFloat64Sign( a
);
5235 bSign
= extractFloat64Sign( b
);
5236 av
= float64_val(a
);
5237 bv
= float64_val(b
);
5238 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5239 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
5243 /*----------------------------------------------------------------------------
5244 | Returns 1 if the double-precision floating-point value `a' is less than
5245 | the corresponding value `b', and 0 otherwise. The invalid exception is
5246 | raised if either operand is a NaN. The comparison is performed according
5247 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5248 *----------------------------------------------------------------------------*/
5250 int float64_lt(float64 a
, float64 b
, float_status
*status
)
5255 a
= float64_squash_input_denormal(a
, status
);
5256 b
= float64_squash_input_denormal(b
, status
);
5257 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5258 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5260 float_raise(float_flag_invalid
, status
);
5263 aSign
= extractFloat64Sign( a
);
5264 bSign
= extractFloat64Sign( b
);
5265 av
= float64_val(a
);
5266 bv
= float64_val(b
);
5267 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
5268 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
5272 /*----------------------------------------------------------------------------
5273 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5274 | be compared, and 0 otherwise. The invalid exception is raised if either
5275 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5276 | Standard for Binary Floating-Point Arithmetic.
5277 *----------------------------------------------------------------------------*/
5279 int float64_unordered(float64 a
, float64 b
, float_status
*status
)
5281 a
= float64_squash_input_denormal(a
, status
);
5282 b
= float64_squash_input_denormal(b
, status
);
5284 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5285 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5287 float_raise(float_flag_invalid
, status
);
5293 /*----------------------------------------------------------------------------
5294 | Returns 1 if the double-precision floating-point value `a' is equal to the
5295 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5296 | exception.The comparison is performed according to the IEC/IEEE Standard
5297 | for Binary Floating-Point Arithmetic.
5298 *----------------------------------------------------------------------------*/
5300 int float64_eq_quiet(float64 a
, float64 b
, float_status
*status
)
5303 a
= float64_squash_input_denormal(a
, status
);
5304 b
= float64_squash_input_denormal(b
, status
);
5306 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5307 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5309 if (float64_is_signaling_nan(a
, status
)
5310 || float64_is_signaling_nan(b
, status
)) {
5311 float_raise(float_flag_invalid
, status
);
5315 av
= float64_val(a
);
5316 bv
= float64_val(b
);
5317 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5321 /*----------------------------------------------------------------------------
5322 | Returns 1 if the double-precision floating-point value `a' is less than or
5323 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5324 | cause an exception. Otherwise, the comparison is performed according to the
5325 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5326 *----------------------------------------------------------------------------*/
5328 int float64_le_quiet(float64 a
, float64 b
, float_status
*status
)
5332 a
= float64_squash_input_denormal(a
, status
);
5333 b
= float64_squash_input_denormal(b
, status
);
5335 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5336 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5338 if (float64_is_signaling_nan(a
, status
)
5339 || float64_is_signaling_nan(b
, status
)) {
5340 float_raise(float_flag_invalid
, status
);
5344 aSign
= extractFloat64Sign( a
);
5345 bSign
= extractFloat64Sign( b
);
5346 av
= float64_val(a
);
5347 bv
= float64_val(b
);
5348 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
5349 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
5353 /*----------------------------------------------------------------------------
5354 | Returns 1 if the double-precision floating-point value `a' is less than
5355 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5356 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5357 | Standard for Binary Floating-Point Arithmetic.
5358 *----------------------------------------------------------------------------*/
5360 int float64_lt_quiet(float64 a
, float64 b
, float_status
*status
)
5364 a
= float64_squash_input_denormal(a
, status
);
5365 b
= float64_squash_input_denormal(b
, status
);
5367 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5368 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5370 if (float64_is_signaling_nan(a
, status
)
5371 || float64_is_signaling_nan(b
, status
)) {
5372 float_raise(float_flag_invalid
, status
);
5376 aSign
= extractFloat64Sign( a
);
5377 bSign
= extractFloat64Sign( b
);
5378 av
= float64_val(a
);
5379 bv
= float64_val(b
);
5380 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
5381 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
5385 /*----------------------------------------------------------------------------
5386 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5387 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5388 | comparison is performed according to the IEC/IEEE Standard for Binary
5389 | Floating-Point Arithmetic.
5390 *----------------------------------------------------------------------------*/
5392 int float64_unordered_quiet(float64 a
, float64 b
, float_status
*status
)
5394 a
= float64_squash_input_denormal(a
, status
);
5395 b
= float64_squash_input_denormal(b
, status
);
5397 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
5398 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
5400 if (float64_is_signaling_nan(a
, status
)
5401 || float64_is_signaling_nan(b
, status
)) {
5402 float_raise(float_flag_invalid
, status
);
5409 /*----------------------------------------------------------------------------
5410 | Returns the result of converting the extended double-precision floating-
5411 | point value `a' to the 32-bit two's complement integer format. The
5412 | conversion is performed according to the IEC/IEEE Standard for Binary
5413 | Floating-Point Arithmetic---which means in particular that the conversion
5414 | is rounded according to the current rounding mode. If `a' is a NaN, the
5415 | largest positive integer is returned. Otherwise, if the conversion
5416 | overflows, the largest integer with the same sign as `a' is returned.
5417 *----------------------------------------------------------------------------*/
5419 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5422 int32_t aExp
, shiftCount
;
5425 if (floatx80_invalid_encoding(a
)) {
5426 float_raise(float_flag_invalid
, status
);
5429 aSig
= extractFloatx80Frac( a
);
5430 aExp
= extractFloatx80Exp( a
);
5431 aSign
= extractFloatx80Sign( a
);
5432 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5433 shiftCount
= 0x4037 - aExp
;
5434 if ( shiftCount
<= 0 ) shiftCount
= 1;
5435 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5436 return roundAndPackInt32(aSign
, aSig
, status
);
5440 /*----------------------------------------------------------------------------
5441 | Returns the result of converting the extended double-precision floating-
5442 | point value `a' to the 32-bit two's complement integer format. The
5443 | conversion is performed according to the IEC/IEEE Standard for Binary
5444 | Floating-Point Arithmetic, except that the conversion is always rounded
5445 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5446 | Otherwise, if the conversion overflows, the largest integer with the same
5447 | sign as `a' is returned.
5448 *----------------------------------------------------------------------------*/
5450 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5453 int32_t aExp
, shiftCount
;
5454 uint64_t aSig
, savedASig
;
5457 if (floatx80_invalid_encoding(a
)) {
5458 float_raise(float_flag_invalid
, status
);
5461 aSig
= extractFloatx80Frac( a
);
5462 aExp
= extractFloatx80Exp( a
);
5463 aSign
= extractFloatx80Sign( a
);
5464 if ( 0x401E < aExp
) {
5465 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5468 else if ( aExp
< 0x3FFF ) {
5470 status
->float_exception_flags
|= float_flag_inexact
;
5474 shiftCount
= 0x403E - aExp
;
5476 aSig
>>= shiftCount
;
5478 if ( aSign
) z
= - z
;
5479 if ( ( z
< 0 ) ^ aSign
) {
5481 float_raise(float_flag_invalid
, status
);
5482 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5484 if ( ( aSig
<<shiftCount
) != savedASig
) {
5485 status
->float_exception_flags
|= float_flag_inexact
;
5491 /*----------------------------------------------------------------------------
5492 | Returns the result of converting the extended double-precision floating-
5493 | point value `a' to the 64-bit two's complement integer format. The
5494 | conversion is performed according to the IEC/IEEE Standard for Binary
5495 | Floating-Point Arithmetic---which means in particular that the conversion
5496 | is rounded according to the current rounding mode. If `a' is a NaN,
5497 | the largest positive integer is returned. Otherwise, if the conversion
5498 | overflows, the largest integer with the same sign as `a' is returned.
5499 *----------------------------------------------------------------------------*/
5501 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5504 int32_t aExp
, shiftCount
;
5505 uint64_t aSig
, aSigExtra
;
5507 if (floatx80_invalid_encoding(a
)) {
5508 float_raise(float_flag_invalid
, status
);
5511 aSig
= extractFloatx80Frac( a
);
5512 aExp
= extractFloatx80Exp( a
);
5513 aSign
= extractFloatx80Sign( a
);
5514 shiftCount
= 0x403E - aExp
;
5515 if ( shiftCount
<= 0 ) {
5517 float_raise(float_flag_invalid
, status
);
5518 if (!aSign
|| floatx80_is_any_nan(a
)) {
5526 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5528 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5532 /*----------------------------------------------------------------------------
5533 | Returns the result of converting the extended double-precision floating-
5534 | point value `a' to the 64-bit two's complement integer format. The
5535 | conversion is performed according to the IEC/IEEE Standard for Binary
5536 | Floating-Point Arithmetic, except that the conversion is always rounded
5537 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5538 | Otherwise, if the conversion overflows, the largest integer with the same
5539 | sign as `a' is returned.
5540 *----------------------------------------------------------------------------*/
5542 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5545 int32_t aExp
, shiftCount
;
5549 if (floatx80_invalid_encoding(a
)) {
5550 float_raise(float_flag_invalid
, status
);
5553 aSig
= extractFloatx80Frac( a
);
5554 aExp
= extractFloatx80Exp( a
);
5555 aSign
= extractFloatx80Sign( a
);
5556 shiftCount
= aExp
- 0x403E;
5557 if ( 0 <= shiftCount
) {
5558 aSig
&= UINT64_C(0x7FFFFFFFFFFFFFFF);
5559 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5560 float_raise(float_flag_invalid
, status
);
5561 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5567 else if ( aExp
< 0x3FFF ) {
5569 status
->float_exception_flags
|= float_flag_inexact
;
5573 z
= aSig
>>( - shiftCount
);
5574 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5575 status
->float_exception_flags
|= float_flag_inexact
;
5577 if ( aSign
) z
= - z
;
5582 /*----------------------------------------------------------------------------
5583 | Returns the result of converting the extended double-precision floating-
5584 | point value `a' to the single-precision floating-point format. The
5585 | conversion is performed according to the IEC/IEEE Standard for Binary
5586 | Floating-Point Arithmetic.
5587 *----------------------------------------------------------------------------*/
5589 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5595 if (floatx80_invalid_encoding(a
)) {
5596 float_raise(float_flag_invalid
, status
);
5597 return float32_default_nan(status
);
5599 aSig
= extractFloatx80Frac( a
);
5600 aExp
= extractFloatx80Exp( a
);
5601 aSign
= extractFloatx80Sign( a
);
5602 if ( aExp
== 0x7FFF ) {
5603 if ( (uint64_t) ( aSig
<<1 ) ) {
5604 return commonNaNToFloat32(floatx80ToCommonNaN(a
, status
), status
);
5606 return packFloat32( aSign
, 0xFF, 0 );
5608 shift64RightJamming( aSig
, 33, &aSig
);
5609 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5610 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5614 /*----------------------------------------------------------------------------
5615 | Returns the result of converting the extended double-precision floating-
5616 | point value `a' to the double-precision floating-point format. The
5617 | conversion is performed according to the IEC/IEEE Standard for Binary
5618 | Floating-Point Arithmetic.
5619 *----------------------------------------------------------------------------*/
5621 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5625 uint64_t aSig
, zSig
;
5627 if (floatx80_invalid_encoding(a
)) {
5628 float_raise(float_flag_invalid
, status
);
5629 return float64_default_nan(status
);
5631 aSig
= extractFloatx80Frac( a
);
5632 aExp
= extractFloatx80Exp( a
);
5633 aSign
= extractFloatx80Sign( a
);
5634 if ( aExp
== 0x7FFF ) {
5635 if ( (uint64_t) ( aSig
<<1 ) ) {
5636 return commonNaNToFloat64(floatx80ToCommonNaN(a
, status
), status
);
5638 return packFloat64( aSign
, 0x7FF, 0 );
5640 shift64RightJamming( aSig
, 1, &zSig
);
5641 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5642 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5646 /*----------------------------------------------------------------------------
5647 | Returns the result of converting the extended double-precision floating-
5648 | point value `a' to the quadruple-precision floating-point format. The
5649 | conversion is performed according to the IEC/IEEE Standard for Binary
5650 | Floating-Point Arithmetic.
5651 *----------------------------------------------------------------------------*/
5653 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5657 uint64_t aSig
, zSig0
, zSig1
;
5659 if (floatx80_invalid_encoding(a
)) {
5660 float_raise(float_flag_invalid
, status
);
5661 return float128_default_nan(status
);
5663 aSig
= extractFloatx80Frac( a
);
5664 aExp
= extractFloatx80Exp( a
);
5665 aSign
= extractFloatx80Sign( a
);
5666 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5667 return commonNaNToFloat128(floatx80ToCommonNaN(a
, status
), status
);
5669 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5670 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5674 /*----------------------------------------------------------------------------
5675 | Rounds the extended double-precision floating-point value `a'
5676 | to the precision provided by floatx80_rounding_precision and returns the
5677 | result as an extended double-precision floating-point value.
5678 | The operation is performed according to the IEC/IEEE Standard for Binary
5679 | Floating-Point Arithmetic.
5680 *----------------------------------------------------------------------------*/
5682 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5684 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5685 extractFloatx80Sign(a
),
5686 extractFloatx80Exp(a
),
5687 extractFloatx80Frac(a
), 0, status
);
5690 /*----------------------------------------------------------------------------
5691 | Rounds the extended double-precision floating-point value `a' to an integer,
5692 | and returns the result as an extended quadruple-precision floating-point
5693 | value. The operation is performed according to the IEC/IEEE Standard for
5694 | Binary Floating-Point Arithmetic.
5695 *----------------------------------------------------------------------------*/
5697 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5701 uint64_t lastBitMask
, roundBitsMask
;
5704 if (floatx80_invalid_encoding(a
)) {
5705 float_raise(float_flag_invalid
, status
);
5706 return floatx80_default_nan(status
);
5708 aExp
= extractFloatx80Exp( a
);
5709 if ( 0x403E <= aExp
) {
5710 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5711 return propagateFloatx80NaN(a
, a
, status
);
5715 if ( aExp
< 0x3FFF ) {
5717 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
5720 status
->float_exception_flags
|= float_flag_inexact
;
5721 aSign
= extractFloatx80Sign( a
);
5722 switch (status
->float_rounding_mode
) {
5723 case float_round_nearest_even
:
5724 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5727 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5730 case float_round_ties_away
:
5731 if (aExp
== 0x3FFE) {
5732 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5735 case float_round_down
:
5738 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5739 : packFloatx80( 0, 0, 0 );
5740 case float_round_up
:
5742 aSign
? packFloatx80( 1, 0, 0 )
5743 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5745 return packFloatx80( aSign
, 0, 0 );
5748 lastBitMask
<<= 0x403E - aExp
;
5749 roundBitsMask
= lastBitMask
- 1;
5751 switch (status
->float_rounding_mode
) {
5752 case float_round_nearest_even
:
5753 z
.low
+= lastBitMask
>>1;
5754 if ((z
.low
& roundBitsMask
) == 0) {
5755 z
.low
&= ~lastBitMask
;
5758 case float_round_ties_away
:
5759 z
.low
+= lastBitMask
>> 1;
5761 case float_round_to_zero
:
5763 case float_round_up
:
5764 if (!extractFloatx80Sign(z
)) {
5765 z
.low
+= roundBitsMask
;
5768 case float_round_down
:
5769 if (extractFloatx80Sign(z
)) {
5770 z
.low
+= roundBitsMask
;
5776 z
.low
&= ~ roundBitsMask
;
5779 z
.low
= UINT64_C(0x8000000000000000);
5781 if (z
.low
!= a
.low
) {
5782 status
->float_exception_flags
|= float_flag_inexact
;
5788 /*----------------------------------------------------------------------------
5789 | Returns the result of adding the absolute values of the extended double-
5790 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5791 | negated before being returned. `zSign' is ignored if the result is a NaN.
5792 | The addition is performed according to the IEC/IEEE Standard for Binary
5793 | Floating-Point Arithmetic.
5794 *----------------------------------------------------------------------------*/
5796 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5797 float_status
*status
)
5799 int32_t aExp
, bExp
, zExp
;
5800 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5803 aSig
= extractFloatx80Frac( a
);
5804 aExp
= extractFloatx80Exp( a
);
5805 bSig
= extractFloatx80Frac( b
);
5806 bExp
= extractFloatx80Exp( b
);
5807 expDiff
= aExp
- bExp
;
5808 if ( 0 < expDiff
) {
5809 if ( aExp
== 0x7FFF ) {
5810 if ((uint64_t)(aSig
<< 1)) {
5811 return propagateFloatx80NaN(a
, b
, status
);
5815 if ( bExp
== 0 ) --expDiff
;
5816 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5819 else if ( expDiff
< 0 ) {
5820 if ( bExp
== 0x7FFF ) {
5821 if ((uint64_t)(bSig
<< 1)) {
5822 return propagateFloatx80NaN(a
, b
, status
);
5824 return packFloatx80(zSign
,
5825 floatx80_infinity_high
,
5826 floatx80_infinity_low
);
5828 if ( aExp
== 0 ) ++expDiff
;
5829 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5833 if ( aExp
== 0x7FFF ) {
5834 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5835 return propagateFloatx80NaN(a
, b
, status
);
5840 zSig0
= aSig
+ bSig
;
5842 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5848 zSig0
= aSig
+ bSig
;
5849 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5851 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5852 zSig0
|= UINT64_C(0x8000000000000000);
5855 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5856 zSign
, zExp
, zSig0
, zSig1
, status
);
5859 /*----------------------------------------------------------------------------
5860 | Returns the result of subtracting the absolute values of the extended
5861 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5862 | difference is negated before being returned. `zSign' is ignored if the
5863 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5864 | Standard for Binary Floating-Point Arithmetic.
5865 *----------------------------------------------------------------------------*/
5867 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
5868 float_status
*status
)
5870 int32_t aExp
, bExp
, zExp
;
5871 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5874 aSig
= extractFloatx80Frac( a
);
5875 aExp
= extractFloatx80Exp( a
);
5876 bSig
= extractFloatx80Frac( b
);
5877 bExp
= extractFloatx80Exp( b
);
5878 expDiff
= aExp
- bExp
;
5879 if ( 0 < expDiff
) goto aExpBigger
;
5880 if ( expDiff
< 0 ) goto bExpBigger
;
5881 if ( aExp
== 0x7FFF ) {
5882 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5883 return propagateFloatx80NaN(a
, b
, status
);
5885 float_raise(float_flag_invalid
, status
);
5886 return floatx80_default_nan(status
);
5893 if ( bSig
< aSig
) goto aBigger
;
5894 if ( aSig
< bSig
) goto bBigger
;
5895 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5897 if ( bExp
== 0x7FFF ) {
5898 if ((uint64_t)(bSig
<< 1)) {
5899 return propagateFloatx80NaN(a
, b
, status
);
5901 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
5902 floatx80_infinity_low
);
5904 if ( aExp
== 0 ) ++expDiff
;
5905 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5907 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5910 goto normalizeRoundAndPack
;
5912 if ( aExp
== 0x7FFF ) {
5913 if ((uint64_t)(aSig
<< 1)) {
5914 return propagateFloatx80NaN(a
, b
, status
);
5918 if ( bExp
== 0 ) --expDiff
;
5919 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5921 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5923 normalizeRoundAndPack
:
5924 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5925 zSign
, zExp
, zSig0
, zSig1
, status
);
5928 /*----------------------------------------------------------------------------
5929 | Returns the result of adding the extended double-precision floating-point
5930 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5931 | Standard for Binary Floating-Point Arithmetic.
5932 *----------------------------------------------------------------------------*/
5934 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5938 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5939 float_raise(float_flag_invalid
, status
);
5940 return floatx80_default_nan(status
);
5942 aSign
= extractFloatx80Sign( a
);
5943 bSign
= extractFloatx80Sign( b
);
5944 if ( aSign
== bSign
) {
5945 return addFloatx80Sigs(a
, b
, aSign
, status
);
5948 return subFloatx80Sigs(a
, b
, aSign
, status
);
5953 /*----------------------------------------------------------------------------
5954 | Returns the result of subtracting the extended double-precision floating-
5955 | point values `a' and `b'. The operation is performed according to the
5956 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5957 *----------------------------------------------------------------------------*/
5959 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5963 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5964 float_raise(float_flag_invalid
, status
);
5965 return floatx80_default_nan(status
);
5967 aSign
= extractFloatx80Sign( a
);
5968 bSign
= extractFloatx80Sign( b
);
5969 if ( aSign
== bSign
) {
5970 return subFloatx80Sigs(a
, b
, aSign
, status
);
5973 return addFloatx80Sigs(a
, b
, aSign
, status
);
5978 /*----------------------------------------------------------------------------
5979 | Returns the result of multiplying the extended double-precision floating-
5980 | point values `a' and `b'. The operation is performed according to the
5981 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5982 *----------------------------------------------------------------------------*/
5984 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5986 flag aSign
, bSign
, zSign
;
5987 int32_t aExp
, bExp
, zExp
;
5988 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5990 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5991 float_raise(float_flag_invalid
, status
);
5992 return floatx80_default_nan(status
);
5994 aSig
= extractFloatx80Frac( a
);
5995 aExp
= extractFloatx80Exp( a
);
5996 aSign
= extractFloatx80Sign( a
);
5997 bSig
= extractFloatx80Frac( b
);
5998 bExp
= extractFloatx80Exp( b
);
5999 bSign
= extractFloatx80Sign( b
);
6000 zSign
= aSign
^ bSign
;
6001 if ( aExp
== 0x7FFF ) {
6002 if ( (uint64_t) ( aSig
<<1 )
6003 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6004 return propagateFloatx80NaN(a
, b
, status
);
6006 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
6007 return packFloatx80(zSign
, floatx80_infinity_high
,
6008 floatx80_infinity_low
);
6010 if ( bExp
== 0x7FFF ) {
6011 if ((uint64_t)(bSig
<< 1)) {
6012 return propagateFloatx80NaN(a
, b
, status
);
6014 if ( ( aExp
| aSig
) == 0 ) {
6016 float_raise(float_flag_invalid
, status
);
6017 return floatx80_default_nan(status
);
6019 return packFloatx80(zSign
, floatx80_infinity_high
,
6020 floatx80_infinity_low
);
6023 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6024 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6027 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6028 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6030 zExp
= aExp
+ bExp
- 0x3FFE;
6031 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
6032 if ( 0 < (int64_t) zSig0
) {
6033 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6036 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6037 zSign
, zExp
, zSig0
, zSig1
, status
);
6040 /*----------------------------------------------------------------------------
6041 | Returns the result of dividing the extended double-precision floating-point
6042 | value `a' by the corresponding value `b'. The operation is performed
6043 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6044 *----------------------------------------------------------------------------*/
6046 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6048 flag aSign
, bSign
, zSign
;
6049 int32_t aExp
, bExp
, zExp
;
6050 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6051 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6053 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6054 float_raise(float_flag_invalid
, status
);
6055 return floatx80_default_nan(status
);
6057 aSig
= extractFloatx80Frac( a
);
6058 aExp
= extractFloatx80Exp( a
);
6059 aSign
= extractFloatx80Sign( a
);
6060 bSig
= extractFloatx80Frac( b
);
6061 bExp
= extractFloatx80Exp( b
);
6062 bSign
= extractFloatx80Sign( b
);
6063 zSign
= aSign
^ bSign
;
6064 if ( aExp
== 0x7FFF ) {
6065 if ((uint64_t)(aSig
<< 1)) {
6066 return propagateFloatx80NaN(a
, b
, status
);
6068 if ( bExp
== 0x7FFF ) {
6069 if ((uint64_t)(bSig
<< 1)) {
6070 return propagateFloatx80NaN(a
, b
, status
);
6074 return packFloatx80(zSign
, floatx80_infinity_high
,
6075 floatx80_infinity_low
);
6077 if ( bExp
== 0x7FFF ) {
6078 if ((uint64_t)(bSig
<< 1)) {
6079 return propagateFloatx80NaN(a
, b
, status
);
6081 return packFloatx80( zSign
, 0, 0 );
6085 if ( ( aExp
| aSig
) == 0 ) {
6087 float_raise(float_flag_invalid
, status
);
6088 return floatx80_default_nan(status
);
6090 float_raise(float_flag_divbyzero
, status
);
6091 return packFloatx80(zSign
, floatx80_infinity_high
,
6092 floatx80_infinity_low
);
6094 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6097 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6098 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6100 zExp
= aExp
- bExp
+ 0x3FFE;
6102 if ( bSig
<= aSig
) {
6103 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6106 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6107 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6108 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6109 while ( (int64_t) rem0
< 0 ) {
6111 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6113 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6114 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6115 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6116 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6117 while ( (int64_t) rem1
< 0 ) {
6119 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6121 zSig1
|= ( ( rem1
| rem2
) != 0 );
6123 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6124 zSign
, zExp
, zSig0
, zSig1
, status
);
6127 /*----------------------------------------------------------------------------
6128 | Returns the remainder of the extended double-precision floating-point value
6129 | `a' with respect to the corresponding value `b'. The operation is performed
6130 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6131 *----------------------------------------------------------------------------*/
6133 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6136 int32_t aExp
, bExp
, expDiff
;
6137 uint64_t aSig0
, aSig1
, bSig
;
6138 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6140 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6141 float_raise(float_flag_invalid
, status
);
6142 return floatx80_default_nan(status
);
6144 aSig0
= extractFloatx80Frac( a
);
6145 aExp
= extractFloatx80Exp( a
);
6146 aSign
= extractFloatx80Sign( a
);
6147 bSig
= extractFloatx80Frac( b
);
6148 bExp
= extractFloatx80Exp( b
);
6149 if ( aExp
== 0x7FFF ) {
6150 if ( (uint64_t) ( aSig0
<<1 )
6151 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6152 return propagateFloatx80NaN(a
, b
, status
);
6156 if ( bExp
== 0x7FFF ) {
6157 if ((uint64_t)(bSig
<< 1)) {
6158 return propagateFloatx80NaN(a
, b
, status
);
6165 float_raise(float_flag_invalid
, status
);
6166 return floatx80_default_nan(status
);
6168 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6171 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
6172 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6174 bSig
|= UINT64_C(0x8000000000000000);
6176 expDiff
= aExp
- bExp
;
6178 if ( expDiff
< 0 ) {
6179 if ( expDiff
< -1 ) return a
;
6180 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6183 q
= ( bSig
<= aSig0
);
6184 if ( q
) aSig0
-= bSig
;
6186 while ( 0 < expDiff
) {
6187 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6188 q
= ( 2 < q
) ? q
- 2 : 0;
6189 mul64To128( bSig
, q
, &term0
, &term1
);
6190 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6191 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6195 if ( 0 < expDiff
) {
6196 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6197 q
= ( 2 < q
) ? q
- 2 : 0;
6199 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6200 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6201 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6202 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6204 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6211 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6212 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6213 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6216 aSig0
= alternateASig0
;
6217 aSig1
= alternateASig1
;
6221 normalizeRoundAndPackFloatx80(
6222 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6226 /*----------------------------------------------------------------------------
6227 | Returns the square root of the extended double-precision floating-point
6228 | value `a'. The operation is performed according to the IEC/IEEE Standard
6229 | for Binary Floating-Point Arithmetic.
6230 *----------------------------------------------------------------------------*/
6232 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6236 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6237 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6239 if (floatx80_invalid_encoding(a
)) {
6240 float_raise(float_flag_invalid
, status
);
6241 return floatx80_default_nan(status
);
6243 aSig0
= extractFloatx80Frac( a
);
6244 aExp
= extractFloatx80Exp( a
);
6245 aSign
= extractFloatx80Sign( a
);
6246 if ( aExp
== 0x7FFF ) {
6247 if ((uint64_t)(aSig0
<< 1)) {
6248 return propagateFloatx80NaN(a
, a
, status
);
6250 if ( ! aSign
) return a
;
6254 if ( ( aExp
| aSig0
) == 0 ) return a
;
6256 float_raise(float_flag_invalid
, status
);
6257 return floatx80_default_nan(status
);
6260 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6261 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6263 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6264 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6265 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6266 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6267 doubleZSig0
= zSig0
<<1;
6268 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6269 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6270 while ( (int64_t) rem0
< 0 ) {
6273 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6275 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6276 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6277 if ( zSig1
== 0 ) zSig1
= 1;
6278 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6279 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6280 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6281 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6282 while ( (int64_t) rem1
< 0 ) {
6284 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6286 term2
|= doubleZSig0
;
6287 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6289 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6291 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6292 zSig0
|= doubleZSig0
;
6293 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6294 0, zExp
, zSig0
, zSig1
, status
);
6297 /*----------------------------------------------------------------------------
6298 | Returns 1 if the extended double-precision floating-point value `a' is equal
6299 | to the corresponding value `b', and 0 otherwise. The invalid exception is
6300 | raised if either operand is a NaN. Otherwise, the comparison is performed
6301 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6302 *----------------------------------------------------------------------------*/
6304 int floatx80_eq(floatx80 a
, floatx80 b
, float_status
*status
)
6307 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6308 || (extractFloatx80Exp(a
) == 0x7FFF
6309 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6310 || (extractFloatx80Exp(b
) == 0x7FFF
6311 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6313 float_raise(float_flag_invalid
, status
);
6318 && ( ( a
.high
== b
.high
)
6320 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6325 /*----------------------------------------------------------------------------
6326 | Returns 1 if the extended double-precision floating-point value `a' is
6327 | less than or equal to the corresponding value `b', and 0 otherwise. The
6328 | invalid exception is raised if either operand is a NaN. The comparison is
6329 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6331 *----------------------------------------------------------------------------*/
6333 int floatx80_le(floatx80 a
, floatx80 b
, float_status
*status
)
6337 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6338 || (extractFloatx80Exp(a
) == 0x7FFF
6339 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6340 || (extractFloatx80Exp(b
) == 0x7FFF
6341 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6343 float_raise(float_flag_invalid
, status
);
6346 aSign
= extractFloatx80Sign( a
);
6347 bSign
= extractFloatx80Sign( b
);
6348 if ( aSign
!= bSign
) {
6351 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6355 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6356 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6360 /*----------------------------------------------------------------------------
6361 | Returns 1 if the extended double-precision floating-point value `a' is
6362 | less than the corresponding value `b', and 0 otherwise. The invalid
6363 | exception is raised if either operand is a NaN. The comparison is performed
6364 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6365 *----------------------------------------------------------------------------*/
6367 int floatx80_lt(floatx80 a
, floatx80 b
, float_status
*status
)
6371 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6372 || (extractFloatx80Exp(a
) == 0x7FFF
6373 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6374 || (extractFloatx80Exp(b
) == 0x7FFF
6375 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6377 float_raise(float_flag_invalid
, status
);
6380 aSign
= extractFloatx80Sign( a
);
6381 bSign
= extractFloatx80Sign( b
);
6382 if ( aSign
!= bSign
) {
6385 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6389 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6390 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6394 /*----------------------------------------------------------------------------
6395 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6396 | cannot be compared, and 0 otherwise. The invalid exception is raised if
6397 | either operand is a NaN. The comparison is performed according to the
6398 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6399 *----------------------------------------------------------------------------*/
6400 int floatx80_unordered(floatx80 a
, floatx80 b
, float_status
*status
)
6402 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
6403 || (extractFloatx80Exp(a
) == 0x7FFF
6404 && (uint64_t) (extractFloatx80Frac(a
) << 1))
6405 || (extractFloatx80Exp(b
) == 0x7FFF
6406 && (uint64_t) (extractFloatx80Frac(b
) << 1))
6408 float_raise(float_flag_invalid
, status
);
6414 /*----------------------------------------------------------------------------
6415 | Returns 1 if the extended double-precision floating-point value `a' is
6416 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6417 | cause an exception. The comparison is performed according to the IEC/IEEE
6418 | Standard for Binary Floating-Point Arithmetic.
6419 *----------------------------------------------------------------------------*/
6421 int floatx80_eq_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6424 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6425 float_raise(float_flag_invalid
, status
);
6428 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6429 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6430 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6431 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6433 if (floatx80_is_signaling_nan(a
, status
)
6434 || floatx80_is_signaling_nan(b
, status
)) {
6435 float_raise(float_flag_invalid
, status
);
6441 && ( ( a
.high
== b
.high
)
6443 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6448 /*----------------------------------------------------------------------------
6449 | Returns 1 if the extended double-precision floating-point value `a' is less
6450 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
6451 | do not cause an exception. Otherwise, the comparison is performed according
6452 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6453 *----------------------------------------------------------------------------*/
6455 int floatx80_le_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6459 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6460 float_raise(float_flag_invalid
, status
);
6463 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6464 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6465 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6466 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6468 if (floatx80_is_signaling_nan(a
, status
)
6469 || floatx80_is_signaling_nan(b
, status
)) {
6470 float_raise(float_flag_invalid
, status
);
6474 aSign
= extractFloatx80Sign( a
);
6475 bSign
= extractFloatx80Sign( b
);
6476 if ( aSign
!= bSign
) {
6479 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6483 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6484 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6488 /*----------------------------------------------------------------------------
6489 | Returns 1 if the extended double-precision floating-point value `a' is less
6490 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
6491 | an exception. Otherwise, the comparison is performed according to the
6492 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6493 *----------------------------------------------------------------------------*/
6495 int floatx80_lt_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6499 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6500 float_raise(float_flag_invalid
, status
);
6503 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6504 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6505 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6506 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6508 if (floatx80_is_signaling_nan(a
, status
)
6509 || floatx80_is_signaling_nan(b
, status
)) {
6510 float_raise(float_flag_invalid
, status
);
6514 aSign
= extractFloatx80Sign( a
);
6515 bSign
= extractFloatx80Sign( b
);
6516 if ( aSign
!= bSign
) {
6519 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6523 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6524 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6528 /*----------------------------------------------------------------------------
6529 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6530 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
6531 | The comparison is performed according to the IEC/IEEE Standard for Binary
6532 | Floating-Point Arithmetic.
6533 *----------------------------------------------------------------------------*/
6534 int floatx80_unordered_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6536 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6537 float_raise(float_flag_invalid
, status
);
6540 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
6541 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
6542 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
6543 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
6545 if (floatx80_is_signaling_nan(a
, status
)
6546 || floatx80_is_signaling_nan(b
, status
)) {
6547 float_raise(float_flag_invalid
, status
);
6554 /*----------------------------------------------------------------------------
6555 | Returns the result of converting the quadruple-precision floating-point
6556 | value `a' to the 32-bit two's complement integer format. The conversion
6557 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6558 | Arithmetic---which means in particular that the conversion is rounded
6559 | according to the current rounding mode. If `a' is a NaN, the largest
6560 | positive integer is returned. Otherwise, if the conversion overflows, the
6561 | largest integer with the same sign as `a' is returned.
6562 *----------------------------------------------------------------------------*/
6564 int32_t float128_to_int32(float128 a
, float_status
*status
)
6567 int32_t aExp
, shiftCount
;
6568 uint64_t aSig0
, aSig1
;
6570 aSig1
= extractFloat128Frac1( a
);
6571 aSig0
= extractFloat128Frac0( a
);
6572 aExp
= extractFloat128Exp( a
);
6573 aSign
= extractFloat128Sign( a
);
6574 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6575 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6576 aSig0
|= ( aSig1
!= 0 );
6577 shiftCount
= 0x4028 - aExp
;
6578 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6579 return roundAndPackInt32(aSign
, aSig0
, status
);
6583 /*----------------------------------------------------------------------------
6584 | Returns the result of converting the quadruple-precision floating-point
6585 | value `a' to the 32-bit two's complement integer format. The conversion
6586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6587 | Arithmetic, except that the conversion is always rounded toward zero. If
6588 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6589 | conversion overflows, the largest integer with the same sign as `a' is
6591 *----------------------------------------------------------------------------*/
6593 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6596 int32_t aExp
, shiftCount
;
6597 uint64_t aSig0
, aSig1
, savedASig
;
6600 aSig1
= extractFloat128Frac1( a
);
6601 aSig0
= extractFloat128Frac0( a
);
6602 aExp
= extractFloat128Exp( a
);
6603 aSign
= extractFloat128Sign( a
);
6604 aSig0
|= ( aSig1
!= 0 );
6605 if ( 0x401E < aExp
) {
6606 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6609 else if ( aExp
< 0x3FFF ) {
6610 if (aExp
|| aSig0
) {
6611 status
->float_exception_flags
|= float_flag_inexact
;
6615 aSig0
|= UINT64_C(0x0001000000000000);
6616 shiftCount
= 0x402F - aExp
;
6618 aSig0
>>= shiftCount
;
6620 if ( aSign
) z
= - z
;
6621 if ( ( z
< 0 ) ^ aSign
) {
6623 float_raise(float_flag_invalid
, status
);
6624 return aSign
? INT32_MIN
: INT32_MAX
;
6626 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6627 status
->float_exception_flags
|= float_flag_inexact
;
6633 /*----------------------------------------------------------------------------
6634 | Returns the result of converting the quadruple-precision floating-point
6635 | value `a' to the 64-bit two's complement integer format. The conversion
6636 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6637 | Arithmetic---which means in particular that the conversion is rounded
6638 | according to the current rounding mode. If `a' is a NaN, the largest
6639 | positive integer is returned. Otherwise, if the conversion overflows, the
6640 | largest integer with the same sign as `a' is returned.
6641 *----------------------------------------------------------------------------*/
6643 int64_t float128_to_int64(float128 a
, float_status
*status
)
6646 int32_t aExp
, shiftCount
;
6647 uint64_t aSig0
, aSig1
;
6649 aSig1
= extractFloat128Frac1( a
);
6650 aSig0
= extractFloat128Frac0( a
);
6651 aExp
= extractFloat128Exp( a
);
6652 aSign
= extractFloat128Sign( a
);
6653 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6654 shiftCount
= 0x402F - aExp
;
6655 if ( shiftCount
<= 0 ) {
6656 if ( 0x403E < aExp
) {
6657 float_raise(float_flag_invalid
, status
);
6659 || ( ( aExp
== 0x7FFF )
6660 && ( aSig1
|| ( aSig0
!= UINT64_C(0x0001000000000000) ) )
6667 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6670 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6672 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6676 /*----------------------------------------------------------------------------
6677 | Returns the result of converting the quadruple-precision floating-point
6678 | value `a' to the 64-bit two's complement integer format. The conversion
6679 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6680 | Arithmetic, except that the conversion is always rounded toward zero.
6681 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6682 | the conversion overflows, the largest integer with the same sign as `a' is
6684 *----------------------------------------------------------------------------*/
6686 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6689 int32_t aExp
, shiftCount
;
6690 uint64_t aSig0
, aSig1
;
6693 aSig1
= extractFloat128Frac1( a
);
6694 aSig0
= extractFloat128Frac0( a
);
6695 aExp
= extractFloat128Exp( a
);
6696 aSign
= extractFloat128Sign( a
);
6697 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6698 shiftCount
= aExp
- 0x402F;
6699 if ( 0 < shiftCount
) {
6700 if ( 0x403E <= aExp
) {
6701 aSig0
&= UINT64_C(0x0000FFFFFFFFFFFF);
6702 if ( ( a
.high
== UINT64_C(0xC03E000000000000) )
6703 && ( aSig1
< UINT64_C(0x0002000000000000) ) ) {
6705 status
->float_exception_flags
|= float_flag_inexact
;
6709 float_raise(float_flag_invalid
, status
);
6710 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6716 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6717 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6718 status
->float_exception_flags
|= float_flag_inexact
;
6722 if ( aExp
< 0x3FFF ) {
6723 if ( aExp
| aSig0
| aSig1
) {
6724 status
->float_exception_flags
|= float_flag_inexact
;
6728 z
= aSig0
>>( - shiftCount
);
6730 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6731 status
->float_exception_flags
|= float_flag_inexact
;
6734 if ( aSign
) z
= - z
;
6739 /*----------------------------------------------------------------------------
6740 | Returns the result of converting the quadruple-precision floating-point value
6741 | `a' to the 64-bit unsigned integer format. The conversion is
6742 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6743 | Arithmetic---which means in particular that the conversion is rounded
6744 | according to the current rounding mode. If `a' is a NaN, the largest
6745 | positive integer is returned. If the conversion overflows, the
6746 | largest unsigned integer is returned. If 'a' is negative, the value is
6747 | rounded and zero is returned; negative values that do not round to zero
6748 | will raise the inexact exception.
6749 *----------------------------------------------------------------------------*/
6751 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6756 uint64_t aSig0
, aSig1
;
6758 aSig0
= extractFloat128Frac0(a
);
6759 aSig1
= extractFloat128Frac1(a
);
6760 aExp
= extractFloat128Exp(a
);
6761 aSign
= extractFloat128Sign(a
);
6762 if (aSign
&& (aExp
> 0x3FFE)) {
6763 float_raise(float_flag_invalid
, status
);
6764 if (float128_is_any_nan(a
)) {
6771 aSig0
|= UINT64_C(0x0001000000000000);
6773 shiftCount
= 0x402F - aExp
;
6774 if (shiftCount
<= 0) {
6775 if (0x403E < aExp
) {
6776 float_raise(float_flag_invalid
, status
);
6779 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6781 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6783 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6786 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6789 signed char current_rounding_mode
= status
->float_rounding_mode
;
6791 set_float_rounding_mode(float_round_to_zero
, status
);
6792 v
= float128_to_uint64(a
, status
);
6793 set_float_rounding_mode(current_rounding_mode
, status
);
6798 /*----------------------------------------------------------------------------
6799 | Returns the result of converting the quadruple-precision floating-point
6800 | value `a' to the 32-bit unsigned integer format. The conversion
6801 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6802 | Arithmetic except that the conversion is always rounded toward zero.
6803 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6804 | if the conversion overflows, the largest unsigned integer is returned.
6805 | If 'a' is negative, the value is rounded and zero is returned; negative
6806 | values that do not round to zero will raise the inexact exception.
6807 *----------------------------------------------------------------------------*/
6809 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6813 int old_exc_flags
= get_float_exception_flags(status
);
6815 v
= float128_to_uint64_round_to_zero(a
, status
);
6816 if (v
> 0xffffffff) {
6821 set_float_exception_flags(old_exc_flags
, status
);
6822 float_raise(float_flag_invalid
, status
);
6826 /*----------------------------------------------------------------------------
6827 | Returns the result of converting the quadruple-precision floating-point value
6828 | `a' to the 32-bit unsigned integer format. The conversion is
6829 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6830 | Arithmetic---which means in particular that the conversion is rounded
6831 | according to the current rounding mode. If `a' is a NaN, the largest
6832 | positive integer is returned. If the conversion overflows, the
6833 | largest unsigned integer is returned. If 'a' is negative, the value is
6834 | rounded and zero is returned; negative values that do not round to zero
6835 | will raise the inexact exception.
6836 *----------------------------------------------------------------------------*/
6838 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6842 int old_exc_flags
= get_float_exception_flags(status
);
6844 v
= float128_to_uint64(a
, status
);
6845 if (v
> 0xffffffff) {
6850 set_float_exception_flags(old_exc_flags
, status
);
6851 float_raise(float_flag_invalid
, status
);
6855 /*----------------------------------------------------------------------------
6856 | Returns the result of converting the quadruple-precision floating-point
6857 | value `a' to the single-precision floating-point format. The conversion
6858 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6860 *----------------------------------------------------------------------------*/
6862 float32
float128_to_float32(float128 a
, float_status
*status
)
6866 uint64_t aSig0
, aSig1
;
6869 aSig1
= extractFloat128Frac1( a
);
6870 aSig0
= extractFloat128Frac0( a
);
6871 aExp
= extractFloat128Exp( a
);
6872 aSign
= extractFloat128Sign( a
);
6873 if ( aExp
== 0x7FFF ) {
6874 if ( aSig0
| aSig1
) {
6875 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6877 return packFloat32( aSign
, 0xFF, 0 );
6879 aSig0
|= ( aSig1
!= 0 );
6880 shift64RightJamming( aSig0
, 18, &aSig0
);
6882 if ( aExp
|| zSig
) {
6886 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6890 /*----------------------------------------------------------------------------
6891 | Returns the result of converting the quadruple-precision floating-point
6892 | value `a' to the double-precision floating-point format. The conversion
6893 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6895 *----------------------------------------------------------------------------*/
6897 float64
float128_to_float64(float128 a
, float_status
*status
)
6901 uint64_t aSig0
, aSig1
;
6903 aSig1
= extractFloat128Frac1( a
);
6904 aSig0
= extractFloat128Frac0( a
);
6905 aExp
= extractFloat128Exp( a
);
6906 aSign
= extractFloat128Sign( a
);
6907 if ( aExp
== 0x7FFF ) {
6908 if ( aSig0
| aSig1
) {
6909 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6911 return packFloat64( aSign
, 0x7FF, 0 );
6913 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6914 aSig0
|= ( aSig1
!= 0 );
6915 if ( aExp
|| aSig0
) {
6916 aSig0
|= UINT64_C(0x4000000000000000);
6919 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6923 /*----------------------------------------------------------------------------
6924 | Returns the result of converting the quadruple-precision floating-point
6925 | value `a' to the extended double-precision floating-point format. The
6926 | conversion is performed according to the IEC/IEEE Standard for Binary
6927 | Floating-Point Arithmetic.
6928 *----------------------------------------------------------------------------*/
6930 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6934 uint64_t aSig0
, aSig1
;
6936 aSig1
= extractFloat128Frac1( a
);
6937 aSig0
= extractFloat128Frac0( a
);
6938 aExp
= extractFloat128Exp( a
);
6939 aSign
= extractFloat128Sign( a
);
6940 if ( aExp
== 0x7FFF ) {
6941 if ( aSig0
| aSig1
) {
6942 return commonNaNToFloatx80(float128ToCommonNaN(a
, status
), status
);
6944 return packFloatx80(aSign
, floatx80_infinity_high
,
6945 floatx80_infinity_low
);
6948 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6949 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6952 aSig0
|= UINT64_C(0x0001000000000000);
6954 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6955 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6959 /*----------------------------------------------------------------------------
6960 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6961 | returns the result as a quadruple-precision floating-point value. The
6962 | operation is performed according to the IEC/IEEE Standard for Binary
6963 | Floating-Point Arithmetic.
6964 *----------------------------------------------------------------------------*/
6966 float128
float128_round_to_int(float128 a
, float_status
*status
)
6970 uint64_t lastBitMask
, roundBitsMask
;
6973 aExp
= extractFloat128Exp( a
);
6974 if ( 0x402F <= aExp
) {
6975 if ( 0x406F <= aExp
) {
6976 if ( ( aExp
== 0x7FFF )
6977 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6979 return propagateFloat128NaN(a
, a
, status
);
6984 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6985 roundBitsMask
= lastBitMask
- 1;
6987 switch (status
->float_rounding_mode
) {
6988 case float_round_nearest_even
:
6989 if ( lastBitMask
) {
6990 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6991 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6994 if ( (int64_t) z
.low
< 0 ) {
6996 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
7000 case float_round_ties_away
:
7002 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
7004 if ((int64_t) z
.low
< 0) {
7009 case float_round_to_zero
:
7011 case float_round_up
:
7012 if (!extractFloat128Sign(z
)) {
7013 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
7016 case float_round_down
:
7017 if (extractFloat128Sign(z
)) {
7018 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
7021 case float_round_to_odd
:
7023 * Note that if lastBitMask == 0, the last bit is the lsb
7024 * of high, and roundBitsMask == -1.
7026 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
7027 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
7033 z
.low
&= ~ roundBitsMask
;
7036 if ( aExp
< 0x3FFF ) {
7037 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
7038 status
->float_exception_flags
|= float_flag_inexact
;
7039 aSign
= extractFloat128Sign( a
);
7040 switch (status
->float_rounding_mode
) {
7041 case float_round_nearest_even
:
7042 if ( ( aExp
== 0x3FFE )
7043 && ( extractFloat128Frac0( a
)
7044 | extractFloat128Frac1( a
) )
7046 return packFloat128( aSign
, 0x3FFF, 0, 0 );
7049 case float_round_ties_away
:
7050 if (aExp
== 0x3FFE) {
7051 return packFloat128(aSign
, 0x3FFF, 0, 0);
7054 case float_round_down
:
7056 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
7057 : packFloat128( 0, 0, 0, 0 );
7058 case float_round_up
:
7060 aSign
? packFloat128( 1, 0, 0, 0 )
7061 : packFloat128( 0, 0x3FFF, 0, 0 );
7063 case float_round_to_odd
:
7064 return packFloat128(aSign
, 0x3FFF, 0, 0);
7066 return packFloat128( aSign
, 0, 0, 0 );
7069 lastBitMask
<<= 0x402F - aExp
;
7070 roundBitsMask
= lastBitMask
- 1;
7073 switch (status
->float_rounding_mode
) {
7074 case float_round_nearest_even
:
7075 z
.high
+= lastBitMask
>>1;
7076 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
7077 z
.high
&= ~ lastBitMask
;
7080 case float_round_ties_away
:
7081 z
.high
+= lastBitMask
>>1;
7083 case float_round_to_zero
:
7085 case float_round_up
:
7086 if (!extractFloat128Sign(z
)) {
7087 z
.high
|= ( a
.low
!= 0 );
7088 z
.high
+= roundBitsMask
;
7091 case float_round_down
:
7092 if (extractFloat128Sign(z
)) {
7093 z
.high
|= (a
.low
!= 0);
7094 z
.high
+= roundBitsMask
;
7097 case float_round_to_odd
:
7098 if ((z
.high
& lastBitMask
) == 0) {
7099 z
.high
|= (a
.low
!= 0);
7100 z
.high
+= roundBitsMask
;
7106 z
.high
&= ~ roundBitsMask
;
7108 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
7109 status
->float_exception_flags
|= float_flag_inexact
;
7115 /*----------------------------------------------------------------------------
7116 | Returns the result of adding the absolute values of the quadruple-precision
7117 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
7118 | before being returned. `zSign' is ignored if the result is a NaN.
7119 | The addition is performed according to the IEC/IEEE Standard for Binary
7120 | Floating-Point Arithmetic.
7121 *----------------------------------------------------------------------------*/
7123 static float128
addFloat128Sigs(float128 a
, float128 b
, flag zSign
,
7124 float_status
*status
)
7126 int32_t aExp
, bExp
, zExp
;
7127 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7130 aSig1
= extractFloat128Frac1( a
);
7131 aSig0
= extractFloat128Frac0( a
);
7132 aExp
= extractFloat128Exp( a
);
7133 bSig1
= extractFloat128Frac1( b
);
7134 bSig0
= extractFloat128Frac0( b
);
7135 bExp
= extractFloat128Exp( b
);
7136 expDiff
= aExp
- bExp
;
7137 if ( 0 < expDiff
) {
7138 if ( aExp
== 0x7FFF ) {
7139 if (aSig0
| aSig1
) {
7140 return propagateFloat128NaN(a
, b
, status
);
7148 bSig0
|= UINT64_C(0x0001000000000000);
7150 shift128ExtraRightJamming(
7151 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
7154 else if ( expDiff
< 0 ) {
7155 if ( bExp
== 0x7FFF ) {
7156 if (bSig0
| bSig1
) {
7157 return propagateFloat128NaN(a
, b
, status
);
7159 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7165 aSig0
|= UINT64_C(0x0001000000000000);
7167 shift128ExtraRightJamming(
7168 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
7172 if ( aExp
== 0x7FFF ) {
7173 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7174 return propagateFloat128NaN(a
, b
, status
);
7178 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7180 if (status
->flush_to_zero
) {
7181 if (zSig0
| zSig1
) {
7182 float_raise(float_flag_output_denormal
, status
);
7184 return packFloat128(zSign
, 0, 0, 0);
7186 return packFloat128( zSign
, 0, zSig0
, zSig1
);
7189 zSig0
|= UINT64_C(0x0002000000000000);
7193 aSig0
|= UINT64_C(0x0001000000000000);
7194 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7196 if ( zSig0
< UINT64_C(0x0002000000000000) ) goto roundAndPack
;
7199 shift128ExtraRightJamming(
7200 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7202 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7206 /*----------------------------------------------------------------------------
7207 | Returns the result of subtracting the absolute values of the quadruple-
7208 | precision floating-point values `a' and `b'. If `zSign' is 1, the
7209 | difference is negated before being returned. `zSign' is ignored if the
7210 | result is a NaN. The subtraction is performed according to the IEC/IEEE
7211 | Standard for Binary Floating-Point Arithmetic.
7212 *----------------------------------------------------------------------------*/
7214 static float128
subFloat128Sigs(float128 a
, float128 b
, flag zSign
,
7215 float_status
*status
)
7217 int32_t aExp
, bExp
, zExp
;
7218 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
7221 aSig1
= extractFloat128Frac1( a
);
7222 aSig0
= extractFloat128Frac0( a
);
7223 aExp
= extractFloat128Exp( a
);
7224 bSig1
= extractFloat128Frac1( b
);
7225 bSig0
= extractFloat128Frac0( b
);
7226 bExp
= extractFloat128Exp( b
);
7227 expDiff
= aExp
- bExp
;
7228 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
7229 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
7230 if ( 0 < expDiff
) goto aExpBigger
;
7231 if ( expDiff
< 0 ) goto bExpBigger
;
7232 if ( aExp
== 0x7FFF ) {
7233 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7234 return propagateFloat128NaN(a
, b
, status
);
7236 float_raise(float_flag_invalid
, status
);
7237 return float128_default_nan(status
);
7243 if ( bSig0
< aSig0
) goto aBigger
;
7244 if ( aSig0
< bSig0
) goto bBigger
;
7245 if ( bSig1
< aSig1
) goto aBigger
;
7246 if ( aSig1
< bSig1
) goto bBigger
;
7247 return packFloat128(status
->float_rounding_mode
== float_round_down
,
7250 if ( bExp
== 0x7FFF ) {
7251 if (bSig0
| bSig1
) {
7252 return propagateFloat128NaN(a
, b
, status
);
7254 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
7260 aSig0
|= UINT64_C(0x4000000000000000);
7262 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7263 bSig0
|= UINT64_C(0x4000000000000000);
7265 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7268 goto normalizeRoundAndPack
;
7270 if ( aExp
== 0x7FFF ) {
7271 if (aSig0
| aSig1
) {
7272 return propagateFloat128NaN(a
, b
, status
);
7280 bSig0
|= UINT64_C(0x4000000000000000);
7282 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
7283 aSig0
|= UINT64_C(0x4000000000000000);
7285 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7287 normalizeRoundAndPack
:
7289 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
7294 /*----------------------------------------------------------------------------
7295 | Returns the result of adding the quadruple-precision floating-point values
7296 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7297 | for Binary Floating-Point Arithmetic.
7298 *----------------------------------------------------------------------------*/
7300 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
7304 aSign
= extractFloat128Sign( a
);
7305 bSign
= extractFloat128Sign( b
);
7306 if ( aSign
== bSign
) {
7307 return addFloat128Sigs(a
, b
, aSign
, status
);
7310 return subFloat128Sigs(a
, b
, aSign
, status
);
7315 /*----------------------------------------------------------------------------
7316 | Returns the result of subtracting the quadruple-precision floating-point
7317 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7318 | Standard for Binary Floating-Point Arithmetic.
7319 *----------------------------------------------------------------------------*/
7321 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
7325 aSign
= extractFloat128Sign( a
);
7326 bSign
= extractFloat128Sign( b
);
7327 if ( aSign
== bSign
) {
7328 return subFloat128Sigs(a
, b
, aSign
, status
);
7331 return addFloat128Sigs(a
, b
, aSign
, status
);
7336 /*----------------------------------------------------------------------------
7337 | Returns the result of multiplying the quadruple-precision floating-point
7338 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7339 | Standard for Binary Floating-Point Arithmetic.
7340 *----------------------------------------------------------------------------*/
7342 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
7344 flag aSign
, bSign
, zSign
;
7345 int32_t aExp
, bExp
, zExp
;
7346 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
7348 aSig1
= extractFloat128Frac1( a
);
7349 aSig0
= extractFloat128Frac0( a
);
7350 aExp
= extractFloat128Exp( a
);
7351 aSign
= extractFloat128Sign( a
);
7352 bSig1
= extractFloat128Frac1( b
);
7353 bSig0
= extractFloat128Frac0( b
);
7354 bExp
= extractFloat128Exp( b
);
7355 bSign
= extractFloat128Sign( b
);
7356 zSign
= aSign
^ bSign
;
7357 if ( aExp
== 0x7FFF ) {
7358 if ( ( aSig0
| aSig1
)
7359 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7360 return propagateFloat128NaN(a
, b
, status
);
7362 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
7363 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7365 if ( bExp
== 0x7FFF ) {
7366 if (bSig0
| bSig1
) {
7367 return propagateFloat128NaN(a
, b
, status
);
7369 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7371 float_raise(float_flag_invalid
, status
);
7372 return float128_default_nan(status
);
7374 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7377 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7378 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7381 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7382 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7384 zExp
= aExp
+ bExp
- 0x4000;
7385 aSig0
|= UINT64_C(0x0001000000000000);
7386 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
7387 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
7388 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7389 zSig2
|= ( zSig3
!= 0 );
7390 if (UINT64_C( 0x0002000000000000) <= zSig0
) {
7391 shift128ExtraRightJamming(
7392 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7395 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7399 /*----------------------------------------------------------------------------
7400 | Returns the result of dividing the quadruple-precision floating-point value
7401 | `a' by the corresponding value `b'. The operation is performed according to
7402 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7403 *----------------------------------------------------------------------------*/
7405 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7407 flag aSign
, bSign
, zSign
;
7408 int32_t aExp
, bExp
, zExp
;
7409 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7410 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7412 aSig1
= extractFloat128Frac1( a
);
7413 aSig0
= extractFloat128Frac0( a
);
7414 aExp
= extractFloat128Exp( a
);
7415 aSign
= extractFloat128Sign( a
);
7416 bSig1
= extractFloat128Frac1( b
);
7417 bSig0
= extractFloat128Frac0( b
);
7418 bExp
= extractFloat128Exp( b
);
7419 bSign
= extractFloat128Sign( b
);
7420 zSign
= aSign
^ bSign
;
7421 if ( aExp
== 0x7FFF ) {
7422 if (aSig0
| aSig1
) {
7423 return propagateFloat128NaN(a
, b
, status
);
7425 if ( bExp
== 0x7FFF ) {
7426 if (bSig0
| bSig1
) {
7427 return propagateFloat128NaN(a
, b
, status
);
7431 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7433 if ( bExp
== 0x7FFF ) {
7434 if (bSig0
| bSig1
) {
7435 return propagateFloat128NaN(a
, b
, status
);
7437 return packFloat128( zSign
, 0, 0, 0 );
7440 if ( ( bSig0
| bSig1
) == 0 ) {
7441 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7443 float_raise(float_flag_invalid
, status
);
7444 return float128_default_nan(status
);
7446 float_raise(float_flag_divbyzero
, status
);
7447 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7449 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7452 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7453 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7455 zExp
= aExp
- bExp
+ 0x3FFD;
7457 aSig0
| UINT64_C(0x0001000000000000), aSig1
, 15, &aSig0
, &aSig1
);
7459 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7460 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7461 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7464 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7465 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7466 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7467 while ( (int64_t) rem0
< 0 ) {
7469 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7471 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7472 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7473 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7474 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7475 while ( (int64_t) rem1
< 0 ) {
7477 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7479 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7481 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7482 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7486 /*----------------------------------------------------------------------------
7487 | Returns the remainder of the quadruple-precision floating-point value `a'
7488 | with respect to the corresponding value `b'. The operation is performed
7489 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7490 *----------------------------------------------------------------------------*/
7492 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7495 int32_t aExp
, bExp
, expDiff
;
7496 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7497 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7500 aSig1
= extractFloat128Frac1( a
);
7501 aSig0
= extractFloat128Frac0( a
);
7502 aExp
= extractFloat128Exp( a
);
7503 aSign
= extractFloat128Sign( a
);
7504 bSig1
= extractFloat128Frac1( b
);
7505 bSig0
= extractFloat128Frac0( b
);
7506 bExp
= extractFloat128Exp( b
);
7507 if ( aExp
== 0x7FFF ) {
7508 if ( ( aSig0
| aSig1
)
7509 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7510 return propagateFloat128NaN(a
, b
, status
);
7514 if ( bExp
== 0x7FFF ) {
7515 if (bSig0
| bSig1
) {
7516 return propagateFloat128NaN(a
, b
, status
);
7521 if ( ( bSig0
| bSig1
) == 0 ) {
7523 float_raise(float_flag_invalid
, status
);
7524 return float128_default_nan(status
);
7526 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7529 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7530 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7532 expDiff
= aExp
- bExp
;
7533 if ( expDiff
< -1 ) return a
;
7535 aSig0
| UINT64_C(0x0001000000000000),
7537 15 - ( expDiff
< 0 ),
7542 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7543 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7544 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7546 while ( 0 < expDiff
) {
7547 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7548 q
= ( 4 < q
) ? q
- 4 : 0;
7549 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7550 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7551 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7552 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7555 if ( -64 < expDiff
) {
7556 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7557 q
= ( 4 < q
) ? q
- 4 : 0;
7559 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7561 if ( expDiff
< 0 ) {
7562 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7565 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7567 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7568 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7571 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7572 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7575 alternateASig0
= aSig0
;
7576 alternateASig1
= aSig1
;
7578 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7579 } while ( 0 <= (int64_t) aSig0
);
7581 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7582 if ( ( sigMean0
< 0 )
7583 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7584 aSig0
= alternateASig0
;
7585 aSig1
= alternateASig1
;
7587 zSign
= ( (int64_t) aSig0
< 0 );
7588 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7589 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7593 /*----------------------------------------------------------------------------
7594 | Returns the square root of the quadruple-precision floating-point value `a'.
7595 | The operation is performed according to the IEC/IEEE Standard for Binary
7596 | Floating-Point Arithmetic.
7597 *----------------------------------------------------------------------------*/
7599 float128
float128_sqrt(float128 a
, float_status
*status
)
7603 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7604 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7606 aSig1
= extractFloat128Frac1( a
);
7607 aSig0
= extractFloat128Frac0( a
);
7608 aExp
= extractFloat128Exp( a
);
7609 aSign
= extractFloat128Sign( a
);
7610 if ( aExp
== 0x7FFF ) {
7611 if (aSig0
| aSig1
) {
7612 return propagateFloat128NaN(a
, a
, status
);
7614 if ( ! aSign
) return a
;
7618 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7620 float_raise(float_flag_invalid
, status
);
7621 return float128_default_nan(status
);
7624 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7625 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7627 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7628 aSig0
|= UINT64_C(0x0001000000000000);
7629 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7630 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7631 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7632 doubleZSig0
= zSig0
<<1;
7633 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7634 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7635 while ( (int64_t) rem0
< 0 ) {
7638 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7640 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7641 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7642 if ( zSig1
== 0 ) zSig1
= 1;
7643 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7644 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7645 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7646 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7647 while ( (int64_t) rem1
< 0 ) {
7649 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7651 term2
|= doubleZSig0
;
7652 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7654 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7656 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7657 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7661 /*----------------------------------------------------------------------------
7662 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7663 | the corresponding value `b', and 0 otherwise. The invalid exception is
7664 | raised if either operand is a NaN. Otherwise, the comparison is performed
7665 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7666 *----------------------------------------------------------------------------*/
7668 int float128_eq(float128 a
, float128 b
, float_status
*status
)
7671 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7672 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7673 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7674 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7676 float_raise(float_flag_invalid
, status
);
7681 && ( ( a
.high
== b
.high
)
7683 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
7688 /*----------------------------------------------------------------------------
7689 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7690 | or equal to the corresponding value `b', and 0 otherwise. The invalid
7691 | exception is raised if either operand is a NaN. The comparison is performed
7692 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7693 *----------------------------------------------------------------------------*/
7695 int float128_le(float128 a
, float128 b
, float_status
*status
)
7699 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7700 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7701 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7702 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7704 float_raise(float_flag_invalid
, status
);
7707 aSign
= extractFloat128Sign( a
);
7708 bSign
= extractFloat128Sign( b
);
7709 if ( aSign
!= bSign
) {
7712 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7716 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
7717 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
7721 /*----------------------------------------------------------------------------
7722 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7723 | the corresponding value `b', and 0 otherwise. The invalid exception is
7724 | raised if either operand is a NaN. The comparison is performed according
7725 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7726 *----------------------------------------------------------------------------*/
7728 int float128_lt(float128 a
, float128 b
, float_status
*status
)
7732 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7733 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7734 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7735 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7737 float_raise(float_flag_invalid
, status
);
7740 aSign
= extractFloat128Sign( a
);
7741 bSign
= extractFloat128Sign( b
);
7742 if ( aSign
!= bSign
) {
7745 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7749 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
7750 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
7754 /*----------------------------------------------------------------------------
7755 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7756 | be compared, and 0 otherwise. The invalid exception is raised if either
7757 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7758 | Standard for Binary Floating-Point Arithmetic.
7759 *----------------------------------------------------------------------------*/
7761 int float128_unordered(float128 a
, float128 b
, float_status
*status
)
7763 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7764 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7765 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7766 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7768 float_raise(float_flag_invalid
, status
);
7774 /*----------------------------------------------------------------------------
7775 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7776 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7777 | exception. The comparison is performed according to the IEC/IEEE Standard
7778 | for Binary Floating-Point Arithmetic.
7779 *----------------------------------------------------------------------------*/
7781 int float128_eq_quiet(float128 a
, float128 b
, float_status
*status
)
7784 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7785 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7786 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7787 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7789 if (float128_is_signaling_nan(a
, status
)
7790 || float128_is_signaling_nan(b
, status
)) {
7791 float_raise(float_flag_invalid
, status
);
7797 && ( ( a
.high
== b
.high
)
7799 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
7804 /*----------------------------------------------------------------------------
7805 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7806 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
7807 | cause an exception. Otherwise, the comparison is performed according to the
7808 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7809 *----------------------------------------------------------------------------*/
7811 int float128_le_quiet(float128 a
, float128 b
, float_status
*status
)
7815 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7816 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7817 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7818 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7820 if (float128_is_signaling_nan(a
, status
)
7821 || float128_is_signaling_nan(b
, status
)) {
7822 float_raise(float_flag_invalid
, status
);
7826 aSign
= extractFloat128Sign( a
);
7827 bSign
= extractFloat128Sign( b
);
7828 if ( aSign
!= bSign
) {
7831 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7835 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
7836 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
7840 /*----------------------------------------------------------------------------
7841 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7842 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7843 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
7844 | Standard for Binary Floating-Point Arithmetic.
7845 *----------------------------------------------------------------------------*/
7847 int float128_lt_quiet(float128 a
, float128 b
, float_status
*status
)
7851 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7852 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7853 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7854 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7856 if (float128_is_signaling_nan(a
, status
)
7857 || float128_is_signaling_nan(b
, status
)) {
7858 float_raise(float_flag_invalid
, status
);
7862 aSign
= extractFloat128Sign( a
);
7863 bSign
= extractFloat128Sign( b
);
7864 if ( aSign
!= bSign
) {
7867 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
7871 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
7872 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
7876 /*----------------------------------------------------------------------------
7877 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7878 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7879 | comparison is performed according to the IEC/IEEE Standard for Binary
7880 | Floating-Point Arithmetic.
7881 *----------------------------------------------------------------------------*/
7883 int float128_unordered_quiet(float128 a
, float128 b
, float_status
*status
)
7885 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
7886 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
7887 || ( ( extractFloat128Exp( b
) == 0x7FFF )
7888 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
7890 if (float128_is_signaling_nan(a
, status
)
7891 || float128_is_signaling_nan(b
, status
)) {
7892 float_raise(float_flag_invalid
, status
);
7899 static inline int floatx80_compare_internal(floatx80 a
, floatx80 b
,
7900 int is_quiet
, float_status
*status
)
7904 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7905 float_raise(float_flag_invalid
, status
);
7906 return float_relation_unordered
;
7908 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7909 ( extractFloatx80Frac( a
)<<1 ) ) ||
7910 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7911 ( extractFloatx80Frac( b
)<<1 ) )) {
7913 floatx80_is_signaling_nan(a
, status
) ||
7914 floatx80_is_signaling_nan(b
, status
)) {
7915 float_raise(float_flag_invalid
, status
);
7917 return float_relation_unordered
;
7919 aSign
= extractFloatx80Sign( a
);
7920 bSign
= extractFloatx80Sign( b
);
7921 if ( aSign
!= bSign
) {
7923 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7924 ( ( a
.low
| b
.low
) == 0 ) ) {
7926 return float_relation_equal
;
7928 return 1 - (2 * aSign
);
7931 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7932 return float_relation_equal
;
7934 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7939 int floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7941 return floatx80_compare_internal(a
, b
, 0, status
);
7944 int floatx80_compare_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
7946 return floatx80_compare_internal(a
, b
, 1, status
);
7949 static inline int float128_compare_internal(float128 a
, float128 b
,
7950 int is_quiet
, float_status
*status
)
7954 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7955 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7956 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7957 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7959 float128_is_signaling_nan(a
, status
) ||
7960 float128_is_signaling_nan(b
, status
)) {
7961 float_raise(float_flag_invalid
, status
);
7963 return float_relation_unordered
;
7965 aSign
= extractFloat128Sign( a
);
7966 bSign
= extractFloat128Sign( b
);
7967 if ( aSign
!= bSign
) {
7968 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7970 return float_relation_equal
;
7972 return 1 - (2 * aSign
);
7975 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7976 return float_relation_equal
;
7978 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7983 int float128_compare(float128 a
, float128 b
, float_status
*status
)
7985 return float128_compare_internal(a
, b
, 0, status
);
7988 int float128_compare_quiet(float128 a
, float128 b
, float_status
*status
)
7990 return float128_compare_internal(a
, b
, 1, status
);
7993 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7999 if (floatx80_invalid_encoding(a
)) {
8000 float_raise(float_flag_invalid
, status
);
8001 return floatx80_default_nan(status
);
8003 aSig
= extractFloatx80Frac( a
);
8004 aExp
= extractFloatx80Exp( a
);
8005 aSign
= extractFloatx80Sign( a
);
8007 if ( aExp
== 0x7FFF ) {
8009 return propagateFloatx80NaN(a
, a
, status
);
8023 } else if (n
< -0x10000) {
8028 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
8029 aSign
, aExp
, aSig
, 0, status
);
8032 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
8036 uint64_t aSig0
, aSig1
;
8038 aSig1
= extractFloat128Frac1( a
);
8039 aSig0
= extractFloat128Frac0( a
);
8040 aExp
= extractFloat128Exp( a
);
8041 aSign
= extractFloat128Sign( a
);
8042 if ( aExp
== 0x7FFF ) {
8043 if ( aSig0
| aSig1
) {
8044 return propagateFloat128NaN(a
, a
, status
);
8049 aSig0
|= UINT64_C(0x0001000000000000);
8050 } else if (aSig0
== 0 && aSig1
== 0) {
8058 } else if (n
< -0x10000) {
8063 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
8068 static void __attribute__((constructor
)) softfloat_init(void)
8070 union_float64 ua
, ub
, uc
, ur
;
8072 if (QEMU_NO_HARDFLOAT
) {
8076 * Test that the host's FMA is not obviously broken. For example,
8077 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
8078 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
8080 ua
.s
= 0x0020000000000001ULL
;
8081 ub
.s
= 0x3ca0000000000000ULL
;
8082 uc
.s
= 0x0020000000000000ULL
;
8083 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
8084 if (ur
.s
!= 0x0020000000000001ULL
) {
8085 force_soft_fma
= true;