4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 s->float_exception_flags |= float_flag_input_denormal; \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
215 # define QEMU_HARDFLOAT_USE_ISINF 0
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 static inline float32
343 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
344 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
345 f32_check_fn pre
, f32_check_fn post
)
347 union_float32 ua
, ub
, ur
;
352 if (unlikely(!can_use_fpu(s
))) {
356 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
357 if (unlikely(!pre(ua
, ub
))) {
361 ur
.h
= hard(ua
.h
, ub
.h
);
362 if (unlikely(f32_is_inf(ur
))) {
363 s
->float_exception_flags
|= float_flag_overflow
;
364 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
) && post(ua
, ub
)) {
370 return soft(ua
.s
, ub
.s
, s
);
373 static inline float64
374 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
375 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
376 f64_check_fn pre
, f64_check_fn post
)
378 union_float64 ua
, ub
, ur
;
383 if (unlikely(!can_use_fpu(s
))) {
387 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
388 if (unlikely(!pre(ua
, ub
))) {
392 ur
.h
= hard(ua
.h
, ub
.h
);
393 if (unlikely(f64_is_inf(ur
))) {
394 s
->float_exception_flags
|= float_flag_overflow
;
395 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
) && post(ua
, ub
)) {
401 return soft(ua
.s
, ub
.s
, s
);
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 static inline uint32_t extractFloat32Frac(float32 a
)
410 return float32_val(a
) & 0x007FFFFF;
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
417 static inline int extractFloat32Exp(float32 a
)
419 return (float32_val(a
) >> 23) & 0xFF;
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
426 static inline bool extractFloat32Sign(float32 a
)
428 return float32_val(a
) >> 31;
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
435 static inline uint64_t extractFloat64Frac(float64 a
)
437 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
444 static inline int extractFloat64Exp(float64 a
)
446 return (float64_val(a
) >> 52) & 0x7FF;
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 static inline bool extractFloat64Sign(float64 a
)
455 return float64_val(a
) >> 63;
459 * Classify a floating point number. Everything above float_class_qnan
460 * is a NaN so cls >= float_class_qnan is any NaN.
463 typedef enum __attribute__ ((__packed__
)) {
464 float_class_unclassified
,
468 float_class_qnan
, /* all NaNs from here */
472 /* Simple helpers for checking if, or what kind of, NaN we have */
473 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
475 return unlikely(c
>= float_class_qnan
);
478 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
480 return c
== float_class_snan
;
483 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
485 return c
== float_class_qnan
;
489 * Structure holding all of the decomposed parts of a float. The
490 * exponent is unbiased and the fraction is normalized. All
491 * calculations are done with a 64 bit fraction and then rounded as
492 * appropriate for the final format.
494 * Thanks to the packed FloatClass a decent compiler should be able to
495 * fit the whole structure into registers and avoid using the stack
496 * for parameter passing.
506 #define DECOMPOSED_BINARY_POINT (64 - 2)
507 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
508 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
510 /* Structure holding all of the relevant parameters for a format.
511 * exp_size: the size of the exponent field
512 * exp_bias: the offset applied to the exponent field
513 * exp_max: the maximum normalised exponent
514 * frac_size: the size of the fraction field
515 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
516 * The following are computed based the size of fraction
517 * frac_lsb: least significant bit of fraction
518 * frac_lsbm1: the bit below the least significant bit (for rounding)
519 * round_mask/roundeven_mask: masks used for rounding
520 * The following optional modifiers are available:
521 * arm_althp: handle ARM Alternative Half Precision
532 uint64_t roundeven_mask
;
536 /* Expand fields based on the size of exponent and fraction */
537 #define FLOAT_PARAMS(E, F) \
539 .exp_bias = ((1 << E) - 1) >> 1, \
540 .exp_max = (1 << E) - 1, \
542 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
543 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
544 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
545 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
546 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
548 static const FloatFmt float16_params
= {
552 static const FloatFmt float16_params_ahp
= {
557 static const FloatFmt float32_params
= {
561 static const FloatFmt float64_params
= {
565 /* Unpack a float to parts, but do not canonicalize. */
566 static inline FloatParts
unpack_raw(FloatFmt fmt
, uint64_t raw
)
568 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
570 return (FloatParts
) {
571 .cls
= float_class_unclassified
,
572 .sign
= extract64(raw
, sign_pos
, 1),
573 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
574 .frac
= extract64(raw
, 0, fmt
.frac_size
),
578 static inline FloatParts
float16_unpack_raw(float16 f
)
580 return unpack_raw(float16_params
, f
);
583 static inline FloatParts
float32_unpack_raw(float32 f
)
585 return unpack_raw(float32_params
, f
);
588 static inline FloatParts
float64_unpack_raw(float64 f
)
590 return unpack_raw(float64_params
, f
);
593 /* Pack a float from parts, but do not canonicalize. */
594 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts p
)
596 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
597 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
598 return deposit64(ret
, sign_pos
, 1, p
.sign
);
601 static inline float16
float16_pack_raw(FloatParts p
)
603 return make_float16(pack_raw(float16_params
, p
));
606 static inline float32
float32_pack_raw(FloatParts p
)
608 return make_float32(pack_raw(float32_params
, p
));
611 static inline float64
float64_pack_raw(FloatParts p
)
613 return make_float64(pack_raw(float64_params
, p
));
616 /*----------------------------------------------------------------------------
617 | Functions and definitions to determine: (1) whether tininess for underflow
618 | is detected before or after rounding by default, (2) what (if anything)
619 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
620 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
621 | are propagated from function inputs to output. These details are target-
623 *----------------------------------------------------------------------------*/
624 #include "softfloat-specialize.inc.c"
626 /* Canonicalize EXP and FRAC, setting CLS. */
627 static FloatParts
sf_canonicalize(FloatParts part
, const FloatFmt
*parm
,
628 float_status
*status
)
630 if (part
.exp
== parm
->exp_max
&& !parm
->arm_althp
) {
631 if (part
.frac
== 0) {
632 part
.cls
= float_class_inf
;
634 part
.frac
<<= parm
->frac_shift
;
635 part
.cls
= (parts_is_snan_frac(part
.frac
, status
)
636 ? float_class_snan
: float_class_qnan
);
638 } else if (part
.exp
== 0) {
639 if (likely(part
.frac
== 0)) {
640 part
.cls
= float_class_zero
;
641 } else if (status
->flush_inputs_to_zero
) {
642 float_raise(float_flag_input_denormal
, status
);
643 part
.cls
= float_class_zero
;
646 int shift
= clz64(part
.frac
) - 1;
647 part
.cls
= float_class_normal
;
648 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
652 part
.cls
= float_class_normal
;
653 part
.exp
-= parm
->exp_bias
;
654 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
659 /* Round and uncanonicalize a floating-point number by parts. There
660 * are FRAC_SHIFT bits that may require rounding at the bottom of the
661 * fraction; these bits will be removed. The exponent will be biased
662 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
665 static FloatParts
round_canonical(FloatParts p
, float_status
*s
,
666 const FloatFmt
*parm
)
668 const uint64_t frac_lsb
= parm
->frac_lsb
;
669 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
670 const uint64_t round_mask
= parm
->round_mask
;
671 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
672 const int exp_max
= parm
->exp_max
;
673 const int frac_shift
= parm
->frac_shift
;
682 case float_class_normal
:
683 switch (s
->float_rounding_mode
) {
684 case float_round_nearest_even
:
685 overflow_norm
= false;
686 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
688 case float_round_ties_away
:
689 overflow_norm
= false;
692 case float_round_to_zero
:
693 overflow_norm
= true;
697 inc
= p
.sign
? 0 : round_mask
;
698 overflow_norm
= p
.sign
;
700 case float_round_down
:
701 inc
= p
.sign
? round_mask
: 0;
702 overflow_norm
= !p
.sign
;
704 case float_round_to_odd
:
705 overflow_norm
= true;
706 inc
= frac
& frac_lsb
? 0 : round_mask
;
709 g_assert_not_reached();
712 exp
+= parm
->exp_bias
;
713 if (likely(exp
> 0)) {
714 if (frac
& round_mask
) {
715 flags
|= float_flag_inexact
;
717 if (frac
& DECOMPOSED_OVERFLOW_BIT
) {
724 if (parm
->arm_althp
) {
725 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
726 if (unlikely(exp
> exp_max
)) {
727 /* Overflow. Return the maximum normal. */
728 flags
= float_flag_invalid
;
732 } else if (unlikely(exp
>= exp_max
)) {
733 flags
|= float_flag_overflow
| float_flag_inexact
;
738 p
.cls
= float_class_inf
;
742 } else if (s
->flush_to_zero
) {
743 flags
|= float_flag_output_denormal
;
744 p
.cls
= float_class_zero
;
747 bool is_tiny
= s
->tininess_before_rounding
749 || !((frac
+ inc
) & DECOMPOSED_OVERFLOW_BIT
);
751 shift64RightJamming(frac
, 1 - exp
, &frac
);
752 if (frac
& round_mask
) {
753 /* Need to recompute round-to-even. */
754 switch (s
->float_rounding_mode
) {
755 case float_round_nearest_even
:
756 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
759 case float_round_to_odd
:
760 inc
= frac
& frac_lsb
? 0 : round_mask
;
765 flags
|= float_flag_inexact
;
769 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
772 if (is_tiny
&& (flags
& float_flag_inexact
)) {
773 flags
|= float_flag_underflow
;
775 if (exp
== 0 && frac
== 0) {
776 p
.cls
= float_class_zero
;
781 case float_class_zero
:
787 case float_class_inf
:
789 assert(!parm
->arm_althp
);
794 case float_class_qnan
:
795 case float_class_snan
:
796 assert(!parm
->arm_althp
);
798 frac
>>= parm
->frac_shift
;
802 g_assert_not_reached();
805 float_raise(flags
, s
);
811 /* Explicit FloatFmt version */
812 static FloatParts
float16a_unpack_canonical(float16 f
, float_status
*s
,
813 const FloatFmt
*params
)
815 return sf_canonicalize(float16_unpack_raw(f
), params
, s
);
818 static FloatParts
float16_unpack_canonical(float16 f
, float_status
*s
)
820 return float16a_unpack_canonical(f
, s
, &float16_params
);
823 static float16
float16a_round_pack_canonical(FloatParts p
, float_status
*s
,
824 const FloatFmt
*params
)
826 return float16_pack_raw(round_canonical(p
, s
, params
));
829 static float16
float16_round_pack_canonical(FloatParts p
, float_status
*s
)
831 return float16a_round_pack_canonical(p
, s
, &float16_params
);
834 static FloatParts
float32_unpack_canonical(float32 f
, float_status
*s
)
836 return sf_canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
839 static float32
float32_round_pack_canonical(FloatParts p
, float_status
*s
)
841 return float32_pack_raw(round_canonical(p
, s
, &float32_params
));
844 static FloatParts
float64_unpack_canonical(float64 f
, float_status
*s
)
846 return sf_canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
849 static float64
float64_round_pack_canonical(FloatParts p
, float_status
*s
)
851 return float64_pack_raw(round_canonical(p
, s
, &float64_params
));
854 static FloatParts
return_nan(FloatParts a
, float_status
*s
)
857 case float_class_snan
:
858 s
->float_exception_flags
|= float_flag_invalid
;
859 a
= parts_silence_nan(a
, s
);
861 case float_class_qnan
:
862 if (s
->default_nan_mode
) {
863 return parts_default_nan(s
);
868 g_assert_not_reached();
873 static FloatParts
pick_nan(FloatParts a
, FloatParts b
, float_status
*s
)
875 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
876 s
->float_exception_flags
|= float_flag_invalid
;
879 if (s
->default_nan_mode
) {
880 return parts_default_nan(s
);
882 if (pickNaN(a
.cls
, b
.cls
,
884 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
))) {
887 if (is_snan(a
.cls
)) {
888 return parts_silence_nan(a
, s
);
894 static FloatParts
pick_nan_muladd(FloatParts a
, FloatParts b
, FloatParts c
,
895 bool inf_zero
, float_status
*s
)
899 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
900 s
->float_exception_flags
|= float_flag_invalid
;
903 which
= pickNaNMulAdd(a
.cls
, b
.cls
, c
.cls
, inf_zero
, s
);
905 if (s
->default_nan_mode
) {
906 /* Note that this check is after pickNaNMulAdd so that function
907 * has an opportunity to set the Invalid flag.
922 return parts_default_nan(s
);
924 g_assert_not_reached();
927 if (is_snan(a
.cls
)) {
928 return parts_silence_nan(a
, s
);
934 * Returns the result of adding or subtracting the values of the
935 * floating-point values `a' and `b'. The operation is performed
936 * according to the IEC/IEEE Standard for Binary Floating-Point
940 static FloatParts
addsub_floats(FloatParts a
, FloatParts b
, bool subtract
,
943 bool a_sign
= a
.sign
;
944 bool b_sign
= b
.sign
^ subtract
;
946 if (a_sign
!= b_sign
) {
949 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
950 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
951 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
952 a
.frac
= a
.frac
- b
.frac
;
954 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
955 a
.frac
= b
.frac
- a
.frac
;
961 a
.cls
= float_class_zero
;
962 a
.sign
= s
->float_rounding_mode
== float_round_down
;
964 int shift
= clz64(a
.frac
) - 1;
965 a
.frac
= a
.frac
<< shift
;
966 a
.exp
= a
.exp
- shift
;
971 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
972 return pick_nan(a
, b
, s
);
974 if (a
.cls
== float_class_inf
) {
975 if (b
.cls
== float_class_inf
) {
976 float_raise(float_flag_invalid
, s
);
977 return parts_default_nan(s
);
981 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
982 a
.sign
= s
->float_rounding_mode
== float_round_down
;
985 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
989 if (b
.cls
== float_class_zero
) {
994 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
996 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
997 } else if (a
.exp
< b
.exp
) {
998 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
1002 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
1003 shift64RightJamming(a
.frac
, 1, &a
.frac
);
1008 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1009 return pick_nan(a
, b
, s
);
1011 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1014 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1019 g_assert_not_reached();
1023 * Returns the result of adding or subtracting the floating-point
1024 * values `a' and `b'. The operation is performed according to the
1025 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1028 float16 QEMU_FLATTEN
float16_add(float16 a
, float16 b
, float_status
*status
)
1030 FloatParts pa
= float16_unpack_canonical(a
, status
);
1031 FloatParts pb
= float16_unpack_canonical(b
, status
);
1032 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
1034 return float16_round_pack_canonical(pr
, status
);
1037 float16 QEMU_FLATTEN
float16_sub(float16 a
, float16 b
, float_status
*status
)
1039 FloatParts pa
= float16_unpack_canonical(a
, status
);
1040 FloatParts pb
= float16_unpack_canonical(b
, status
);
1041 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
1043 return float16_round_pack_canonical(pr
, status
);
1046 static float32 QEMU_SOFTFLOAT_ATTR
1047 soft_f32_addsub(float32 a
, float32 b
, bool subtract
, float_status
*status
)
1049 FloatParts pa
= float32_unpack_canonical(a
, status
);
1050 FloatParts pb
= float32_unpack_canonical(b
, status
);
1051 FloatParts pr
= addsub_floats(pa
, pb
, subtract
, status
);
1053 return float32_round_pack_canonical(pr
, status
);
1056 static inline float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1058 return soft_f32_addsub(a
, b
, false, status
);
1061 static inline float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1063 return soft_f32_addsub(a
, b
, true, status
);
1066 static float64 QEMU_SOFTFLOAT_ATTR
1067 soft_f64_addsub(float64 a
, float64 b
, bool subtract
, float_status
*status
)
1069 FloatParts pa
= float64_unpack_canonical(a
, status
);
1070 FloatParts pb
= float64_unpack_canonical(b
, status
);
1071 FloatParts pr
= addsub_floats(pa
, pb
, subtract
, status
);
1073 return float64_round_pack_canonical(pr
, status
);
1076 static inline float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1078 return soft_f64_addsub(a
, b
, false, status
);
1081 static inline float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1083 return soft_f64_addsub(a
, b
, true, status
);
1086 static float hard_f32_add(float a
, float b
)
1091 static float hard_f32_sub(float a
, float b
)
1096 static double hard_f64_add(double a
, double b
)
1101 static double hard_f64_sub(double a
, double b
)
1106 static bool f32_addsubmul_post(union_float32 a
, union_float32 b
)
1108 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1109 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1111 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1114 static bool f64_addsubmul_post(union_float64 a
, union_float64 b
)
1116 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1117 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1119 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1123 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1124 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1126 return float32_gen2(a
, b
, s
, hard
, soft
,
1127 f32_is_zon2
, f32_addsubmul_post
);
1130 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1131 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1133 return float64_gen2(a
, b
, s
, hard
, soft
,
1134 f64_is_zon2
, f64_addsubmul_post
);
1137 float32 QEMU_FLATTEN
1138 float32_add(float32 a
, float32 b
, float_status
*s
)
1140 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1143 float32 QEMU_FLATTEN
1144 float32_sub(float32 a
, float32 b
, float_status
*s
)
1146 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1149 float64 QEMU_FLATTEN
1150 float64_add(float64 a
, float64 b
, float_status
*s
)
1152 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1155 float64 QEMU_FLATTEN
1156 float64_sub(float64 a
, float64 b
, float_status
*s
)
1158 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1162 * Returns the result of multiplying the floating-point values `a' and
1163 * `b'. The operation is performed according to the IEC/IEEE Standard
1164 * for Binary Floating-Point Arithmetic.
1167 static FloatParts
mul_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1169 bool sign
= a
.sign
^ b
.sign
;
1171 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1173 int exp
= a
.exp
+ b
.exp
;
1175 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1176 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1177 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1178 shift64RightJamming(lo
, 1, &lo
);
1188 /* handle all the NaN cases */
1189 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1190 return pick_nan(a
, b
, s
);
1192 /* Inf * Zero == NaN */
1193 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
1194 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
1195 s
->float_exception_flags
|= float_flag_invalid
;
1196 return parts_default_nan(s
);
1198 /* Multiply by 0 or Inf */
1199 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1203 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1207 g_assert_not_reached();
1210 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1212 FloatParts pa
= float16_unpack_canonical(a
, status
);
1213 FloatParts pb
= float16_unpack_canonical(b
, status
);
1214 FloatParts pr
= mul_floats(pa
, pb
, status
);
1216 return float16_round_pack_canonical(pr
, status
);
1219 static float32 QEMU_SOFTFLOAT_ATTR
1220 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1222 FloatParts pa
= float32_unpack_canonical(a
, status
);
1223 FloatParts pb
= float32_unpack_canonical(b
, status
);
1224 FloatParts pr
= mul_floats(pa
, pb
, status
);
1226 return float32_round_pack_canonical(pr
, status
);
1229 static float64 QEMU_SOFTFLOAT_ATTR
1230 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1232 FloatParts pa
= float64_unpack_canonical(a
, status
);
1233 FloatParts pb
= float64_unpack_canonical(b
, status
);
1234 FloatParts pr
= mul_floats(pa
, pb
, status
);
1236 return float64_round_pack_canonical(pr
, status
);
1239 static float hard_f32_mul(float a
, float b
)
1244 static double hard_f64_mul(double a
, double b
)
1249 float32 QEMU_FLATTEN
1250 float32_mul(float32 a
, float32 b
, float_status
*s
)
1252 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1253 f32_is_zon2
, f32_addsubmul_post
);
1256 float64 QEMU_FLATTEN
1257 float64_mul(float64 a
, float64 b
, float_status
*s
)
1259 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1260 f64_is_zon2
, f64_addsubmul_post
);
1264 * Returns the result of multiplying the floating-point values `a' and
1265 * `b' then adding 'c', with no intermediate rounding step after the
1266 * multiplication. The operation is performed according to the
1267 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1268 * The flags argument allows the caller to select negation of the
1269 * addend, the intermediate product, or the final result. (The
1270 * difference between this and having the caller do a separate
1271 * negation is that negating externally will flip the sign bit on
1275 static FloatParts
muladd_floats(FloatParts a
, FloatParts b
, FloatParts c
,
1276 int flags
, float_status
*s
)
1278 bool inf_zero
= ((1 << a
.cls
) | (1 << b
.cls
)) ==
1279 ((1 << float_class_inf
) | (1 << float_class_zero
));
1281 bool sign_flip
= flags
& float_muladd_negate_result
;
1286 /* It is implementation-defined whether the cases of (0,inf,qnan)
1287 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1288 * they return if they do), so we have to hand this information
1289 * off to the target-specific pick-a-NaN routine.
1291 if (is_nan(a
.cls
) || is_nan(b
.cls
) || is_nan(c
.cls
)) {
1292 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
1296 s
->float_exception_flags
|= float_flag_invalid
;
1297 return parts_default_nan(s
);
1300 if (flags
& float_muladd_negate_c
) {
1304 p_sign
= a
.sign
^ b
.sign
;
1306 if (flags
& float_muladd_negate_product
) {
1310 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_inf
) {
1311 p_class
= float_class_inf
;
1312 } else if (a
.cls
== float_class_zero
|| b
.cls
== float_class_zero
) {
1313 p_class
= float_class_zero
;
1315 p_class
= float_class_normal
;
1318 if (c
.cls
== float_class_inf
) {
1319 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
1320 s
->float_exception_flags
|= float_flag_invalid
;
1321 return parts_default_nan(s
);
1323 a
.cls
= float_class_inf
;
1324 a
.sign
= c
.sign
^ sign_flip
;
1329 if (p_class
== float_class_inf
) {
1330 a
.cls
= float_class_inf
;
1331 a
.sign
= p_sign
^ sign_flip
;
1335 if (p_class
== float_class_zero
) {
1336 if (c
.cls
== float_class_zero
) {
1337 if (p_sign
!= c
.sign
) {
1338 p_sign
= s
->float_rounding_mode
== float_round_down
;
1341 } else if (flags
& float_muladd_halve_result
) {
1344 c
.sign
^= sign_flip
;
1348 /* a & b should be normals now... */
1349 assert(a
.cls
== float_class_normal
&&
1350 b
.cls
== float_class_normal
);
1352 p_exp
= a
.exp
+ b
.exp
;
1354 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1357 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1358 /* binary point now at bit 124 */
1360 /* check for overflow */
1361 if (hi
& (1ULL << (DECOMPOSED_BINARY_POINT
* 2 + 1 - 64))) {
1362 shift128RightJamming(hi
, lo
, 1, &hi
, &lo
);
1367 if (c
.cls
== float_class_zero
) {
1368 /* move binary point back to 62 */
1369 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1371 int exp_diff
= p_exp
- c
.exp
;
1372 if (p_sign
== c
.sign
) {
1374 if (exp_diff
<= 0) {
1375 shift128RightJamming(hi
, lo
,
1376 DECOMPOSED_BINARY_POINT
- exp_diff
,
1381 uint64_t c_hi
, c_lo
;
1382 /* shift c to the same binary point as the product (124) */
1385 shift128RightJamming(c_hi
, c_lo
,
1388 add128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1389 /* move binary point back to 62 */
1390 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1393 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1394 shift64RightJamming(lo
, 1, &lo
);
1400 uint64_t c_hi
, c_lo
;
1401 /* make C binary point match product at bit 124 */
1405 if (exp_diff
<= 0) {
1406 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1409 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1410 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1412 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1417 shift128RightJamming(c_hi
, c_lo
,
1420 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1423 if (hi
== 0 && lo
== 0) {
1424 a
.cls
= float_class_zero
;
1425 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1426 a
.sign
^= sign_flip
;
1433 shift
= clz64(lo
) + 64;
1435 /* Normalizing to a binary point of 124 is the
1436 correct adjust for the exponent. However since we're
1437 shifting, we might as well put the binary point back
1438 at 62 where we really want it. Therefore shift as
1439 if we're leaving 1 bit at the top of the word, but
1440 adjust the exponent as if we're leaving 3 bits. */
1443 lo
= lo
<< (shift
- 64);
1445 hi
= (hi
<< shift
) | (lo
>> (64 - shift
));
1446 lo
= hi
| ((lo
<< shift
) != 0);
1453 if (flags
& float_muladd_halve_result
) {
1457 /* finally prepare our result */
1458 a
.cls
= float_class_normal
;
1459 a
.sign
= p_sign
^ sign_flip
;
1466 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1467 int flags
, float_status
*status
)
1469 FloatParts pa
= float16_unpack_canonical(a
, status
);
1470 FloatParts pb
= float16_unpack_canonical(b
, status
);
1471 FloatParts pc
= float16_unpack_canonical(c
, status
);
1472 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1474 return float16_round_pack_canonical(pr
, status
);
1477 static float32 QEMU_SOFTFLOAT_ATTR
1478 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1479 float_status
*status
)
1481 FloatParts pa
= float32_unpack_canonical(a
, status
);
1482 FloatParts pb
= float32_unpack_canonical(b
, status
);
1483 FloatParts pc
= float32_unpack_canonical(c
, status
);
1484 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1486 return float32_round_pack_canonical(pr
, status
);
1489 static float64 QEMU_SOFTFLOAT_ATTR
1490 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1491 float_status
*status
)
1493 FloatParts pa
= float64_unpack_canonical(a
, status
);
1494 FloatParts pb
= float64_unpack_canonical(b
, status
);
1495 FloatParts pc
= float64_unpack_canonical(c
, status
);
1496 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1498 return float64_round_pack_canonical(pr
, status
);
1501 static bool force_soft_fma
;
1503 float32 QEMU_FLATTEN
1504 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1506 union_float32 ua
, ub
, uc
, ur
;
1512 if (unlikely(!can_use_fpu(s
))) {
1515 if (unlikely(flags
& float_muladd_halve_result
)) {
1519 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1520 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1524 if (unlikely(force_soft_fma
)) {
1529 * When (a || b) == 0, there's no need to check for under/over flow,
1530 * since we know the addend is (normal || 0) and the product is 0.
1532 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1536 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1537 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1538 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1540 if (flags
& float_muladd_negate_c
) {
1545 union_float32 ua_orig
= ua
;
1546 union_float32 uc_orig
= uc
;
1548 if (flags
& float_muladd_negate_product
) {
1551 if (flags
& float_muladd_negate_c
) {
1555 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1557 if (unlikely(f32_is_inf(ur
))) {
1558 s
->float_exception_flags
|= float_flag_overflow
;
1559 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1565 if (flags
& float_muladd_negate_result
) {
1566 return float32_chs(ur
.s
);
1571 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1574 float64 QEMU_FLATTEN
1575 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1577 union_float64 ua
, ub
, uc
, ur
;
1583 if (unlikely(!can_use_fpu(s
))) {
1586 if (unlikely(flags
& float_muladd_halve_result
)) {
1590 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1591 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1595 if (unlikely(force_soft_fma
)) {
1600 * When (a || b) == 0, there's no need to check for under/over flow,
1601 * since we know the addend is (normal || 0) and the product is 0.
1603 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1607 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1608 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1609 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1611 if (flags
& float_muladd_negate_c
) {
1616 union_float64 ua_orig
= ua
;
1617 union_float64 uc_orig
= uc
;
1619 if (flags
& float_muladd_negate_product
) {
1622 if (flags
& float_muladd_negate_c
) {
1626 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1628 if (unlikely(f64_is_inf(ur
))) {
1629 s
->float_exception_flags
|= float_flag_overflow
;
1630 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1636 if (flags
& float_muladd_negate_result
) {
1637 return float64_chs(ur
.s
);
1642 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1646 * Returns the result of dividing the floating-point value `a' by the
1647 * corresponding value `b'. The operation is performed according to
1648 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1651 static FloatParts
div_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1653 bool sign
= a
.sign
^ b
.sign
;
1655 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1656 uint64_t n0
, n1
, q
, r
;
1657 int exp
= a
.exp
- b
.exp
;
1660 * We want a 2*N / N-bit division to produce exactly an N-bit
1661 * result, so that we do not lose any precision and so that we
1662 * do not have to renormalize afterward. If A.frac < B.frac,
1663 * then division would produce an (N-1)-bit result; shift A left
1664 * by one to produce the an N-bit result, and decrement the
1665 * exponent to match.
1667 * The udiv_qrnnd algorithm that we're using requires normalization,
1668 * i.e. the msb of the denominator must be set. Since we know that
1669 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1670 * by one (more), and the remainder must be shifted right by one.
1672 if (a
.frac
< b
.frac
) {
1674 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 2, &n1
, &n0
);
1676 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1678 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
<< 1);
1681 * Set lsb if there is a remainder, to set inexact.
1682 * As mentioned above, to find the actual value of the remainder we
1683 * would need to shift right, but (1) we are only concerned about
1684 * non-zero-ness, and (2) the remainder will always be even because
1685 * both inputs to the division primitive are even.
1687 a
.frac
= q
| (r
!= 0);
1692 /* handle all the NaN cases */
1693 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1694 return pick_nan(a
, b
, s
);
1696 /* 0/0 or Inf/Inf */
1699 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1700 s
->float_exception_flags
|= float_flag_invalid
;
1701 return parts_default_nan(s
);
1703 /* Inf / x or 0 / x */
1704 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1709 if (b
.cls
== float_class_zero
) {
1710 s
->float_exception_flags
|= float_flag_divbyzero
;
1711 a
.cls
= float_class_inf
;
1716 if (b
.cls
== float_class_inf
) {
1717 a
.cls
= float_class_zero
;
1721 g_assert_not_reached();
1724 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1726 FloatParts pa
= float16_unpack_canonical(a
, status
);
1727 FloatParts pb
= float16_unpack_canonical(b
, status
);
1728 FloatParts pr
= div_floats(pa
, pb
, status
);
1730 return float16_round_pack_canonical(pr
, status
);
1733 static float32 QEMU_SOFTFLOAT_ATTR
1734 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1736 FloatParts pa
= float32_unpack_canonical(a
, status
);
1737 FloatParts pb
= float32_unpack_canonical(b
, status
);
1738 FloatParts pr
= div_floats(pa
, pb
, status
);
1740 return float32_round_pack_canonical(pr
, status
);
1743 static float64 QEMU_SOFTFLOAT_ATTR
1744 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1746 FloatParts pa
= float64_unpack_canonical(a
, status
);
1747 FloatParts pb
= float64_unpack_canonical(b
, status
);
1748 FloatParts pr
= div_floats(pa
, pb
, status
);
1750 return float64_round_pack_canonical(pr
, status
);
1753 static float hard_f32_div(float a
, float b
)
1758 static double hard_f64_div(double a
, double b
)
1763 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1765 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1766 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1767 fpclassify(b
.h
) == FP_NORMAL
;
1769 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1772 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1774 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1775 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1776 fpclassify(b
.h
) == FP_NORMAL
;
1778 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1781 static bool f32_div_post(union_float32 a
, union_float32 b
)
1783 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1784 return fpclassify(a
.h
) != FP_ZERO
;
1786 return !float32_is_zero(a
.s
);
1789 static bool f64_div_post(union_float64 a
, union_float64 b
)
1791 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1792 return fpclassify(a
.h
) != FP_ZERO
;
1794 return !float64_is_zero(a
.s
);
1797 float32 QEMU_FLATTEN
1798 float32_div(float32 a
, float32 b
, float_status
*s
)
1800 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
1801 f32_div_pre
, f32_div_post
);
1804 float64 QEMU_FLATTEN
1805 float64_div(float64 a
, float64 b
, float_status
*s
)
1807 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
1808 f64_div_pre
, f64_div_post
);
1812 * Float to Float conversions
1814 * Returns the result of converting one float format to another. The
1815 * conversion is performed according to the IEC/IEEE Standard for
1816 * Binary Floating-Point Arithmetic.
1818 * The float_to_float helper only needs to take care of raising
1819 * invalid exceptions and handling the conversion on NaNs.
1822 static FloatParts
float_to_float(FloatParts a
, const FloatFmt
*dstf
,
1825 if (dstf
->arm_althp
) {
1827 case float_class_qnan
:
1828 case float_class_snan
:
1829 /* There is no NaN in the destination format. Raise Invalid
1830 * and return a zero with the sign of the input NaN.
1832 s
->float_exception_flags
|= float_flag_invalid
;
1833 a
.cls
= float_class_zero
;
1838 case float_class_inf
:
1839 /* There is no Inf in the destination format. Raise Invalid
1840 * and return the maximum normal with the correct sign.
1842 s
->float_exception_flags
|= float_flag_invalid
;
1843 a
.cls
= float_class_normal
;
1844 a
.exp
= dstf
->exp_max
;
1845 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1851 } else if (is_nan(a
.cls
)) {
1852 if (is_snan(a
.cls
)) {
1853 s
->float_exception_flags
|= float_flag_invalid
;
1854 a
= parts_silence_nan(a
, s
);
1856 if (s
->default_nan_mode
) {
1857 return parts_default_nan(s
);
1863 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1865 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1866 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1867 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1868 return float32_round_pack_canonical(pr
, s
);
1871 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
1873 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1874 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1875 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1876 return float64_round_pack_canonical(pr
, s
);
1879 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
1881 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1882 FloatParts p
= float32_unpack_canonical(a
, s
);
1883 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1884 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1887 static float64 QEMU_SOFTFLOAT_ATTR
1888 soft_float32_to_float64(float32 a
, float_status
*s
)
1890 FloatParts p
= float32_unpack_canonical(a
, s
);
1891 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1892 return float64_round_pack_canonical(pr
, s
);
1895 float64
float32_to_float64(float32 a
, float_status
*s
)
1897 if (likely(float32_is_normal(a
))) {
1898 /* Widening conversion can never produce inexact results. */
1904 } else if (float32_is_zero(a
)) {
1905 return float64_set_sign(float64_zero
, float32_is_neg(a
));
1907 return soft_float32_to_float64(a
, s
);
1911 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
1913 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1914 FloatParts p
= float64_unpack_canonical(a
, s
);
1915 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1916 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1919 float32
float64_to_float32(float64 a
, float_status
*s
)
1921 FloatParts p
= float64_unpack_canonical(a
, s
);
1922 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1923 return float32_round_pack_canonical(pr
, s
);
1927 * Rounds the floating-point value `a' to an integer, and returns the
1928 * result as a floating-point value. The operation is performed
1929 * according to the IEC/IEEE Standard for Binary Floating-Point
1933 static FloatParts
round_to_int(FloatParts a
, FloatRoundMode rmode
,
1934 int scale
, float_status
*s
)
1937 case float_class_qnan
:
1938 case float_class_snan
:
1939 return return_nan(a
, s
);
1941 case float_class_zero
:
1942 case float_class_inf
:
1943 /* already "integral" */
1946 case float_class_normal
:
1947 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
1950 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
1951 /* already integral */
1956 /* all fractional */
1957 s
->float_exception_flags
|= float_flag_inexact
;
1959 case float_round_nearest_even
:
1960 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
1962 case float_round_ties_away
:
1963 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
1965 case float_round_to_zero
:
1968 case float_round_up
:
1971 case float_round_down
:
1974 case float_round_to_odd
:
1978 g_assert_not_reached();
1982 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
1985 a
.cls
= float_class_zero
;
1988 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
1989 uint64_t frac_lsbm1
= frac_lsb
>> 1;
1990 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
1991 uint64_t rnd_mask
= rnd_even_mask
>> 1;
1995 case float_round_nearest_even
:
1996 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
1998 case float_round_ties_away
:
2001 case float_round_to_zero
:
2004 case float_round_up
:
2005 inc
= a
.sign
? 0 : rnd_mask
;
2007 case float_round_down
:
2008 inc
= a
.sign
? rnd_mask
: 0;
2010 case float_round_to_odd
:
2011 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2014 g_assert_not_reached();
2017 if (a
.frac
& rnd_mask
) {
2018 s
->float_exception_flags
|= float_flag_inexact
;
2020 a
.frac
&= ~rnd_mask
;
2021 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
2029 g_assert_not_reached();
2034 float16
float16_round_to_int(float16 a
, float_status
*s
)
2036 FloatParts pa
= float16_unpack_canonical(a
, s
);
2037 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2038 return float16_round_pack_canonical(pr
, s
);
2041 float32
float32_round_to_int(float32 a
, float_status
*s
)
2043 FloatParts pa
= float32_unpack_canonical(a
, s
);
2044 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2045 return float32_round_pack_canonical(pr
, s
);
2048 float64
float64_round_to_int(float64 a
, float_status
*s
)
2050 FloatParts pa
= float64_unpack_canonical(a
, s
);
2051 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2052 return float64_round_pack_canonical(pr
, s
);
2056 * Returns the result of converting the floating-point value `a' to
2057 * the two's complement integer format. The conversion is performed
2058 * according to the IEC/IEEE Standard for Binary Floating-Point
2059 * Arithmetic---which means in particular that the conversion is
2060 * rounded according to the current rounding mode. If `a' is a NaN,
2061 * the largest positive integer is returned. Otherwise, if the
2062 * conversion overflows, the largest integer with the same sign as `a'
2066 static int64_t round_to_int_and_pack(FloatParts in
, FloatRoundMode rmode
,
2067 int scale
, int64_t min
, int64_t max
,
2071 int orig_flags
= get_float_exception_flags(s
);
2072 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
2075 case float_class_snan
:
2076 case float_class_qnan
:
2077 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2079 case float_class_inf
:
2080 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2081 return p
.sign
? min
: max
;
2082 case float_class_zero
:
2084 case float_class_normal
:
2085 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
2086 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2087 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
2088 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
2093 if (r
<= -(uint64_t) min
) {
2096 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2103 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2108 g_assert_not_reached();
2112 int16_t float16_to_int16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2115 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2116 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2119 int32_t float16_to_int32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2122 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2123 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2126 int64_t float16_to_int64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2129 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2130 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2133 int16_t float32_to_int16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2136 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2137 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2140 int32_t float32_to_int32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2143 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2144 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2147 int64_t float32_to_int64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2150 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2151 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2154 int16_t float64_to_int16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2157 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2158 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2161 int32_t float64_to_int32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2164 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2165 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2168 int64_t float64_to_int64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2171 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2172 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2175 int16_t float16_to_int16(float16 a
, float_status
*s
)
2177 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2180 int32_t float16_to_int32(float16 a
, float_status
*s
)
2182 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2185 int64_t float16_to_int64(float16 a
, float_status
*s
)
2187 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2190 int16_t float32_to_int16(float32 a
, float_status
*s
)
2192 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2195 int32_t float32_to_int32(float32 a
, float_status
*s
)
2197 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2200 int64_t float32_to_int64(float32 a
, float_status
*s
)
2202 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2205 int16_t float64_to_int16(float64 a
, float_status
*s
)
2207 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2210 int32_t float64_to_int32(float64 a
, float_status
*s
)
2212 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2215 int64_t float64_to_int64(float64 a
, float_status
*s
)
2217 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2220 int16_t float16_to_int16_round_to_zero(float16 a
, float_status
*s
)
2222 return float16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2225 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
2227 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2230 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
2232 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2235 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
2237 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2240 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
2242 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2245 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
2247 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2250 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
2252 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2255 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
2257 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2260 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
2262 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2266 * Returns the result of converting the floating-point value `a' to
2267 * the unsigned integer format. The conversion is performed according
2268 * to the IEC/IEEE Standard for Binary Floating-Point
2269 * Arithmetic---which means in particular that the conversion is
2270 * rounded according to the current rounding mode. If `a' is a NaN,
2271 * the largest unsigned integer is returned. Otherwise, if the
2272 * conversion overflows, the largest unsigned integer is returned. If
2273 * the 'a' is negative, the result is rounded and zero is returned;
2274 * values that do not round to zero will raise the inexact exception
2278 static uint64_t round_to_uint_and_pack(FloatParts in
, FloatRoundMode rmode
,
2279 int scale
, uint64_t max
,
2282 int orig_flags
= get_float_exception_flags(s
);
2283 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
2287 case float_class_snan
:
2288 case float_class_qnan
:
2289 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2291 case float_class_inf
:
2292 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2293 return p
.sign
? 0 : max
;
2294 case float_class_zero
:
2296 case float_class_normal
:
2298 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2302 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
2303 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2304 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
2305 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
2307 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2311 /* For uint64 this will never trip, but if p.exp is too large
2312 * to shift a decomposed fraction we shall have exited via the
2316 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2321 g_assert_not_reached();
2325 uint16_t float16_to_uint16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2328 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2329 rmode
, scale
, UINT16_MAX
, s
);
2332 uint32_t float16_to_uint32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2335 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2336 rmode
, scale
, UINT32_MAX
, s
);
2339 uint64_t float16_to_uint64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2342 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2343 rmode
, scale
, UINT64_MAX
, s
);
2346 uint16_t float32_to_uint16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2349 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2350 rmode
, scale
, UINT16_MAX
, s
);
2353 uint32_t float32_to_uint32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2356 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2357 rmode
, scale
, UINT32_MAX
, s
);
2360 uint64_t float32_to_uint64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2363 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2364 rmode
, scale
, UINT64_MAX
, s
);
2367 uint16_t float64_to_uint16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2370 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2371 rmode
, scale
, UINT16_MAX
, s
);
2374 uint32_t float64_to_uint32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2377 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2378 rmode
, scale
, UINT32_MAX
, s
);
2381 uint64_t float64_to_uint64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2384 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2385 rmode
, scale
, UINT64_MAX
, s
);
2388 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2390 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2393 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2395 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2398 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2400 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2403 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2405 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2408 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2410 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2413 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2415 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2418 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2420 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2423 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2425 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2428 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2430 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2433 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
2435 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2438 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
2440 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2443 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
2445 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2448 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
2450 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2453 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
2455 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2458 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
2460 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2463 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
2465 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2468 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
2470 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2473 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
2475 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2479 * Integer to float conversions
2481 * Returns the result of converting the two's complement integer `a'
2482 * to the floating-point format. The conversion is performed according
2483 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2486 static FloatParts
int_to_float(int64_t a
, int scale
, float_status
*status
)
2488 FloatParts r
= { .sign
= false };
2491 r
.cls
= float_class_zero
;
2496 r
.cls
= float_class_normal
;
2501 shift
= clz64(f
) - 1;
2502 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2504 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2505 r
.frac
= (shift
< 0 ? DECOMPOSED_IMPLICIT_BIT
: f
<< shift
);
2511 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
2513 FloatParts pa
= int_to_float(a
, scale
, status
);
2514 return float16_round_pack_canonical(pa
, status
);
2517 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
2519 return int64_to_float16_scalbn(a
, scale
, status
);
2522 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
2524 return int64_to_float16_scalbn(a
, scale
, status
);
2527 float16
int64_to_float16(int64_t a
, float_status
*status
)
2529 return int64_to_float16_scalbn(a
, 0, status
);
2532 float16
int32_to_float16(int32_t a
, float_status
*status
)
2534 return int64_to_float16_scalbn(a
, 0, status
);
2537 float16
int16_to_float16(int16_t a
, float_status
*status
)
2539 return int64_to_float16_scalbn(a
, 0, status
);
2542 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
2544 FloatParts pa
= int_to_float(a
, scale
, status
);
2545 return float32_round_pack_canonical(pa
, status
);
2548 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
2550 return int64_to_float32_scalbn(a
, scale
, status
);
2553 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
2555 return int64_to_float32_scalbn(a
, scale
, status
);
2558 float32
int64_to_float32(int64_t a
, float_status
*status
)
2560 return int64_to_float32_scalbn(a
, 0, status
);
2563 float32
int32_to_float32(int32_t a
, float_status
*status
)
2565 return int64_to_float32_scalbn(a
, 0, status
);
2568 float32
int16_to_float32(int16_t a
, float_status
*status
)
2570 return int64_to_float32_scalbn(a
, 0, status
);
2573 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
2575 FloatParts pa
= int_to_float(a
, scale
, status
);
2576 return float64_round_pack_canonical(pa
, status
);
2579 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
2581 return int64_to_float64_scalbn(a
, scale
, status
);
2584 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
2586 return int64_to_float64_scalbn(a
, scale
, status
);
2589 float64
int64_to_float64(int64_t a
, float_status
*status
)
2591 return int64_to_float64_scalbn(a
, 0, status
);
2594 float64
int32_to_float64(int32_t a
, float_status
*status
)
2596 return int64_to_float64_scalbn(a
, 0, status
);
2599 float64
int16_to_float64(int16_t a
, float_status
*status
)
2601 return int64_to_float64_scalbn(a
, 0, status
);
2606 * Unsigned Integer to float conversions
2608 * Returns the result of converting the unsigned integer `a' to the
2609 * floating-point format. The conversion is performed according to the
2610 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2613 static FloatParts
uint_to_float(uint64_t a
, int scale
, float_status
*status
)
2615 FloatParts r
= { .sign
= false };
2618 r
.cls
= float_class_zero
;
2620 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2621 r
.cls
= float_class_normal
;
2622 if ((int64_t)a
< 0) {
2623 r
.exp
= DECOMPOSED_BINARY_POINT
+ 1 + scale
;
2624 shift64RightJamming(a
, 1, &a
);
2627 int shift
= clz64(a
) - 1;
2628 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2629 r
.frac
= a
<< shift
;
2636 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
2638 FloatParts pa
= uint_to_float(a
, scale
, status
);
2639 return float16_round_pack_canonical(pa
, status
);
2642 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
2644 return uint64_to_float16_scalbn(a
, scale
, status
);
2647 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
2649 return uint64_to_float16_scalbn(a
, scale
, status
);
2652 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
2654 return uint64_to_float16_scalbn(a
, 0, status
);
2657 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
2659 return uint64_to_float16_scalbn(a
, 0, status
);
2662 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
2664 return uint64_to_float16_scalbn(a
, 0, status
);
2667 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
2669 FloatParts pa
= uint_to_float(a
, scale
, status
);
2670 return float32_round_pack_canonical(pa
, status
);
2673 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
2675 return uint64_to_float32_scalbn(a
, scale
, status
);
2678 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
2680 return uint64_to_float32_scalbn(a
, scale
, status
);
2683 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
2685 return uint64_to_float32_scalbn(a
, 0, status
);
2688 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
2690 return uint64_to_float32_scalbn(a
, 0, status
);
2693 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
2695 return uint64_to_float32_scalbn(a
, 0, status
);
2698 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
2700 FloatParts pa
= uint_to_float(a
, scale
, status
);
2701 return float64_round_pack_canonical(pa
, status
);
2704 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
2706 return uint64_to_float64_scalbn(a
, scale
, status
);
2709 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
2711 return uint64_to_float64_scalbn(a
, scale
, status
);
2714 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
2716 return uint64_to_float64_scalbn(a
, 0, status
);
2719 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
2721 return uint64_to_float64_scalbn(a
, 0, status
);
2724 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
2726 return uint64_to_float64_scalbn(a
, 0, status
);
2730 /* min() and max() functions. These can't be implemented as
2731 * 'compare and pick one input' because that would mishandle
2732 * NaNs and +0 vs -0.
2734 * minnum() and maxnum() functions. These are similar to the min()
2735 * and max() functions but if one of the arguments is a QNaN and
2736 * the other is numerical then the numerical argument is returned.
2737 * SNaNs will get quietened before being returned.
2738 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2739 * and maxNum() operations. min() and max() are the typical min/max
2740 * semantics provided by many CPUs which predate that specification.
2742 * minnummag() and maxnummag() functions correspond to minNumMag()
2743 * and minNumMag() from the IEEE-754 2008.
2745 static FloatParts
minmax_floats(FloatParts a
, FloatParts b
, bool ismin
,
2746 bool ieee
, bool ismag
, float_status
*s
)
2748 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
2750 /* Takes two floating-point values `a' and `b', one of
2751 * which is a NaN, and returns the appropriate NaN
2752 * result. If either `a' or `b' is a signaling NaN,
2753 * the invalid exception is raised.
2755 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
2756 return pick_nan(a
, b
, s
);
2757 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
2759 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
2763 return pick_nan(a
, b
, s
);
2768 case float_class_normal
:
2771 case float_class_inf
:
2774 case float_class_zero
:
2778 g_assert_not_reached();
2782 case float_class_normal
:
2785 case float_class_inf
:
2788 case float_class_zero
:
2792 g_assert_not_reached();
2796 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
2797 bool a_less
= a_exp
< b_exp
;
2798 if (a_exp
== b_exp
) {
2799 a_less
= a
.frac
< b
.frac
;
2801 return a_less
^ ismin
? b
: a
;
2804 if (a
.sign
== b
.sign
) {
2805 bool a_less
= a_exp
< b_exp
;
2806 if (a_exp
== b_exp
) {
2807 a_less
= a
.frac
< b
.frac
;
2809 return a
.sign
^ a_less
^ ismin
? b
: a
;
2811 return a
.sign
^ ismin
? b
: a
;
2816 #define MINMAX(sz, name, ismin, isiee, ismag) \
2817 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
2820 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2821 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2822 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
2824 return float ## sz ## _round_pack_canonical(pr, s); \
2827 MINMAX(16, min
, true, false, false)
2828 MINMAX(16, minnum
, true, true, false)
2829 MINMAX(16, minnummag
, true, true, true)
2830 MINMAX(16, max
, false, false, false)
2831 MINMAX(16, maxnum
, false, true, false)
2832 MINMAX(16, maxnummag
, false, true, true)
2834 MINMAX(32, min
, true, false, false)
2835 MINMAX(32, minnum
, true, true, false)
2836 MINMAX(32, minnummag
, true, true, true)
2837 MINMAX(32, max
, false, false, false)
2838 MINMAX(32, maxnum
, false, true, false)
2839 MINMAX(32, maxnummag
, false, true, true)
2841 MINMAX(64, min
, true, false, false)
2842 MINMAX(64, minnum
, true, true, false)
2843 MINMAX(64, minnummag
, true, true, true)
2844 MINMAX(64, max
, false, false, false)
2845 MINMAX(64, maxnum
, false, true, false)
2846 MINMAX(64, maxnummag
, false, true, true)
2850 /* Floating point compare */
2851 static FloatRelation
compare_floats(FloatParts a
, FloatParts b
, bool is_quiet
,
2854 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
2856 a
.cls
== float_class_snan
||
2857 b
.cls
== float_class_snan
) {
2858 s
->float_exception_flags
|= float_flag_invalid
;
2860 return float_relation_unordered
;
2863 if (a
.cls
== float_class_zero
) {
2864 if (b
.cls
== float_class_zero
) {
2865 return float_relation_equal
;
2867 return b
.sign
? float_relation_greater
: float_relation_less
;
2868 } else if (b
.cls
== float_class_zero
) {
2869 return a
.sign
? float_relation_less
: float_relation_greater
;
2872 /* The only really important thing about infinity is its sign. If
2873 * both are infinities the sign marks the smallest of the two.
2875 if (a
.cls
== float_class_inf
) {
2876 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
2877 return float_relation_equal
;
2879 return a
.sign
? float_relation_less
: float_relation_greater
;
2880 } else if (b
.cls
== float_class_inf
) {
2881 return b
.sign
? float_relation_greater
: float_relation_less
;
2884 if (a
.sign
!= b
.sign
) {
2885 return a
.sign
? float_relation_less
: float_relation_greater
;
2888 if (a
.exp
== b
.exp
) {
2889 if (a
.frac
== b
.frac
) {
2890 return float_relation_equal
;
2893 return a
.frac
> b
.frac
?
2894 float_relation_less
: float_relation_greater
;
2896 return a
.frac
> b
.frac
?
2897 float_relation_greater
: float_relation_less
;
2901 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
2903 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
2908 #define COMPARE(name, attr, sz) \
2910 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
2912 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2913 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2914 return compare_floats(pa, pb, is_quiet, s); \
2917 COMPARE(soft_f16_compare
, QEMU_FLATTEN
, 16)
2918 COMPARE(soft_f32_compare
, QEMU_SOFTFLOAT_ATTR
, 32)
2919 COMPARE(soft_f64_compare
, QEMU_SOFTFLOAT_ATTR
, 64)
2923 FloatRelation
float16_compare(float16 a
, float16 b
, float_status
*s
)
2925 return soft_f16_compare(a
, b
, false, s
);
2928 FloatRelation
float16_compare_quiet(float16 a
, float16 b
, float_status
*s
)
2930 return soft_f16_compare(a
, b
, true, s
);
2933 static FloatRelation QEMU_FLATTEN
2934 f32_compare(float32 xa
, float32 xb
, bool is_quiet
, float_status
*s
)
2936 union_float32 ua
, ub
;
2941 if (QEMU_NO_HARDFLOAT
) {
2945 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
2946 if (isgreaterequal(ua
.h
, ub
.h
)) {
2947 if (isgreater(ua
.h
, ub
.h
)) {
2948 return float_relation_greater
;
2950 return float_relation_equal
;
2952 if (likely(isless(ua
.h
, ub
.h
))) {
2953 return float_relation_less
;
2955 /* The only condition remaining is unordered.
2956 * Fall through to set flags.
2959 return soft_f32_compare(ua
.s
, ub
.s
, is_quiet
, s
);
2962 FloatRelation
float32_compare(float32 a
, float32 b
, float_status
*s
)
2964 return f32_compare(a
, b
, false, s
);
2967 FloatRelation
float32_compare_quiet(float32 a
, float32 b
, float_status
*s
)
2969 return f32_compare(a
, b
, true, s
);
2972 static FloatRelation QEMU_FLATTEN
2973 f64_compare(float64 xa
, float64 xb
, bool is_quiet
, float_status
*s
)
2975 union_float64 ua
, ub
;
2980 if (QEMU_NO_HARDFLOAT
) {
2984 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
2985 if (isgreaterequal(ua
.h
, ub
.h
)) {
2986 if (isgreater(ua
.h
, ub
.h
)) {
2987 return float_relation_greater
;
2989 return float_relation_equal
;
2991 if (likely(isless(ua
.h
, ub
.h
))) {
2992 return float_relation_less
;
2994 /* The only condition remaining is unordered.
2995 * Fall through to set flags.
2998 return soft_f64_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3001 FloatRelation
float64_compare(float64 a
, float64 b
, float_status
*s
)
3003 return f64_compare(a
, b
, false, s
);
3006 FloatRelation
float64_compare_quiet(float64 a
, float64 b
, float_status
*s
)
3008 return f64_compare(a
, b
, true, s
);
3011 /* Multiply A by 2 raised to the power N. */
3012 static FloatParts
scalbn_decomposed(FloatParts a
, int n
, float_status
*s
)
3014 if (unlikely(is_nan(a
.cls
))) {
3015 return return_nan(a
, s
);
3017 if (a
.cls
== float_class_normal
) {
3018 /* The largest float type (even though not supported by FloatParts)
3019 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3020 * still allows rounding to infinity, without allowing overflow
3021 * within the int32_t that backs FloatParts.exp.
3023 n
= MIN(MAX(n
, -0x10000), 0x10000);
3029 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
3031 FloatParts pa
= float16_unpack_canonical(a
, status
);
3032 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3033 return float16_round_pack_canonical(pr
, status
);
3036 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
3038 FloatParts pa
= float32_unpack_canonical(a
, status
);
3039 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3040 return float32_round_pack_canonical(pr
, status
);
3043 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
3045 FloatParts pa
= float64_unpack_canonical(a
, status
);
3046 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3047 return float64_round_pack_canonical(pr
, status
);
3053 * The old softfloat code did an approximation step before zeroing in
3054 * on the final result. However for simpleness we just compute the
3055 * square root by iterating down from the implicit bit to enough extra
3056 * bits to ensure we get a correctly rounded result.
3058 * This does mean however the calculation is slower than before,
3059 * especially for 64 bit floats.
3062 static FloatParts
sqrt_float(FloatParts a
, float_status
*s
, const FloatFmt
*p
)
3064 uint64_t a_frac
, r_frac
, s_frac
;
3067 if (is_nan(a
.cls
)) {
3068 return return_nan(a
, s
);
3070 if (a
.cls
== float_class_zero
) {
3071 return a
; /* sqrt(+-0) = +-0 */
3074 s
->float_exception_flags
|= float_flag_invalid
;
3075 return parts_default_nan(s
);
3077 if (a
.cls
== float_class_inf
) {
3078 return a
; /* sqrt(+inf) = +inf */
3081 assert(a
.cls
== float_class_normal
);
3083 /* We need two overflow bits at the top. Adding room for that is a
3084 * right shift. If the exponent is odd, we can discard the low bit
3085 * by multiplying the fraction by 2; that's a left shift. Combine
3086 * those and we shift right if the exponent is even.
3094 /* Bit-by-bit computation of sqrt. */
3098 /* Iterate from implicit bit down to the 3 extra bits to compute a
3099 * properly rounded result. Remember we've inserted one more bit
3100 * at the top, so these positions are one less.
3102 bit
= DECOMPOSED_BINARY_POINT
- 1;
3103 last_bit
= MAX(p
->frac_shift
- 4, 0);
3105 uint64_t q
= 1ULL << bit
;
3106 uint64_t t_frac
= s_frac
+ q
;
3107 if (t_frac
<= a_frac
) {
3108 s_frac
= t_frac
+ q
;
3113 } while (--bit
>= last_bit
);
3115 /* Undo the right shift done above. If there is any remaining
3116 * fraction, the result is inexact. Set the sticky bit.
3118 a
.frac
= (r_frac
<< 1) + (a_frac
!= 0);
3123 float16 QEMU_FLATTEN
float16_sqrt(float16 a
, float_status
*status
)
3125 FloatParts pa
= float16_unpack_canonical(a
, status
);
3126 FloatParts pr
= sqrt_float(pa
, status
, &float16_params
);
3127 return float16_round_pack_canonical(pr
, status
);
3130 static float32 QEMU_SOFTFLOAT_ATTR
3131 soft_f32_sqrt(float32 a
, float_status
*status
)
3133 FloatParts pa
= float32_unpack_canonical(a
, status
);
3134 FloatParts pr
= sqrt_float(pa
, status
, &float32_params
);
3135 return float32_round_pack_canonical(pr
, status
);
3138 static float64 QEMU_SOFTFLOAT_ATTR
3139 soft_f64_sqrt(float64 a
, float_status
*status
)
3141 FloatParts pa
= float64_unpack_canonical(a
, status
);
3142 FloatParts pr
= sqrt_float(pa
, status
, &float64_params
);
3143 return float64_round_pack_canonical(pr
, status
);
3146 float32 QEMU_FLATTEN
float32_sqrt(float32 xa
, float_status
*s
)
3148 union_float32 ua
, ur
;
3151 if (unlikely(!can_use_fpu(s
))) {
3155 float32_input_flush1(&ua
.s
, s
);
3156 if (QEMU_HARDFLOAT_1F32_USE_FP
) {
3157 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3158 fpclassify(ua
.h
) == FP_ZERO
) ||
3162 } else if (unlikely(!float32_is_zero_or_normal(ua
.s
) ||
3163 float32_is_neg(ua
.s
))) {
3170 return soft_f32_sqrt(ua
.s
, s
);
3173 float64 QEMU_FLATTEN
float64_sqrt(float64 xa
, float_status
*s
)
3175 union_float64 ua
, ur
;
3178 if (unlikely(!can_use_fpu(s
))) {
3182 float64_input_flush1(&ua
.s
, s
);
3183 if (QEMU_HARDFLOAT_1F64_USE_FP
) {
3184 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3185 fpclassify(ua
.h
) == FP_ZERO
) ||
3189 } else if (unlikely(!float64_is_zero_or_normal(ua
.s
) ||
3190 float64_is_neg(ua
.s
))) {
3197 return soft_f64_sqrt(ua
.s
, s
);
3200 /*----------------------------------------------------------------------------
3201 | The pattern for a default generated NaN.
3202 *----------------------------------------------------------------------------*/
3204 float16
float16_default_nan(float_status
*status
)
3206 FloatParts p
= parts_default_nan(status
);
3207 p
.frac
>>= float16_params
.frac_shift
;
3208 return float16_pack_raw(p
);
3211 float32
float32_default_nan(float_status
*status
)
3213 FloatParts p
= parts_default_nan(status
);
3214 p
.frac
>>= float32_params
.frac_shift
;
3215 return float32_pack_raw(p
);
3218 float64
float64_default_nan(float_status
*status
)
3220 FloatParts p
= parts_default_nan(status
);
3221 p
.frac
>>= float64_params
.frac_shift
;
3222 return float64_pack_raw(p
);
3225 float128
float128_default_nan(float_status
*status
)
3227 FloatParts p
= parts_default_nan(status
);
3230 /* Extrapolate from the choices made by parts_default_nan to fill
3231 * in the quad-floating format. If the low bit is set, assume we
3232 * want to set all non-snan bits.
3234 r
.low
= -(p
.frac
& 1);
3235 r
.high
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- 48);
3236 r
.high
|= UINT64_C(0x7FFF000000000000);
3237 r
.high
|= (uint64_t)p
.sign
<< 63;
3242 /*----------------------------------------------------------------------------
3243 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3244 *----------------------------------------------------------------------------*/
3246 float16
float16_silence_nan(float16 a
, float_status
*status
)
3248 FloatParts p
= float16_unpack_raw(a
);
3249 p
.frac
<<= float16_params
.frac_shift
;
3250 p
= parts_silence_nan(p
, status
);
3251 p
.frac
>>= float16_params
.frac_shift
;
3252 return float16_pack_raw(p
);
3255 float32
float32_silence_nan(float32 a
, float_status
*status
)
3257 FloatParts p
= float32_unpack_raw(a
);
3258 p
.frac
<<= float32_params
.frac_shift
;
3259 p
= parts_silence_nan(p
, status
);
3260 p
.frac
>>= float32_params
.frac_shift
;
3261 return float32_pack_raw(p
);
3264 float64
float64_silence_nan(float64 a
, float_status
*status
)
3266 FloatParts p
= float64_unpack_raw(a
);
3267 p
.frac
<<= float64_params
.frac_shift
;
3268 p
= parts_silence_nan(p
, status
);
3269 p
.frac
>>= float64_params
.frac_shift
;
3270 return float64_pack_raw(p
);
3274 /*----------------------------------------------------------------------------
3275 | If `a' is denormal and we are in flush-to-zero mode then set the
3276 | input-denormal exception and return zero. Otherwise just return the value.
3277 *----------------------------------------------------------------------------*/
3279 static bool parts_squash_denormal(FloatParts p
, float_status
*status
)
3281 if (p
.exp
== 0 && p
.frac
!= 0) {
3282 float_raise(float_flag_input_denormal
, status
);
3289 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3291 if (status
->flush_inputs_to_zero
) {
3292 FloatParts p
= float16_unpack_raw(a
);
3293 if (parts_squash_denormal(p
, status
)) {
3294 return float16_set_sign(float16_zero
, p
.sign
);
3300 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3302 if (status
->flush_inputs_to_zero
) {
3303 FloatParts p
= float32_unpack_raw(a
);
3304 if (parts_squash_denormal(p
, status
)) {
3305 return float32_set_sign(float32_zero
, p
.sign
);
3311 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3313 if (status
->flush_inputs_to_zero
) {
3314 FloatParts p
= float64_unpack_raw(a
);
3315 if (parts_squash_denormal(p
, status
)) {
3316 return float64_set_sign(float64_zero
, p
.sign
);
3322 /*----------------------------------------------------------------------------
3323 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3324 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3325 | input. If `zSign' is 1, the input is negated before being converted to an
3326 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3327 | is simply rounded to an integer, with the inexact exception raised if the
3328 | input cannot be represented exactly as an integer. However, if the fixed-
3329 | point input is too large, the invalid exception is raised and the largest
3330 | positive or negative integer is returned.
3331 *----------------------------------------------------------------------------*/
3333 static int32_t roundAndPackInt32(bool zSign
, uint64_t absZ
,
3334 float_status
*status
)
3336 int8_t roundingMode
;
3337 bool roundNearestEven
;
3338 int8_t roundIncrement
, roundBits
;
3341 roundingMode
= status
->float_rounding_mode
;
3342 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3343 switch (roundingMode
) {
3344 case float_round_nearest_even
:
3345 case float_round_ties_away
:
3346 roundIncrement
= 0x40;
3348 case float_round_to_zero
:
3351 case float_round_up
:
3352 roundIncrement
= zSign
? 0 : 0x7f;
3354 case float_round_down
:
3355 roundIncrement
= zSign
? 0x7f : 0;
3357 case float_round_to_odd
:
3358 roundIncrement
= absZ
& 0x80 ? 0 : 0x7f;
3363 roundBits
= absZ
& 0x7F;
3364 absZ
= ( absZ
+ roundIncrement
)>>7;
3365 if (!(roundBits
^ 0x40) && roundNearestEven
) {
3369 if ( zSign
) z
= - z
;
3370 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
3371 float_raise(float_flag_invalid
, status
);
3372 return zSign
? INT32_MIN
: INT32_MAX
;
3375 status
->float_exception_flags
|= float_flag_inexact
;
3381 /*----------------------------------------------------------------------------
3382 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3383 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3384 | and returns the properly rounded 64-bit integer corresponding to the input.
3385 | If `zSign' is 1, the input is negated before being converted to an integer.
3386 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3387 | the inexact exception raised if the input cannot be represented exactly as
3388 | an integer. However, if the fixed-point input is too large, the invalid
3389 | exception is raised and the largest positive or negative integer is
3391 *----------------------------------------------------------------------------*/
3393 static int64_t roundAndPackInt64(bool zSign
, uint64_t absZ0
, uint64_t absZ1
,
3394 float_status
*status
)
3396 int8_t roundingMode
;
3397 bool roundNearestEven
, increment
;
3400 roundingMode
= status
->float_rounding_mode
;
3401 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3402 switch (roundingMode
) {
3403 case float_round_nearest_even
:
3404 case float_round_ties_away
:
3405 increment
= ((int64_t) absZ1
< 0);
3407 case float_round_to_zero
:
3410 case float_round_up
:
3411 increment
= !zSign
&& absZ1
;
3413 case float_round_down
:
3414 increment
= zSign
&& absZ1
;
3416 case float_round_to_odd
:
3417 increment
= !(absZ0
& 1) && absZ1
;
3424 if ( absZ0
== 0 ) goto overflow
;
3425 if (!(absZ1
<< 1) && roundNearestEven
) {
3430 if ( zSign
) z
= - z
;
3431 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
3433 float_raise(float_flag_invalid
, status
);
3434 return zSign
? INT64_MIN
: INT64_MAX
;
3437 status
->float_exception_flags
|= float_flag_inexact
;
3443 /*----------------------------------------------------------------------------
3444 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3445 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3446 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3447 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3448 | with the inexact exception raised if the input cannot be represented exactly
3449 | as an integer. However, if the fixed-point input is too large, the invalid
3450 | exception is raised and the largest unsigned integer is returned.
3451 *----------------------------------------------------------------------------*/
3453 static int64_t roundAndPackUint64(bool zSign
, uint64_t absZ0
,
3454 uint64_t absZ1
, float_status
*status
)
3456 int8_t roundingMode
;
3457 bool roundNearestEven
, increment
;
3459 roundingMode
= status
->float_rounding_mode
;
3460 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
3461 switch (roundingMode
) {
3462 case float_round_nearest_even
:
3463 case float_round_ties_away
:
3464 increment
= ((int64_t)absZ1
< 0);
3466 case float_round_to_zero
:
3469 case float_round_up
:
3470 increment
= !zSign
&& absZ1
;
3472 case float_round_down
:
3473 increment
= zSign
&& absZ1
;
3475 case float_round_to_odd
:
3476 increment
= !(absZ0
& 1) && absZ1
;
3484 float_raise(float_flag_invalid
, status
);
3487 if (!(absZ1
<< 1) && roundNearestEven
) {
3492 if (zSign
&& absZ0
) {
3493 float_raise(float_flag_invalid
, status
);
3498 status
->float_exception_flags
|= float_flag_inexact
;
3503 /*----------------------------------------------------------------------------
3504 | Normalizes the subnormal single-precision floating-point value represented
3505 | by the denormalized significand `aSig'. The normalized exponent and
3506 | significand are stored at the locations pointed to by `zExpPtr' and
3507 | `zSigPtr', respectively.
3508 *----------------------------------------------------------------------------*/
3511 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
3515 shiftCount
= clz32(aSig
) - 8;
3516 *zSigPtr
= aSig
<<shiftCount
;
3517 *zExpPtr
= 1 - shiftCount
;
3521 /*----------------------------------------------------------------------------
3522 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3523 | and significand `zSig', and returns the proper single-precision floating-
3524 | point value corresponding to the abstract input. Ordinarily, the abstract
3525 | value is simply rounded and packed into the single-precision format, with
3526 | the inexact exception raised if the abstract input cannot be represented
3527 | exactly. However, if the abstract value is too large, the overflow and
3528 | inexact exceptions are raised and an infinity or maximal finite value is
3529 | returned. If the abstract value is too small, the input value is rounded to
3530 | a subnormal number, and the underflow and inexact exceptions are raised if
3531 | the abstract input cannot be represented exactly as a subnormal single-
3532 | precision floating-point number.
3533 | The input significand `zSig' has its binary point between bits 30
3534 | and 29, which is 7 bits to the left of the usual location. This shifted
3535 | significand must be normalized or smaller. If `zSig' is not normalized,
3536 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3537 | and it must not require rounding. In the usual case that `zSig' is
3538 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3539 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3540 | Binary Floating-Point Arithmetic.
3541 *----------------------------------------------------------------------------*/
3543 static float32
roundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
3544 float_status
*status
)
3546 int8_t roundingMode
;
3547 bool roundNearestEven
;
3548 int8_t roundIncrement
, roundBits
;
3551 roundingMode
= status
->float_rounding_mode
;
3552 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3553 switch (roundingMode
) {
3554 case float_round_nearest_even
:
3555 case float_round_ties_away
:
3556 roundIncrement
= 0x40;
3558 case float_round_to_zero
:
3561 case float_round_up
:
3562 roundIncrement
= zSign
? 0 : 0x7f;
3564 case float_round_down
:
3565 roundIncrement
= zSign
? 0x7f : 0;
3567 case float_round_to_odd
:
3568 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
3574 roundBits
= zSig
& 0x7F;
3575 if ( 0xFD <= (uint16_t) zExp
) {
3576 if ( ( 0xFD < zExp
)
3577 || ( ( zExp
== 0xFD )
3578 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
3580 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
3581 roundIncrement
!= 0;
3582 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3583 return packFloat32(zSign
, 0xFF, -!overflow_to_inf
);
3586 if (status
->flush_to_zero
) {
3587 float_raise(float_flag_output_denormal
, status
);
3588 return packFloat32(zSign
, 0, 0);
3590 isTiny
= status
->tininess_before_rounding
3592 || (zSig
+ roundIncrement
< 0x80000000);
3593 shift32RightJamming( zSig
, - zExp
, &zSig
);
3595 roundBits
= zSig
& 0x7F;
3596 if (isTiny
&& roundBits
) {
3597 float_raise(float_flag_underflow
, status
);
3599 if (roundingMode
== float_round_to_odd
) {
3601 * For round-to-odd case, the roundIncrement depends on
3602 * zSig which just changed.
3604 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
3609 status
->float_exception_flags
|= float_flag_inexact
;
3611 zSig
= ( zSig
+ roundIncrement
)>>7;
3612 if (!(roundBits
^ 0x40) && roundNearestEven
) {
3615 if ( zSig
== 0 ) zExp
= 0;
3616 return packFloat32( zSign
, zExp
, zSig
);
3620 /*----------------------------------------------------------------------------
3621 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3622 | and significand `zSig', and returns the proper single-precision floating-
3623 | point value corresponding to the abstract input. This routine is just like
3624 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3625 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3626 | floating-point exponent.
3627 *----------------------------------------------------------------------------*/
3630 normalizeRoundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
3631 float_status
*status
)
3635 shiftCount
= clz32(zSig
) - 1;
3636 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
3641 /*----------------------------------------------------------------------------
3642 | Normalizes the subnormal double-precision floating-point value represented
3643 | by the denormalized significand `aSig'. The normalized exponent and
3644 | significand are stored at the locations pointed to by `zExpPtr' and
3645 | `zSigPtr', respectively.
3646 *----------------------------------------------------------------------------*/
3649 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
3653 shiftCount
= clz64(aSig
) - 11;
3654 *zSigPtr
= aSig
<<shiftCount
;
3655 *zExpPtr
= 1 - shiftCount
;
3659 /*----------------------------------------------------------------------------
3660 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3661 | double-precision floating-point value, returning the result. After being
3662 | shifted into the proper positions, the three fields are simply added
3663 | together to form the result. This means that any integer portion of `zSig'
3664 | will be added into the exponent. Since a properly normalized significand
3665 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3666 | than the desired result exponent whenever `zSig' is a complete, normalized
3668 *----------------------------------------------------------------------------*/
3670 static inline float64
packFloat64(bool zSign
, int zExp
, uint64_t zSig
)
3673 return make_float64(
3674 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
3678 /*----------------------------------------------------------------------------
3679 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3680 | and significand `zSig', and returns the proper double-precision floating-
3681 | point value corresponding to the abstract input. Ordinarily, the abstract
3682 | value is simply rounded and packed into the double-precision format, with
3683 | the inexact exception raised if the abstract input cannot be represented
3684 | exactly. However, if the abstract value is too large, the overflow and
3685 | inexact exceptions are raised and an infinity or maximal finite value is
3686 | returned. If the abstract value is too small, the input value is rounded to
3687 | a subnormal number, and the underflow and inexact exceptions are raised if
3688 | the abstract input cannot be represented exactly as a subnormal double-
3689 | precision floating-point number.
3690 | The input significand `zSig' has its binary point between bits 62
3691 | and 61, which is 10 bits to the left of the usual location. This shifted
3692 | significand must be normalized or smaller. If `zSig' is not normalized,
3693 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3694 | and it must not require rounding. In the usual case that `zSig' is
3695 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3696 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3697 | Binary Floating-Point Arithmetic.
3698 *----------------------------------------------------------------------------*/
3700 static float64
roundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
3701 float_status
*status
)
3703 int8_t roundingMode
;
3704 bool roundNearestEven
;
3705 int roundIncrement
, roundBits
;
3708 roundingMode
= status
->float_rounding_mode
;
3709 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3710 switch (roundingMode
) {
3711 case float_round_nearest_even
:
3712 case float_round_ties_away
:
3713 roundIncrement
= 0x200;
3715 case float_round_to_zero
:
3718 case float_round_up
:
3719 roundIncrement
= zSign
? 0 : 0x3ff;
3721 case float_round_down
:
3722 roundIncrement
= zSign
? 0x3ff : 0;
3724 case float_round_to_odd
:
3725 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
3730 roundBits
= zSig
& 0x3FF;
3731 if ( 0x7FD <= (uint16_t) zExp
) {
3732 if ( ( 0x7FD < zExp
)
3733 || ( ( zExp
== 0x7FD )
3734 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
3736 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
3737 roundIncrement
!= 0;
3738 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3739 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
3742 if (status
->flush_to_zero
) {
3743 float_raise(float_flag_output_denormal
, status
);
3744 return packFloat64(zSign
, 0, 0);
3746 isTiny
= status
->tininess_before_rounding
3748 || (zSig
+ roundIncrement
< UINT64_C(0x8000000000000000));
3749 shift64RightJamming( zSig
, - zExp
, &zSig
);
3751 roundBits
= zSig
& 0x3FF;
3752 if (isTiny
&& roundBits
) {
3753 float_raise(float_flag_underflow
, status
);
3755 if (roundingMode
== float_round_to_odd
) {
3757 * For round-to-odd case, the roundIncrement depends on
3758 * zSig which just changed.
3760 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
3765 status
->float_exception_flags
|= float_flag_inexact
;
3767 zSig
= ( zSig
+ roundIncrement
)>>10;
3768 if (!(roundBits
^ 0x200) && roundNearestEven
) {
3771 if ( zSig
== 0 ) zExp
= 0;
3772 return packFloat64( zSign
, zExp
, zSig
);
3776 /*----------------------------------------------------------------------------
3777 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3778 | and significand `zSig', and returns the proper double-precision floating-
3779 | point value corresponding to the abstract input. This routine is just like
3780 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3781 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3782 | floating-point exponent.
3783 *----------------------------------------------------------------------------*/
3786 normalizeRoundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
3787 float_status
*status
)
3791 shiftCount
= clz64(zSig
) - 1;
3792 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
3797 /*----------------------------------------------------------------------------
3798 | Normalizes the subnormal extended double-precision floating-point value
3799 | represented by the denormalized significand `aSig'. The normalized exponent
3800 | and significand are stored at the locations pointed to by `zExpPtr' and
3801 | `zSigPtr', respectively.
3802 *----------------------------------------------------------------------------*/
3804 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
3809 shiftCount
= clz64(aSig
);
3810 *zSigPtr
= aSig
<<shiftCount
;
3811 *zExpPtr
= 1 - shiftCount
;
3814 /*----------------------------------------------------------------------------
3815 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3816 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3817 | and returns the proper extended double-precision floating-point value
3818 | corresponding to the abstract input. Ordinarily, the abstract value is
3819 | rounded and packed into the extended double-precision format, with the
3820 | inexact exception raised if the abstract input cannot be represented
3821 | exactly. However, if the abstract value is too large, the overflow and
3822 | inexact exceptions are raised and an infinity or maximal finite value is
3823 | returned. If the abstract value is too small, the input value is rounded to
3824 | a subnormal number, and the underflow and inexact exceptions are raised if
3825 | the abstract input cannot be represented exactly as a subnormal extended
3826 | double-precision floating-point number.
3827 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
3828 | number of bits as single or double precision, respectively. Otherwise, the
3829 | result is rounded to the full precision of the extended double-precision
3831 | The input significand must be normalized or smaller. If the input
3832 | significand is not normalized, `zExp' must be 0; in that case, the result
3833 | returned is a subnormal number, and it must not require rounding. The
3834 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3835 | Floating-Point Arithmetic.
3836 *----------------------------------------------------------------------------*/
3838 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, bool zSign
,
3839 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
3840 float_status
*status
)
3842 int8_t roundingMode
;
3843 bool roundNearestEven
, increment
, isTiny
;
3844 int64_t roundIncrement
, roundMask
, roundBits
;
3846 roundingMode
= status
->float_rounding_mode
;
3847 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3848 if ( roundingPrecision
== 80 ) goto precision80
;
3849 if ( roundingPrecision
== 64 ) {
3850 roundIncrement
= UINT64_C(0x0000000000000400);
3851 roundMask
= UINT64_C(0x00000000000007FF);
3853 else if ( roundingPrecision
== 32 ) {
3854 roundIncrement
= UINT64_C(0x0000008000000000);
3855 roundMask
= UINT64_C(0x000000FFFFFFFFFF);
3860 zSig0
|= ( zSig1
!= 0 );
3861 switch (roundingMode
) {
3862 case float_round_nearest_even
:
3863 case float_round_ties_away
:
3865 case float_round_to_zero
:
3868 case float_round_up
:
3869 roundIncrement
= zSign
? 0 : roundMask
;
3871 case float_round_down
:
3872 roundIncrement
= zSign
? roundMask
: 0;
3877 roundBits
= zSig0
& roundMask
;
3878 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
3879 if ( ( 0x7FFE < zExp
)
3880 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
3885 if (status
->flush_to_zero
) {
3886 float_raise(float_flag_output_denormal
, status
);
3887 return packFloatx80(zSign
, 0, 0);
3889 isTiny
= status
->tininess_before_rounding
3891 || (zSig0
<= zSig0
+ roundIncrement
);
3892 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
3894 roundBits
= zSig0
& roundMask
;
3895 if (isTiny
&& roundBits
) {
3896 float_raise(float_flag_underflow
, status
);
3899 status
->float_exception_flags
|= float_flag_inexact
;
3901 zSig0
+= roundIncrement
;
3902 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
3903 roundIncrement
= roundMask
+ 1;
3904 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
3905 roundMask
|= roundIncrement
;
3907 zSig0
&= ~ roundMask
;
3908 return packFloatx80( zSign
, zExp
, zSig0
);
3912 status
->float_exception_flags
|= float_flag_inexact
;
3914 zSig0
+= roundIncrement
;
3915 if ( zSig0
< roundIncrement
) {
3917 zSig0
= UINT64_C(0x8000000000000000);
3919 roundIncrement
= roundMask
+ 1;
3920 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
3921 roundMask
|= roundIncrement
;
3923 zSig0
&= ~ roundMask
;
3924 if ( zSig0
== 0 ) zExp
= 0;
3925 return packFloatx80( zSign
, zExp
, zSig0
);
3927 switch (roundingMode
) {
3928 case float_round_nearest_even
:
3929 case float_round_ties_away
:
3930 increment
= ((int64_t)zSig1
< 0);
3932 case float_round_to_zero
:
3935 case float_round_up
:
3936 increment
= !zSign
&& zSig1
;
3938 case float_round_down
:
3939 increment
= zSign
&& zSig1
;
3944 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
3945 if ( ( 0x7FFE < zExp
)
3946 || ( ( zExp
== 0x7FFE )
3947 && ( zSig0
== UINT64_C(0xFFFFFFFFFFFFFFFF) )
3953 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3954 if ( ( roundingMode
== float_round_to_zero
)
3955 || ( zSign
&& ( roundingMode
== float_round_up
) )
3956 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
3958 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
3960 return packFloatx80(zSign
,
3961 floatx80_infinity_high
,
3962 floatx80_infinity_low
);
3965 isTiny
= status
->tininess_before_rounding
3968 || (zSig0
< UINT64_C(0xFFFFFFFFFFFFFFFF));
3969 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
3971 if (isTiny
&& zSig1
) {
3972 float_raise(float_flag_underflow
, status
);
3975 status
->float_exception_flags
|= float_flag_inexact
;
3977 switch (roundingMode
) {
3978 case float_round_nearest_even
:
3979 case float_round_ties_away
:
3980 increment
= ((int64_t)zSig1
< 0);
3982 case float_round_to_zero
:
3985 case float_round_up
:
3986 increment
= !zSign
&& zSig1
;
3988 case float_round_down
:
3989 increment
= zSign
&& zSig1
;
3996 if (!(zSig1
<< 1) && roundNearestEven
) {
3999 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4001 return packFloatx80( zSign
, zExp
, zSig0
);
4005 status
->float_exception_flags
|= float_flag_inexact
;
4011 zSig0
= UINT64_C(0x8000000000000000);
4014 if (!(zSig1
<< 1) && roundNearestEven
) {
4020 if ( zSig0
== 0 ) zExp
= 0;
4022 return packFloatx80( zSign
, zExp
, zSig0
);
4026 /*----------------------------------------------------------------------------
4027 | Takes an abstract floating-point value having sign `zSign', exponent
4028 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4029 | and returns the proper extended double-precision floating-point value
4030 | corresponding to the abstract input. This routine is just like
4031 | `roundAndPackFloatx80' except that the input significand does not have to be
4033 *----------------------------------------------------------------------------*/
4035 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
4036 bool zSign
, int32_t zExp
,
4037 uint64_t zSig0
, uint64_t zSig1
,
4038 float_status
*status
)
4047 shiftCount
= clz64(zSig0
);
4048 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4050 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4051 zSig0
, zSig1
, status
);
4055 /*----------------------------------------------------------------------------
4056 | Returns the least-significant 64 fraction bits of the quadruple-precision
4057 | floating-point value `a'.
4058 *----------------------------------------------------------------------------*/
4060 static inline uint64_t extractFloat128Frac1( float128 a
)
4067 /*----------------------------------------------------------------------------
4068 | Returns the most-significant 48 fraction bits of the quadruple-precision
4069 | floating-point value `a'.
4070 *----------------------------------------------------------------------------*/
4072 static inline uint64_t extractFloat128Frac0( float128 a
)
4075 return a
.high
& UINT64_C(0x0000FFFFFFFFFFFF);
4079 /*----------------------------------------------------------------------------
4080 | Returns the exponent bits of the quadruple-precision floating-point value
4082 *----------------------------------------------------------------------------*/
4084 static inline int32_t extractFloat128Exp( float128 a
)
4087 return ( a
.high
>>48 ) & 0x7FFF;
4091 /*----------------------------------------------------------------------------
4092 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4093 *----------------------------------------------------------------------------*/
4095 static inline bool extractFloat128Sign(float128 a
)
4097 return a
.high
>> 63;
4100 /*----------------------------------------------------------------------------
4101 | Normalizes the subnormal quadruple-precision floating-point value
4102 | represented by the denormalized significand formed by the concatenation of
4103 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4104 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4105 | significand are stored at the location pointed to by `zSig0Ptr', and the
4106 | least significant 64 bits of the normalized significand are stored at the
4107 | location pointed to by `zSig1Ptr'.
4108 *----------------------------------------------------------------------------*/
4111 normalizeFloat128Subnormal(
4122 shiftCount
= clz64(aSig1
) - 15;
4123 if ( shiftCount
< 0 ) {
4124 *zSig0Ptr
= aSig1
>>( - shiftCount
);
4125 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
4128 *zSig0Ptr
= aSig1
<<shiftCount
;
4131 *zExpPtr
= - shiftCount
- 63;
4134 shiftCount
= clz64(aSig0
) - 15;
4135 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
4136 *zExpPtr
= 1 - shiftCount
;
4141 /*----------------------------------------------------------------------------
4142 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4143 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4144 | floating-point value, returning the result. After being shifted into the
4145 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4146 | added together to form the most significant 32 bits of the result. This
4147 | means that any integer portion of `zSig0' will be added into the exponent.
4148 | Since a properly normalized significand will have an integer portion equal
4149 | to 1, the `zExp' input should be 1 less than the desired result exponent
4150 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4152 *----------------------------------------------------------------------------*/
4154 static inline float128
4155 packFloat128(bool zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4160 z
.high
= ((uint64_t)zSign
<< 63) + ((uint64_t)zExp
<< 48) + zSig0
;
4164 /*----------------------------------------------------------------------------
4165 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4166 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4167 | and `zSig2', and returns the proper quadruple-precision floating-point value
4168 | corresponding to the abstract input. Ordinarily, the abstract value is
4169 | simply rounded and packed into the quadruple-precision format, with the
4170 | inexact exception raised if the abstract input cannot be represented
4171 | exactly. However, if the abstract value is too large, the overflow and
4172 | inexact exceptions are raised and an infinity or maximal finite value is
4173 | returned. If the abstract value is too small, the input value is rounded to
4174 | a subnormal number, and the underflow and inexact exceptions are raised if
4175 | the abstract input cannot be represented exactly as a subnormal quadruple-
4176 | precision floating-point number.
4177 | The input significand must be normalized or smaller. If the input
4178 | significand is not normalized, `zExp' must be 0; in that case, the result
4179 | returned is a subnormal number, and it must not require rounding. In the
4180 | usual case that the input significand is normalized, `zExp' must be 1 less
4181 | than the ``true'' floating-point exponent. The handling of underflow and
4182 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4183 *----------------------------------------------------------------------------*/
4185 static float128
roundAndPackFloat128(bool zSign
, int32_t zExp
,
4186 uint64_t zSig0
, uint64_t zSig1
,
4187 uint64_t zSig2
, float_status
*status
)
4189 int8_t roundingMode
;
4190 bool roundNearestEven
, increment
, isTiny
;
4192 roundingMode
= status
->float_rounding_mode
;
4193 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4194 switch (roundingMode
) {
4195 case float_round_nearest_even
:
4196 case float_round_ties_away
:
4197 increment
= ((int64_t)zSig2
< 0);
4199 case float_round_to_zero
:
4202 case float_round_up
:
4203 increment
= !zSign
&& zSig2
;
4205 case float_round_down
:
4206 increment
= zSign
&& zSig2
;
4208 case float_round_to_odd
:
4209 increment
= !(zSig1
& 0x1) && zSig2
;
4214 if ( 0x7FFD <= (uint32_t) zExp
) {
4215 if ( ( 0x7FFD < zExp
)
4216 || ( ( zExp
== 0x7FFD )
4218 UINT64_C(0x0001FFFFFFFFFFFF),
4219 UINT64_C(0xFFFFFFFFFFFFFFFF),
4226 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4227 if ( ( roundingMode
== float_round_to_zero
)
4228 || ( zSign
&& ( roundingMode
== float_round_up
) )
4229 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4230 || (roundingMode
== float_round_to_odd
)
4236 UINT64_C(0x0000FFFFFFFFFFFF),
4237 UINT64_C(0xFFFFFFFFFFFFFFFF)
4240 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4243 if (status
->flush_to_zero
) {
4244 float_raise(float_flag_output_denormal
, status
);
4245 return packFloat128(zSign
, 0, 0, 0);
4247 isTiny
= status
->tininess_before_rounding
4250 || lt128(zSig0
, zSig1
,
4251 UINT64_C(0x0001FFFFFFFFFFFF),
4252 UINT64_C(0xFFFFFFFFFFFFFFFF));
4253 shift128ExtraRightJamming(
4254 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4256 if (isTiny
&& zSig2
) {
4257 float_raise(float_flag_underflow
, status
);
4259 switch (roundingMode
) {
4260 case float_round_nearest_even
:
4261 case float_round_ties_away
:
4262 increment
= ((int64_t)zSig2
< 0);
4264 case float_round_to_zero
:
4267 case float_round_up
:
4268 increment
= !zSign
&& zSig2
;
4270 case float_round_down
:
4271 increment
= zSign
&& zSig2
;
4273 case float_round_to_odd
:
4274 increment
= !(zSig1
& 0x1) && zSig2
;
4282 status
->float_exception_flags
|= float_flag_inexact
;
4285 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4286 if ((zSig2
+ zSig2
== 0) && roundNearestEven
) {
4291 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4293 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4297 /*----------------------------------------------------------------------------
4298 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4299 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4300 | returns the proper quadruple-precision floating-point value corresponding
4301 | to the abstract input. This routine is just like `roundAndPackFloat128'
4302 | except that the input significand has fewer bits and does not have to be
4303 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4305 *----------------------------------------------------------------------------*/
4307 static float128
normalizeRoundAndPackFloat128(bool zSign
, int32_t zExp
,
4308 uint64_t zSig0
, uint64_t zSig1
,
4309 float_status
*status
)
4319 shiftCount
= clz64(zSig0
) - 15;
4320 if ( 0 <= shiftCount
) {
4322 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4325 shift128ExtraRightJamming(
4326 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4329 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4334 /*----------------------------------------------------------------------------
4335 | Returns the result of converting the 32-bit two's complement integer `a'
4336 | to the extended double-precision floating-point format. The conversion
4337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4339 *----------------------------------------------------------------------------*/
4341 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4348 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4350 absA
= zSign
? - a
: a
;
4351 shiftCount
= clz32(absA
) + 32;
4353 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4357 /*----------------------------------------------------------------------------
4358 | Returns the result of converting the 32-bit two's complement integer `a' to
4359 | the quadruple-precision floating-point format. The conversion is performed
4360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4361 *----------------------------------------------------------------------------*/
4363 float128
int32_to_float128(int32_t a
, float_status
*status
)
4370 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4372 absA
= zSign
? - a
: a
;
4373 shiftCount
= clz32(absA
) + 17;
4375 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4379 /*----------------------------------------------------------------------------
4380 | Returns the result of converting the 64-bit two's complement integer `a'
4381 | to the extended double-precision floating-point format. The conversion
4382 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4384 *----------------------------------------------------------------------------*/
4386 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4392 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4394 absA
= zSign
? - a
: a
;
4395 shiftCount
= clz64(absA
);
4396 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4400 /*----------------------------------------------------------------------------
4401 | Returns the result of converting the 64-bit two's complement integer `a' to
4402 | the quadruple-precision floating-point format. The conversion is performed
4403 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4404 *----------------------------------------------------------------------------*/
4406 float128
int64_to_float128(int64_t a
, float_status
*status
)
4412 uint64_t zSig0
, zSig1
;
4414 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4416 absA
= zSign
? - a
: a
;
4417 shiftCount
= clz64(absA
) + 49;
4418 zExp
= 0x406E - shiftCount
;
4419 if ( 64 <= shiftCount
) {
4428 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4429 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4433 /*----------------------------------------------------------------------------
4434 | Returns the result of converting the 64-bit unsigned integer `a'
4435 | to the quadruple-precision floating-point format. The conversion is performed
4436 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4437 *----------------------------------------------------------------------------*/
4439 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
4442 return float128_zero
;
4444 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
4447 /*----------------------------------------------------------------------------
4448 | Returns the result of converting the single-precision floating-point value
4449 | `a' to the extended double-precision floating-point format. The conversion
4450 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4452 *----------------------------------------------------------------------------*/
4454 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
4460 a
= float32_squash_input_denormal(a
, status
);
4461 aSig
= extractFloat32Frac( a
);
4462 aExp
= extractFloat32Exp( a
);
4463 aSign
= extractFloat32Sign( a
);
4464 if ( aExp
== 0xFF ) {
4466 floatx80 res
= commonNaNToFloatx80(float32ToCommonNaN(a
, status
),
4468 return floatx80_silence_nan(res
, status
);
4470 return packFloatx80(aSign
,
4471 floatx80_infinity_high
,
4472 floatx80_infinity_low
);
4475 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4476 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4479 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
4483 /*----------------------------------------------------------------------------
4484 | Returns the result of converting the single-precision floating-point value
4485 | `a' to the double-precision floating-point format. The conversion is
4486 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4488 *----------------------------------------------------------------------------*/
4490 float128
float32_to_float128(float32 a
, float_status
*status
)
4496 a
= float32_squash_input_denormal(a
, status
);
4497 aSig
= extractFloat32Frac( a
);
4498 aExp
= extractFloat32Exp( a
);
4499 aSign
= extractFloat32Sign( a
);
4500 if ( aExp
== 0xFF ) {
4502 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
4504 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4507 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4508 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4511 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
4515 /*----------------------------------------------------------------------------
4516 | Returns the remainder of the single-precision floating-point value `a'
4517 | with respect to the corresponding value `b'. The operation is performed
4518 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4519 *----------------------------------------------------------------------------*/
4521 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
4524 int aExp
, bExp
, expDiff
;
4525 uint32_t aSig
, bSig
;
4527 uint64_t aSig64
, bSig64
, q64
;
4528 uint32_t alternateASig
;
4530 a
= float32_squash_input_denormal(a
, status
);
4531 b
= float32_squash_input_denormal(b
, status
);
4533 aSig
= extractFloat32Frac( a
);
4534 aExp
= extractFloat32Exp( a
);
4535 aSign
= extractFloat32Sign( a
);
4536 bSig
= extractFloat32Frac( b
);
4537 bExp
= extractFloat32Exp( b
);
4538 if ( aExp
== 0xFF ) {
4539 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
4540 return propagateFloat32NaN(a
, b
, status
);
4542 float_raise(float_flag_invalid
, status
);
4543 return float32_default_nan(status
);
4545 if ( bExp
== 0xFF ) {
4547 return propagateFloat32NaN(a
, b
, status
);
4553 float_raise(float_flag_invalid
, status
);
4554 return float32_default_nan(status
);
4556 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
4559 if ( aSig
== 0 ) return a
;
4560 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4562 expDiff
= aExp
- bExp
;
4565 if ( expDiff
< 32 ) {
4568 if ( expDiff
< 0 ) {
4569 if ( expDiff
< -1 ) return a
;
4572 q
= ( bSig
<= aSig
);
4573 if ( q
) aSig
-= bSig
;
4574 if ( 0 < expDiff
) {
4575 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
4578 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4586 if ( bSig
<= aSig
) aSig
-= bSig
;
4587 aSig64
= ( (uint64_t) aSig
)<<40;
4588 bSig64
= ( (uint64_t) bSig
)<<40;
4590 while ( 0 < expDiff
) {
4591 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
4592 q64
= ( 2 < q64
) ? q64
- 2 : 0;
4593 aSig64
= - ( ( bSig
* q64
)<<38 );
4597 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
4598 q64
= ( 2 < q64
) ? q64
- 2 : 0;
4599 q
= q64
>>( 64 - expDiff
);
4601 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
4604 alternateASig
= aSig
;
4607 } while ( 0 <= (int32_t) aSig
);
4608 sigMean
= aSig
+ alternateASig
;
4609 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4610 aSig
= alternateASig
;
4612 zSign
= ( (int32_t) aSig
< 0 );
4613 if ( zSign
) aSig
= - aSig
;
4614 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
4619 /*----------------------------------------------------------------------------
4620 | Returns the binary exponential of the single-precision floating-point value
4621 | `a'. The operation is performed according to the IEC/IEEE Standard for
4622 | Binary Floating-Point Arithmetic.
4624 | Uses the following identities:
4626 | 1. -------------------------------------------------------------------------
4630 | 2. -------------------------------------------------------------------------
4633 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4635 *----------------------------------------------------------------------------*/
4637 static const float64 float32_exp2_coefficients
[15] =
4639 const_float64( 0x3ff0000000000000ll
), /* 1 */
4640 const_float64( 0x3fe0000000000000ll
), /* 2 */
4641 const_float64( 0x3fc5555555555555ll
), /* 3 */
4642 const_float64( 0x3fa5555555555555ll
), /* 4 */
4643 const_float64( 0x3f81111111111111ll
), /* 5 */
4644 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
4645 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
4646 const_float64( 0x3efa01a01a01a01all
), /* 8 */
4647 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
4648 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
4649 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
4650 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
4651 const_float64( 0x3de6124613a86d09ll
), /* 13 */
4652 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
4653 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
4656 float32
float32_exp2(float32 a
, float_status
*status
)
4663 a
= float32_squash_input_denormal(a
, status
);
4665 aSig
= extractFloat32Frac( a
);
4666 aExp
= extractFloat32Exp( a
);
4667 aSign
= extractFloat32Sign( a
);
4669 if ( aExp
== 0xFF) {
4671 return propagateFloat32NaN(a
, float32_zero
, status
);
4673 return (aSign
) ? float32_zero
: a
;
4676 if (aSig
== 0) return float32_one
;
4679 float_raise(float_flag_inexact
, status
);
4681 /* ******************************* */
4682 /* using float64 for approximation */
4683 /* ******************************* */
4684 x
= float32_to_float64(a
, status
);
4685 x
= float64_mul(x
, float64_ln2
, status
);
4689 for (i
= 0 ; i
< 15 ; i
++) {
4692 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
4693 r
= float64_add(r
, f
, status
);
4695 xn
= float64_mul(xn
, x
, status
);
4698 return float64_to_float32(r
, status
);
4701 /*----------------------------------------------------------------------------
4702 | Returns the binary log of the single-precision floating-point value `a'.
4703 | The operation is performed according to the IEC/IEEE Standard for Binary
4704 | Floating-Point Arithmetic.
4705 *----------------------------------------------------------------------------*/
4706 float32
float32_log2(float32 a
, float_status
*status
)
4710 uint32_t aSig
, zSig
, i
;
4712 a
= float32_squash_input_denormal(a
, status
);
4713 aSig
= extractFloat32Frac( a
);
4714 aExp
= extractFloat32Exp( a
);
4715 aSign
= extractFloat32Sign( a
);
4718 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
4719 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4722 float_raise(float_flag_invalid
, status
);
4723 return float32_default_nan(status
);
4725 if ( aExp
== 0xFF ) {
4727 return propagateFloat32NaN(a
, float32_zero
, status
);
4737 for (i
= 1 << 22; i
> 0; i
>>= 1) {
4738 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
4739 if ( aSig
& 0x01000000 ) {
4748 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
4751 /*----------------------------------------------------------------------------
4752 | Returns the result of converting the double-precision floating-point value
4753 | `a' to the extended double-precision floating-point format. The conversion
4754 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4756 *----------------------------------------------------------------------------*/
4758 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
4764 a
= float64_squash_input_denormal(a
, status
);
4765 aSig
= extractFloat64Frac( a
);
4766 aExp
= extractFloat64Exp( a
);
4767 aSign
= extractFloat64Sign( a
);
4768 if ( aExp
== 0x7FF ) {
4770 floatx80 res
= commonNaNToFloatx80(float64ToCommonNaN(a
, status
),
4772 return floatx80_silence_nan(res
, status
);
4774 return packFloatx80(aSign
,
4775 floatx80_infinity_high
,
4776 floatx80_infinity_low
);
4779 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4780 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4784 aSign
, aExp
+ 0x3C00, (aSig
| UINT64_C(0x0010000000000000)) << 11);
4788 /*----------------------------------------------------------------------------
4789 | Returns the result of converting the double-precision floating-point value
4790 | `a' to the quadruple-precision floating-point format. The conversion is
4791 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4793 *----------------------------------------------------------------------------*/
4795 float128
float64_to_float128(float64 a
, float_status
*status
)
4799 uint64_t aSig
, zSig0
, zSig1
;
4801 a
= float64_squash_input_denormal(a
, status
);
4802 aSig
= extractFloat64Frac( a
);
4803 aExp
= extractFloat64Exp( a
);
4804 aSign
= extractFloat64Sign( a
);
4805 if ( aExp
== 0x7FF ) {
4807 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
4809 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4812 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4813 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4816 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
4817 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
4822 /*----------------------------------------------------------------------------
4823 | Returns the remainder of the double-precision floating-point value `a'
4824 | with respect to the corresponding value `b'. The operation is performed
4825 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4826 *----------------------------------------------------------------------------*/
4828 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
4831 int aExp
, bExp
, expDiff
;
4832 uint64_t aSig
, bSig
;
4833 uint64_t q
, alternateASig
;
4836 a
= float64_squash_input_denormal(a
, status
);
4837 b
= float64_squash_input_denormal(b
, status
);
4838 aSig
= extractFloat64Frac( a
);
4839 aExp
= extractFloat64Exp( a
);
4840 aSign
= extractFloat64Sign( a
);
4841 bSig
= extractFloat64Frac( b
);
4842 bExp
= extractFloat64Exp( b
);
4843 if ( aExp
== 0x7FF ) {
4844 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4845 return propagateFloat64NaN(a
, b
, status
);
4847 float_raise(float_flag_invalid
, status
);
4848 return float64_default_nan(status
);
4850 if ( bExp
== 0x7FF ) {
4852 return propagateFloat64NaN(a
, b
, status
);
4858 float_raise(float_flag_invalid
, status
);
4859 return float64_default_nan(status
);
4861 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4864 if ( aSig
== 0 ) return a
;
4865 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4867 expDiff
= aExp
- bExp
;
4868 aSig
= (aSig
| UINT64_C(0x0010000000000000)) << 11;
4869 bSig
= (bSig
| UINT64_C(0x0010000000000000)) << 11;
4870 if ( expDiff
< 0 ) {
4871 if ( expDiff
< -1 ) return a
;
4874 q
= ( bSig
<= aSig
);
4875 if ( q
) aSig
-= bSig
;
4877 while ( 0 < expDiff
) {
4878 q
= estimateDiv128To64( aSig
, 0, bSig
);
4879 q
= ( 2 < q
) ? q
- 2 : 0;
4880 aSig
= - ( ( bSig
>>2 ) * q
);
4884 if ( 0 < expDiff
) {
4885 q
= estimateDiv128To64( aSig
, 0, bSig
);
4886 q
= ( 2 < q
) ? q
- 2 : 0;
4889 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4896 alternateASig
= aSig
;
4899 } while ( 0 <= (int64_t) aSig
);
4900 sigMean
= aSig
+ alternateASig
;
4901 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4902 aSig
= alternateASig
;
4904 zSign
= ( (int64_t) aSig
< 0 );
4905 if ( zSign
) aSig
= - aSig
;
4906 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
4910 /*----------------------------------------------------------------------------
4911 | Returns the binary log of the double-precision floating-point value `a'.
4912 | The operation is performed according to the IEC/IEEE Standard for Binary
4913 | Floating-Point Arithmetic.
4914 *----------------------------------------------------------------------------*/
4915 float64
float64_log2(float64 a
, float_status
*status
)
4919 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
4920 a
= float64_squash_input_denormal(a
, status
);
4922 aSig
= extractFloat64Frac( a
);
4923 aExp
= extractFloat64Exp( a
);
4924 aSign
= extractFloat64Sign( a
);
4927 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
4928 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4931 float_raise(float_flag_invalid
, status
);
4932 return float64_default_nan(status
);
4934 if ( aExp
== 0x7FF ) {
4936 return propagateFloat64NaN(a
, float64_zero
, status
);
4942 aSig
|= UINT64_C(0x0010000000000000);
4944 zSig
= (uint64_t)aExp
<< 52;
4945 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
4946 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
4947 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
4948 if ( aSig
& UINT64_C(0x0020000000000000) ) {
4956 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
4959 /*----------------------------------------------------------------------------
4960 | Returns the result of converting the extended double-precision floating-
4961 | point value `a' to the 32-bit two's complement integer format. The
4962 | conversion is performed according to the IEC/IEEE Standard for Binary
4963 | Floating-Point Arithmetic---which means in particular that the conversion
4964 | is rounded according to the current rounding mode. If `a' is a NaN, the
4965 | largest positive integer is returned. Otherwise, if the conversion
4966 | overflows, the largest integer with the same sign as `a' is returned.
4967 *----------------------------------------------------------------------------*/
4969 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
4972 int32_t aExp
, shiftCount
;
4975 if (floatx80_invalid_encoding(a
)) {
4976 float_raise(float_flag_invalid
, status
);
4979 aSig
= extractFloatx80Frac( a
);
4980 aExp
= extractFloatx80Exp( a
);
4981 aSign
= extractFloatx80Sign( a
);
4982 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4983 shiftCount
= 0x4037 - aExp
;
4984 if ( shiftCount
<= 0 ) shiftCount
= 1;
4985 shift64RightJamming( aSig
, shiftCount
, &aSig
);
4986 return roundAndPackInt32(aSign
, aSig
, status
);
4990 /*----------------------------------------------------------------------------
4991 | Returns the result of converting the extended double-precision floating-
4992 | point value `a' to the 32-bit two's complement integer format. The
4993 | conversion is performed according to the IEC/IEEE Standard for Binary
4994 | Floating-Point Arithmetic, except that the conversion is always rounded
4995 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4996 | Otherwise, if the conversion overflows, the largest integer with the same
4997 | sign as `a' is returned.
4998 *----------------------------------------------------------------------------*/
5000 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5003 int32_t aExp
, shiftCount
;
5004 uint64_t aSig
, savedASig
;
5007 if (floatx80_invalid_encoding(a
)) {
5008 float_raise(float_flag_invalid
, status
);
5011 aSig
= extractFloatx80Frac( a
);
5012 aExp
= extractFloatx80Exp( a
);
5013 aSign
= extractFloatx80Sign( a
);
5014 if ( 0x401E < aExp
) {
5015 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5018 else if ( aExp
< 0x3FFF ) {
5020 status
->float_exception_flags
|= float_flag_inexact
;
5024 shiftCount
= 0x403E - aExp
;
5026 aSig
>>= shiftCount
;
5028 if ( aSign
) z
= - z
;
5029 if ( ( z
< 0 ) ^ aSign
) {
5031 float_raise(float_flag_invalid
, status
);
5032 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5034 if ( ( aSig
<<shiftCount
) != savedASig
) {
5035 status
->float_exception_flags
|= float_flag_inexact
;
5041 /*----------------------------------------------------------------------------
5042 | Returns the result of converting the extended double-precision floating-
5043 | point value `a' to the 64-bit two's complement integer format. The
5044 | conversion is performed according to the IEC/IEEE Standard for Binary
5045 | Floating-Point Arithmetic---which means in particular that the conversion
5046 | is rounded according to the current rounding mode. If `a' is a NaN,
5047 | the largest positive integer is returned. Otherwise, if the conversion
5048 | overflows, the largest integer with the same sign as `a' is returned.
5049 *----------------------------------------------------------------------------*/
5051 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5054 int32_t aExp
, shiftCount
;
5055 uint64_t aSig
, aSigExtra
;
5057 if (floatx80_invalid_encoding(a
)) {
5058 float_raise(float_flag_invalid
, status
);
5061 aSig
= extractFloatx80Frac( a
);
5062 aExp
= extractFloatx80Exp( a
);
5063 aSign
= extractFloatx80Sign( a
);
5064 shiftCount
= 0x403E - aExp
;
5065 if ( shiftCount
<= 0 ) {
5067 float_raise(float_flag_invalid
, status
);
5068 if (!aSign
|| floatx80_is_any_nan(a
)) {
5076 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5078 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5082 /*----------------------------------------------------------------------------
5083 | Returns the result of converting the extended double-precision floating-
5084 | point value `a' to the 64-bit two's complement integer format. The
5085 | conversion is performed according to the IEC/IEEE Standard for Binary
5086 | Floating-Point Arithmetic, except that the conversion is always rounded
5087 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5088 | Otherwise, if the conversion overflows, the largest integer with the same
5089 | sign as `a' is returned.
5090 *----------------------------------------------------------------------------*/
5092 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5095 int32_t aExp
, shiftCount
;
5099 if (floatx80_invalid_encoding(a
)) {
5100 float_raise(float_flag_invalid
, status
);
5103 aSig
= extractFloatx80Frac( a
);
5104 aExp
= extractFloatx80Exp( a
);
5105 aSign
= extractFloatx80Sign( a
);
5106 shiftCount
= aExp
- 0x403E;
5107 if ( 0 <= shiftCount
) {
5108 aSig
&= UINT64_C(0x7FFFFFFFFFFFFFFF);
5109 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5110 float_raise(float_flag_invalid
, status
);
5111 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5117 else if ( aExp
< 0x3FFF ) {
5119 status
->float_exception_flags
|= float_flag_inexact
;
5123 z
= aSig
>>( - shiftCount
);
5124 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5125 status
->float_exception_flags
|= float_flag_inexact
;
5127 if ( aSign
) z
= - z
;
5132 /*----------------------------------------------------------------------------
5133 | Returns the result of converting the extended double-precision floating-
5134 | point value `a' to the single-precision floating-point format. The
5135 | conversion is performed according to the IEC/IEEE Standard for Binary
5136 | Floating-Point Arithmetic.
5137 *----------------------------------------------------------------------------*/
5139 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5145 if (floatx80_invalid_encoding(a
)) {
5146 float_raise(float_flag_invalid
, status
);
5147 return float32_default_nan(status
);
5149 aSig
= extractFloatx80Frac( a
);
5150 aExp
= extractFloatx80Exp( a
);
5151 aSign
= extractFloatx80Sign( a
);
5152 if ( aExp
== 0x7FFF ) {
5153 if ( (uint64_t) ( aSig
<<1 ) ) {
5154 float32 res
= commonNaNToFloat32(floatx80ToCommonNaN(a
, status
),
5156 return float32_silence_nan(res
, status
);
5158 return packFloat32( aSign
, 0xFF, 0 );
5160 shift64RightJamming( aSig
, 33, &aSig
);
5161 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5162 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5166 /*----------------------------------------------------------------------------
5167 | Returns the result of converting the extended double-precision floating-
5168 | point value `a' to the double-precision floating-point format. The
5169 | conversion is performed according to the IEC/IEEE Standard for Binary
5170 | Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5173 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5177 uint64_t aSig
, zSig
;
5179 if (floatx80_invalid_encoding(a
)) {
5180 float_raise(float_flag_invalid
, status
);
5181 return float64_default_nan(status
);
5183 aSig
= extractFloatx80Frac( a
);
5184 aExp
= extractFloatx80Exp( a
);
5185 aSign
= extractFloatx80Sign( a
);
5186 if ( aExp
== 0x7FFF ) {
5187 if ( (uint64_t) ( aSig
<<1 ) ) {
5188 float64 res
= commonNaNToFloat64(floatx80ToCommonNaN(a
, status
),
5190 return float64_silence_nan(res
, status
);
5192 return packFloat64( aSign
, 0x7FF, 0 );
5194 shift64RightJamming( aSig
, 1, &zSig
);
5195 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5196 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5200 /*----------------------------------------------------------------------------
5201 | Returns the result of converting the extended double-precision floating-
5202 | point value `a' to the quadruple-precision floating-point format. The
5203 | conversion is performed according to the IEC/IEEE Standard for Binary
5204 | Floating-Point Arithmetic.
5205 *----------------------------------------------------------------------------*/
5207 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5211 uint64_t aSig
, zSig0
, zSig1
;
5213 if (floatx80_invalid_encoding(a
)) {
5214 float_raise(float_flag_invalid
, status
);
5215 return float128_default_nan(status
);
5217 aSig
= extractFloatx80Frac( a
);
5218 aExp
= extractFloatx80Exp( a
);
5219 aSign
= extractFloatx80Sign( a
);
5220 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5221 float128 res
= commonNaNToFloat128(floatx80ToCommonNaN(a
, status
),
5223 return float128_silence_nan(res
, status
);
5225 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5226 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5230 /*----------------------------------------------------------------------------
5231 | Rounds the extended double-precision floating-point value `a'
5232 | to the precision provided by floatx80_rounding_precision and returns the
5233 | result as an extended double-precision floating-point value.
5234 | The operation is performed according to the IEC/IEEE Standard for Binary
5235 | Floating-Point Arithmetic.
5236 *----------------------------------------------------------------------------*/
5238 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5240 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5241 extractFloatx80Sign(a
),
5242 extractFloatx80Exp(a
),
5243 extractFloatx80Frac(a
), 0, status
);
5246 /*----------------------------------------------------------------------------
5247 | Rounds the extended double-precision floating-point value `a' to an integer,
5248 | and returns the result as an extended quadruple-precision floating-point
5249 | value. The operation is performed according to the IEC/IEEE Standard for
5250 | Binary Floating-Point Arithmetic.
5251 *----------------------------------------------------------------------------*/
5253 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5257 uint64_t lastBitMask
, roundBitsMask
;
5260 if (floatx80_invalid_encoding(a
)) {
5261 float_raise(float_flag_invalid
, status
);
5262 return floatx80_default_nan(status
);
5264 aExp
= extractFloatx80Exp( a
);
5265 if ( 0x403E <= aExp
) {
5266 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5267 return propagateFloatx80NaN(a
, a
, status
);
5271 if ( aExp
< 0x3FFF ) {
5273 && ( (uint64_t) ( extractFloatx80Frac( a
) ) == 0 ) ) {
5276 status
->float_exception_flags
|= float_flag_inexact
;
5277 aSign
= extractFloatx80Sign( a
);
5278 switch (status
->float_rounding_mode
) {
5279 case float_round_nearest_even
:
5280 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5283 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5286 case float_round_ties_away
:
5287 if (aExp
== 0x3FFE) {
5288 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5291 case float_round_down
:
5294 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5295 : packFloatx80( 0, 0, 0 );
5296 case float_round_up
:
5298 aSign
? packFloatx80( 1, 0, 0 )
5299 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5301 case float_round_to_zero
:
5304 g_assert_not_reached();
5306 return packFloatx80( aSign
, 0, 0 );
5309 lastBitMask
<<= 0x403E - aExp
;
5310 roundBitsMask
= lastBitMask
- 1;
5312 switch (status
->float_rounding_mode
) {
5313 case float_round_nearest_even
:
5314 z
.low
+= lastBitMask
>>1;
5315 if ((z
.low
& roundBitsMask
) == 0) {
5316 z
.low
&= ~lastBitMask
;
5319 case float_round_ties_away
:
5320 z
.low
+= lastBitMask
>> 1;
5322 case float_round_to_zero
:
5324 case float_round_up
:
5325 if (!extractFloatx80Sign(z
)) {
5326 z
.low
+= roundBitsMask
;
5329 case float_round_down
:
5330 if (extractFloatx80Sign(z
)) {
5331 z
.low
+= roundBitsMask
;
5337 z
.low
&= ~ roundBitsMask
;
5340 z
.low
= UINT64_C(0x8000000000000000);
5342 if (z
.low
!= a
.low
) {
5343 status
->float_exception_flags
|= float_flag_inexact
;
5349 /*----------------------------------------------------------------------------
5350 | Returns the result of adding the absolute values of the extended double-
5351 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5352 | negated before being returned. `zSign' is ignored if the result is a NaN.
5353 | The addition is performed according to the IEC/IEEE Standard for Binary
5354 | Floating-Point Arithmetic.
5355 *----------------------------------------------------------------------------*/
5357 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5358 float_status
*status
)
5360 int32_t aExp
, bExp
, zExp
;
5361 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5364 aSig
= extractFloatx80Frac( a
);
5365 aExp
= extractFloatx80Exp( a
);
5366 bSig
= extractFloatx80Frac( b
);
5367 bExp
= extractFloatx80Exp( b
);
5368 expDiff
= aExp
- bExp
;
5369 if ( 0 < expDiff
) {
5370 if ( aExp
== 0x7FFF ) {
5371 if ((uint64_t)(aSig
<< 1)) {
5372 return propagateFloatx80NaN(a
, b
, status
);
5376 if ( bExp
== 0 ) --expDiff
;
5377 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5380 else if ( expDiff
< 0 ) {
5381 if ( bExp
== 0x7FFF ) {
5382 if ((uint64_t)(bSig
<< 1)) {
5383 return propagateFloatx80NaN(a
, b
, status
);
5385 return packFloatx80(zSign
,
5386 floatx80_infinity_high
,
5387 floatx80_infinity_low
);
5389 if ( aExp
== 0 ) ++expDiff
;
5390 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5394 if ( aExp
== 0x7FFF ) {
5395 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5396 return propagateFloatx80NaN(a
, b
, status
);
5401 zSig0
= aSig
+ bSig
;
5403 if ((aSig
| bSig
) & UINT64_C(0x8000000000000000) && zSig0
< aSig
) {
5404 /* At least one of the values is a pseudo-denormal,
5405 * and there is a carry out of the result. */
5410 return packFloatx80(zSign
, 0, 0);
5412 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5418 zSig0
= aSig
+ bSig
;
5419 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5421 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5422 zSig0
|= UINT64_C(0x8000000000000000);
5425 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5426 zSign
, zExp
, zSig0
, zSig1
, status
);
5429 /*----------------------------------------------------------------------------
5430 | Returns the result of subtracting the absolute values of the extended
5431 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5432 | difference is negated before being returned. `zSign' is ignored if the
5433 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5434 | Standard for Binary Floating-Point Arithmetic.
5435 *----------------------------------------------------------------------------*/
5437 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5438 float_status
*status
)
5440 int32_t aExp
, bExp
, zExp
;
5441 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5444 aSig
= extractFloatx80Frac( a
);
5445 aExp
= extractFloatx80Exp( a
);
5446 bSig
= extractFloatx80Frac( b
);
5447 bExp
= extractFloatx80Exp( b
);
5448 expDiff
= aExp
- bExp
;
5449 if ( 0 < expDiff
) goto aExpBigger
;
5450 if ( expDiff
< 0 ) goto bExpBigger
;
5451 if ( aExp
== 0x7FFF ) {
5452 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5453 return propagateFloatx80NaN(a
, b
, status
);
5455 float_raise(float_flag_invalid
, status
);
5456 return floatx80_default_nan(status
);
5463 if ( bSig
< aSig
) goto aBigger
;
5464 if ( aSig
< bSig
) goto bBigger
;
5465 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5467 if ( bExp
== 0x7FFF ) {
5468 if ((uint64_t)(bSig
<< 1)) {
5469 return propagateFloatx80NaN(a
, b
, status
);
5471 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
5472 floatx80_infinity_low
);
5474 if ( aExp
== 0 ) ++expDiff
;
5475 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5477 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5480 goto normalizeRoundAndPack
;
5482 if ( aExp
== 0x7FFF ) {
5483 if ((uint64_t)(aSig
<< 1)) {
5484 return propagateFloatx80NaN(a
, b
, status
);
5488 if ( bExp
== 0 ) --expDiff
;
5489 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5491 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5493 normalizeRoundAndPack
:
5494 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5495 zSign
, zExp
, zSig0
, zSig1
, status
);
5498 /*----------------------------------------------------------------------------
5499 | Returns the result of adding the extended double-precision floating-point
5500 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5501 | Standard for Binary Floating-Point Arithmetic.
5502 *----------------------------------------------------------------------------*/
5504 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5508 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5509 float_raise(float_flag_invalid
, status
);
5510 return floatx80_default_nan(status
);
5512 aSign
= extractFloatx80Sign( a
);
5513 bSign
= extractFloatx80Sign( b
);
5514 if ( aSign
== bSign
) {
5515 return addFloatx80Sigs(a
, b
, aSign
, status
);
5518 return subFloatx80Sigs(a
, b
, aSign
, status
);
5523 /*----------------------------------------------------------------------------
5524 | Returns the result of subtracting the extended double-precision floating-
5525 | point values `a' and `b'. The operation is performed according to the
5526 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5527 *----------------------------------------------------------------------------*/
5529 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5533 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5534 float_raise(float_flag_invalid
, status
);
5535 return floatx80_default_nan(status
);
5537 aSign
= extractFloatx80Sign( a
);
5538 bSign
= extractFloatx80Sign( b
);
5539 if ( aSign
== bSign
) {
5540 return subFloatx80Sigs(a
, b
, aSign
, status
);
5543 return addFloatx80Sigs(a
, b
, aSign
, status
);
5548 /*----------------------------------------------------------------------------
5549 | Returns the result of multiplying the extended double-precision floating-
5550 | point values `a' and `b'. The operation is performed according to the
5551 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5552 *----------------------------------------------------------------------------*/
5554 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5556 bool aSign
, bSign
, zSign
;
5557 int32_t aExp
, bExp
, zExp
;
5558 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5560 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5561 float_raise(float_flag_invalid
, status
);
5562 return floatx80_default_nan(status
);
5564 aSig
= extractFloatx80Frac( a
);
5565 aExp
= extractFloatx80Exp( a
);
5566 aSign
= extractFloatx80Sign( a
);
5567 bSig
= extractFloatx80Frac( b
);
5568 bExp
= extractFloatx80Exp( b
);
5569 bSign
= extractFloatx80Sign( b
);
5570 zSign
= aSign
^ bSign
;
5571 if ( aExp
== 0x7FFF ) {
5572 if ( (uint64_t) ( aSig
<<1 )
5573 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5574 return propagateFloatx80NaN(a
, b
, status
);
5576 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5577 return packFloatx80(zSign
, floatx80_infinity_high
,
5578 floatx80_infinity_low
);
5580 if ( bExp
== 0x7FFF ) {
5581 if ((uint64_t)(bSig
<< 1)) {
5582 return propagateFloatx80NaN(a
, b
, status
);
5584 if ( ( aExp
| aSig
) == 0 ) {
5586 float_raise(float_flag_invalid
, status
);
5587 return floatx80_default_nan(status
);
5589 return packFloatx80(zSign
, floatx80_infinity_high
,
5590 floatx80_infinity_low
);
5593 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5594 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5597 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5598 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5600 zExp
= aExp
+ bExp
- 0x3FFE;
5601 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5602 if ( 0 < (int64_t) zSig0
) {
5603 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5606 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5607 zSign
, zExp
, zSig0
, zSig1
, status
);
5610 /*----------------------------------------------------------------------------
5611 | Returns the result of dividing the extended double-precision floating-point
5612 | value `a' by the corresponding value `b'. The operation is performed
5613 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5614 *----------------------------------------------------------------------------*/
5616 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
5618 bool aSign
, bSign
, zSign
;
5619 int32_t aExp
, bExp
, zExp
;
5620 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5621 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
5623 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5624 float_raise(float_flag_invalid
, status
);
5625 return floatx80_default_nan(status
);
5627 aSig
= extractFloatx80Frac( a
);
5628 aExp
= extractFloatx80Exp( a
);
5629 aSign
= extractFloatx80Sign( a
);
5630 bSig
= extractFloatx80Frac( b
);
5631 bExp
= extractFloatx80Exp( b
);
5632 bSign
= extractFloatx80Sign( b
);
5633 zSign
= aSign
^ bSign
;
5634 if ( aExp
== 0x7FFF ) {
5635 if ((uint64_t)(aSig
<< 1)) {
5636 return propagateFloatx80NaN(a
, b
, status
);
5638 if ( bExp
== 0x7FFF ) {
5639 if ((uint64_t)(bSig
<< 1)) {
5640 return propagateFloatx80NaN(a
, b
, status
);
5644 return packFloatx80(zSign
, floatx80_infinity_high
,
5645 floatx80_infinity_low
);
5647 if ( bExp
== 0x7FFF ) {
5648 if ((uint64_t)(bSig
<< 1)) {
5649 return propagateFloatx80NaN(a
, b
, status
);
5651 return packFloatx80( zSign
, 0, 0 );
5655 if ( ( aExp
| aSig
) == 0 ) {
5657 float_raise(float_flag_invalid
, status
);
5658 return floatx80_default_nan(status
);
5660 float_raise(float_flag_divbyzero
, status
);
5661 return packFloatx80(zSign
, floatx80_infinity_high
,
5662 floatx80_infinity_low
);
5664 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5667 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5668 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5670 zExp
= aExp
- bExp
+ 0x3FFE;
5672 if ( bSig
<= aSig
) {
5673 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
5676 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
5677 mul64To128( bSig
, zSig0
, &term0
, &term1
);
5678 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
5679 while ( (int64_t) rem0
< 0 ) {
5681 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
5683 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
5684 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
5685 mul64To128( bSig
, zSig1
, &term1
, &term2
);
5686 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5687 while ( (int64_t) rem1
< 0 ) {
5689 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
5691 zSig1
|= ( ( rem1
| rem2
) != 0 );
5693 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5694 zSign
, zExp
, zSig0
, zSig1
, status
);
5697 /*----------------------------------------------------------------------------
5698 | Returns the remainder of the extended double-precision floating-point value
5699 | `a' with respect to the corresponding value `b'. The operation is performed
5700 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
5701 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
5702 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
5703 | the absolute value of the integer quotient.
5704 *----------------------------------------------------------------------------*/
5706 floatx80
floatx80_modrem(floatx80 a
, floatx80 b
, bool mod
, uint64_t *quotient
,
5707 float_status
*status
)
5710 int32_t aExp
, bExp
, expDiff
, aExpOrig
;
5711 uint64_t aSig0
, aSig1
, bSig
;
5712 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
5715 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5716 float_raise(float_flag_invalid
, status
);
5717 return floatx80_default_nan(status
);
5719 aSig0
= extractFloatx80Frac( a
);
5720 aExpOrig
= aExp
= extractFloatx80Exp( a
);
5721 aSign
= extractFloatx80Sign( a
);
5722 bSig
= extractFloatx80Frac( b
);
5723 bExp
= extractFloatx80Exp( b
);
5724 if ( aExp
== 0x7FFF ) {
5725 if ( (uint64_t) ( aSig0
<<1 )
5726 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5727 return propagateFloatx80NaN(a
, b
, status
);
5731 if ( bExp
== 0x7FFF ) {
5732 if ((uint64_t)(bSig
<< 1)) {
5733 return propagateFloatx80NaN(a
, b
, status
);
5735 if (aExp
== 0 && aSig0
>> 63) {
5737 * Pseudo-denormal argument must be returned in normalized
5740 return packFloatx80(aSign
, 1, aSig0
);
5747 float_raise(float_flag_invalid
, status
);
5748 return floatx80_default_nan(status
);
5750 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5753 if ( aSig0
== 0 ) return a
;
5754 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5757 expDiff
= aExp
- bExp
;
5759 if ( expDiff
< 0 ) {
5760 if ( mod
|| expDiff
< -1 ) {
5761 if (aExp
== 1 && aExpOrig
== 0) {
5763 * Pseudo-denormal argument must be returned in
5766 return packFloatx80(aSign
, aExp
, aSig0
);
5770 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
5773 *quotient
= q
= ( bSig
<= aSig0
);
5774 if ( q
) aSig0
-= bSig
;
5776 while ( 0 < expDiff
) {
5777 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5778 q
= ( 2 < q
) ? q
- 2 : 0;
5779 mul64To128( bSig
, q
, &term0
, &term1
);
5780 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5781 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
5787 if ( 0 < expDiff
) {
5788 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5789 q
= ( 2 < q
) ? q
- 2 : 0;
5791 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
5792 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5793 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
5794 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
5796 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5799 *quotient
<<= expDiff
;
5810 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
5811 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5812 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5815 aSig0
= alternateASig0
;
5816 aSig1
= alternateASig1
;
5822 normalizeRoundAndPackFloatx80(
5823 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
5827 /*----------------------------------------------------------------------------
5828 | Returns the remainder of the extended double-precision floating-point value
5829 | `a' with respect to the corresponding value `b'. The operation is performed
5830 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5831 *----------------------------------------------------------------------------*/
5833 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
5836 return floatx80_modrem(a
, b
, false, "ient
, status
);
5839 /*----------------------------------------------------------------------------
5840 | Returns the remainder of the extended double-precision floating-point value
5841 | `a' with respect to the corresponding value `b', with the quotient truncated
5843 *----------------------------------------------------------------------------*/
5845 floatx80
floatx80_mod(floatx80 a
, floatx80 b
, float_status
*status
)
5848 return floatx80_modrem(a
, b
, true, "ient
, status
);
5851 /*----------------------------------------------------------------------------
5852 | Returns the square root of the extended double-precision floating-point
5853 | value `a'. The operation is performed according to the IEC/IEEE Standard
5854 | for Binary Floating-Point Arithmetic.
5855 *----------------------------------------------------------------------------*/
5857 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
5861 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
5862 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5864 if (floatx80_invalid_encoding(a
)) {
5865 float_raise(float_flag_invalid
, status
);
5866 return floatx80_default_nan(status
);
5868 aSig0
= extractFloatx80Frac( a
);
5869 aExp
= extractFloatx80Exp( a
);
5870 aSign
= extractFloatx80Sign( a
);
5871 if ( aExp
== 0x7FFF ) {
5872 if ((uint64_t)(aSig0
<< 1)) {
5873 return propagateFloatx80NaN(a
, a
, status
);
5875 if ( ! aSign
) return a
;
5879 if ( ( aExp
| aSig0
) == 0 ) return a
;
5881 float_raise(float_flag_invalid
, status
);
5882 return floatx80_default_nan(status
);
5885 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
5886 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5888 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
5889 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
5890 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
5891 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5892 doubleZSig0
= zSig0
<<1;
5893 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5894 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5895 while ( (int64_t) rem0
< 0 ) {
5898 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5900 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5901 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
5902 if ( zSig1
== 0 ) zSig1
= 1;
5903 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5904 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5905 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5906 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5907 while ( (int64_t) rem1
< 0 ) {
5909 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5911 term2
|= doubleZSig0
;
5912 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5914 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5916 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
5917 zSig0
|= doubleZSig0
;
5918 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5919 0, zExp
, zSig0
, zSig1
, status
);
5922 /*----------------------------------------------------------------------------
5923 | Returns the result of converting the quadruple-precision floating-point
5924 | value `a' to the 32-bit two's complement integer format. The conversion
5925 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5926 | Arithmetic---which means in particular that the conversion is rounded
5927 | according to the current rounding mode. If `a' is a NaN, the largest
5928 | positive integer is returned. Otherwise, if the conversion overflows, the
5929 | largest integer with the same sign as `a' is returned.
5930 *----------------------------------------------------------------------------*/
5932 int32_t float128_to_int32(float128 a
, float_status
*status
)
5935 int32_t aExp
, shiftCount
;
5936 uint64_t aSig0
, aSig1
;
5938 aSig1
= extractFloat128Frac1( a
);
5939 aSig0
= extractFloat128Frac0( a
);
5940 aExp
= extractFloat128Exp( a
);
5941 aSign
= extractFloat128Sign( a
);
5942 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
5943 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
5944 aSig0
|= ( aSig1
!= 0 );
5945 shiftCount
= 0x4028 - aExp
;
5946 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
5947 return roundAndPackInt32(aSign
, aSig0
, status
);
5951 /*----------------------------------------------------------------------------
5952 | Returns the result of converting the quadruple-precision floating-point
5953 | value `a' to the 32-bit two's complement integer format. The conversion
5954 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5955 | Arithmetic, except that the conversion is always rounded toward zero. If
5956 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5957 | conversion overflows, the largest integer with the same sign as `a' is
5959 *----------------------------------------------------------------------------*/
5961 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
5964 int32_t aExp
, shiftCount
;
5965 uint64_t aSig0
, aSig1
, savedASig
;
5968 aSig1
= extractFloat128Frac1( a
);
5969 aSig0
= extractFloat128Frac0( a
);
5970 aExp
= extractFloat128Exp( a
);
5971 aSign
= extractFloat128Sign( a
);
5972 aSig0
|= ( aSig1
!= 0 );
5973 if ( 0x401E < aExp
) {
5974 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
5977 else if ( aExp
< 0x3FFF ) {
5978 if (aExp
|| aSig0
) {
5979 status
->float_exception_flags
|= float_flag_inexact
;
5983 aSig0
|= UINT64_C(0x0001000000000000);
5984 shiftCount
= 0x402F - aExp
;
5986 aSig0
>>= shiftCount
;
5988 if ( aSign
) z
= - z
;
5989 if ( ( z
< 0 ) ^ aSign
) {
5991 float_raise(float_flag_invalid
, status
);
5992 return aSign
? INT32_MIN
: INT32_MAX
;
5994 if ( ( aSig0
<<shiftCount
) != savedASig
) {
5995 status
->float_exception_flags
|= float_flag_inexact
;
6001 /*----------------------------------------------------------------------------
6002 | Returns the result of converting the quadruple-precision floating-point
6003 | value `a' to the 64-bit two's complement integer format. The conversion
6004 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6005 | Arithmetic---which means in particular that the conversion is rounded
6006 | according to the current rounding mode. If `a' is a NaN, the largest
6007 | positive integer is returned. Otherwise, if the conversion overflows, the
6008 | largest integer with the same sign as `a' is returned.
6009 *----------------------------------------------------------------------------*/
6011 int64_t float128_to_int64(float128 a
, float_status
*status
)
6014 int32_t aExp
, shiftCount
;
6015 uint64_t aSig0
, aSig1
;
6017 aSig1
= extractFloat128Frac1( a
);
6018 aSig0
= extractFloat128Frac0( a
);
6019 aExp
= extractFloat128Exp( a
);
6020 aSign
= extractFloat128Sign( a
);
6021 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6022 shiftCount
= 0x402F - aExp
;
6023 if ( shiftCount
<= 0 ) {
6024 if ( 0x403E < aExp
) {
6025 float_raise(float_flag_invalid
, status
);
6027 || ( ( aExp
== 0x7FFF )
6028 && ( aSig1
|| ( aSig0
!= UINT64_C(0x0001000000000000) ) )
6035 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6038 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6040 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6044 /*----------------------------------------------------------------------------
6045 | Returns the result of converting the quadruple-precision floating-point
6046 | value `a' to the 64-bit two's complement integer format. The conversion
6047 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6048 | Arithmetic, except that the conversion is always rounded toward zero.
6049 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6050 | the conversion overflows, the largest integer with the same sign as `a' is
6052 *----------------------------------------------------------------------------*/
6054 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6057 int32_t aExp
, shiftCount
;
6058 uint64_t aSig0
, aSig1
;
6061 aSig1
= extractFloat128Frac1( a
);
6062 aSig0
= extractFloat128Frac0( a
);
6063 aExp
= extractFloat128Exp( a
);
6064 aSign
= extractFloat128Sign( a
);
6065 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6066 shiftCount
= aExp
- 0x402F;
6067 if ( 0 < shiftCount
) {
6068 if ( 0x403E <= aExp
) {
6069 aSig0
&= UINT64_C(0x0000FFFFFFFFFFFF);
6070 if ( ( a
.high
== UINT64_C(0xC03E000000000000) )
6071 && ( aSig1
< UINT64_C(0x0002000000000000) ) ) {
6073 status
->float_exception_flags
|= float_flag_inexact
;
6077 float_raise(float_flag_invalid
, status
);
6078 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6084 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6085 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6086 status
->float_exception_flags
|= float_flag_inexact
;
6090 if ( aExp
< 0x3FFF ) {
6091 if ( aExp
| aSig0
| aSig1
) {
6092 status
->float_exception_flags
|= float_flag_inexact
;
6096 z
= aSig0
>>( - shiftCount
);
6098 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6099 status
->float_exception_flags
|= float_flag_inexact
;
6102 if ( aSign
) z
= - z
;
6107 /*----------------------------------------------------------------------------
6108 | Returns the result of converting the quadruple-precision floating-point value
6109 | `a' to the 64-bit unsigned integer format. The conversion is
6110 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6111 | Arithmetic---which means in particular that the conversion is rounded
6112 | according to the current rounding mode. If `a' is a NaN, the largest
6113 | positive integer is returned. If the conversion overflows, the
6114 | largest unsigned integer is returned. If 'a' is negative, the value is
6115 | rounded and zero is returned; negative values that do not round to zero
6116 | will raise the inexact exception.
6117 *----------------------------------------------------------------------------*/
6119 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6124 uint64_t aSig0
, aSig1
;
6126 aSig0
= extractFloat128Frac0(a
);
6127 aSig1
= extractFloat128Frac1(a
);
6128 aExp
= extractFloat128Exp(a
);
6129 aSign
= extractFloat128Sign(a
);
6130 if (aSign
&& (aExp
> 0x3FFE)) {
6131 float_raise(float_flag_invalid
, status
);
6132 if (float128_is_any_nan(a
)) {
6139 aSig0
|= UINT64_C(0x0001000000000000);
6141 shiftCount
= 0x402F - aExp
;
6142 if (shiftCount
<= 0) {
6143 if (0x403E < aExp
) {
6144 float_raise(float_flag_invalid
, status
);
6147 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6149 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6151 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6154 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6157 signed char current_rounding_mode
= status
->float_rounding_mode
;
6159 set_float_rounding_mode(float_round_to_zero
, status
);
6160 v
= float128_to_uint64(a
, status
);
6161 set_float_rounding_mode(current_rounding_mode
, status
);
6166 /*----------------------------------------------------------------------------
6167 | Returns the result of converting the quadruple-precision floating-point
6168 | value `a' to the 32-bit unsigned integer format. The conversion
6169 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6170 | Arithmetic except that the conversion is always rounded toward zero.
6171 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6172 | if the conversion overflows, the largest unsigned integer is returned.
6173 | If 'a' is negative, the value is rounded and zero is returned; negative
6174 | values that do not round to zero will raise the inexact exception.
6175 *----------------------------------------------------------------------------*/
6177 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6181 int old_exc_flags
= get_float_exception_flags(status
);
6183 v
= float128_to_uint64_round_to_zero(a
, status
);
6184 if (v
> 0xffffffff) {
6189 set_float_exception_flags(old_exc_flags
, status
);
6190 float_raise(float_flag_invalid
, status
);
6194 /*----------------------------------------------------------------------------
6195 | Returns the result of converting the quadruple-precision floating-point value
6196 | `a' to the 32-bit unsigned integer format. The conversion is
6197 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6198 | Arithmetic---which means in particular that the conversion is rounded
6199 | according to the current rounding mode. If `a' is a NaN, the largest
6200 | positive integer is returned. If the conversion overflows, the
6201 | largest unsigned integer is returned. If 'a' is negative, the value is
6202 | rounded and zero is returned; negative values that do not round to zero
6203 | will raise the inexact exception.
6204 *----------------------------------------------------------------------------*/
6206 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6210 int old_exc_flags
= get_float_exception_flags(status
);
6212 v
= float128_to_uint64(a
, status
);
6213 if (v
> 0xffffffff) {
6218 set_float_exception_flags(old_exc_flags
, status
);
6219 float_raise(float_flag_invalid
, status
);
6223 /*----------------------------------------------------------------------------
6224 | Returns the result of converting the quadruple-precision floating-point
6225 | value `a' to the single-precision floating-point format. The conversion
6226 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6228 *----------------------------------------------------------------------------*/
6230 float32
float128_to_float32(float128 a
, float_status
*status
)
6234 uint64_t aSig0
, aSig1
;
6237 aSig1
= extractFloat128Frac1( a
);
6238 aSig0
= extractFloat128Frac0( a
);
6239 aExp
= extractFloat128Exp( a
);
6240 aSign
= extractFloat128Sign( a
);
6241 if ( aExp
== 0x7FFF ) {
6242 if ( aSig0
| aSig1
) {
6243 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6245 return packFloat32( aSign
, 0xFF, 0 );
6247 aSig0
|= ( aSig1
!= 0 );
6248 shift64RightJamming( aSig0
, 18, &aSig0
);
6250 if ( aExp
|| zSig
) {
6254 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6258 /*----------------------------------------------------------------------------
6259 | Returns the result of converting the quadruple-precision floating-point
6260 | value `a' to the double-precision floating-point format. The conversion
6261 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6263 *----------------------------------------------------------------------------*/
6265 float64
float128_to_float64(float128 a
, float_status
*status
)
6269 uint64_t aSig0
, aSig1
;
6271 aSig1
= extractFloat128Frac1( a
);
6272 aSig0
= extractFloat128Frac0( a
);
6273 aExp
= extractFloat128Exp( a
);
6274 aSign
= extractFloat128Sign( a
);
6275 if ( aExp
== 0x7FFF ) {
6276 if ( aSig0
| aSig1
) {
6277 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6279 return packFloat64( aSign
, 0x7FF, 0 );
6281 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6282 aSig0
|= ( aSig1
!= 0 );
6283 if ( aExp
|| aSig0
) {
6284 aSig0
|= UINT64_C(0x4000000000000000);
6287 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6291 /*----------------------------------------------------------------------------
6292 | Returns the result of converting the quadruple-precision floating-point
6293 | value `a' to the extended double-precision floating-point format. The
6294 | conversion is performed according to the IEC/IEEE Standard for Binary
6295 | Floating-Point Arithmetic.
6296 *----------------------------------------------------------------------------*/
6298 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6302 uint64_t aSig0
, aSig1
;
6304 aSig1
= extractFloat128Frac1( a
);
6305 aSig0
= extractFloat128Frac0( a
);
6306 aExp
= extractFloat128Exp( a
);
6307 aSign
= extractFloat128Sign( a
);
6308 if ( aExp
== 0x7FFF ) {
6309 if ( aSig0
| aSig1
) {
6310 floatx80 res
= commonNaNToFloatx80(float128ToCommonNaN(a
, status
),
6312 return floatx80_silence_nan(res
, status
);
6314 return packFloatx80(aSign
, floatx80_infinity_high
,
6315 floatx80_infinity_low
);
6318 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6319 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6322 aSig0
|= UINT64_C(0x0001000000000000);
6324 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6325 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6329 /*----------------------------------------------------------------------------
6330 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6331 | returns the result as a quadruple-precision floating-point value. The
6332 | operation is performed according to the IEC/IEEE Standard for Binary
6333 | Floating-Point Arithmetic.
6334 *----------------------------------------------------------------------------*/
6336 float128
float128_round_to_int(float128 a
, float_status
*status
)
6340 uint64_t lastBitMask
, roundBitsMask
;
6343 aExp
= extractFloat128Exp( a
);
6344 if ( 0x402F <= aExp
) {
6345 if ( 0x406F <= aExp
) {
6346 if ( ( aExp
== 0x7FFF )
6347 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6349 return propagateFloat128NaN(a
, a
, status
);
6354 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6355 roundBitsMask
= lastBitMask
- 1;
6357 switch (status
->float_rounding_mode
) {
6358 case float_round_nearest_even
:
6359 if ( lastBitMask
) {
6360 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6361 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6364 if ( (int64_t) z
.low
< 0 ) {
6366 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6370 case float_round_ties_away
:
6372 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6374 if ((int64_t) z
.low
< 0) {
6379 case float_round_to_zero
:
6381 case float_round_up
:
6382 if (!extractFloat128Sign(z
)) {
6383 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6386 case float_round_down
:
6387 if (extractFloat128Sign(z
)) {
6388 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6391 case float_round_to_odd
:
6393 * Note that if lastBitMask == 0, the last bit is the lsb
6394 * of high, and roundBitsMask == -1.
6396 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
6397 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6403 z
.low
&= ~ roundBitsMask
;
6406 if ( aExp
< 0x3FFF ) {
6407 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6408 status
->float_exception_flags
|= float_flag_inexact
;
6409 aSign
= extractFloat128Sign( a
);
6410 switch (status
->float_rounding_mode
) {
6411 case float_round_nearest_even
:
6412 if ( ( aExp
== 0x3FFE )
6413 && ( extractFloat128Frac0( a
)
6414 | extractFloat128Frac1( a
) )
6416 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6419 case float_round_ties_away
:
6420 if (aExp
== 0x3FFE) {
6421 return packFloat128(aSign
, 0x3FFF, 0, 0);
6424 case float_round_down
:
6426 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6427 : packFloat128( 0, 0, 0, 0 );
6428 case float_round_up
:
6430 aSign
? packFloat128( 1, 0, 0, 0 )
6431 : packFloat128( 0, 0x3FFF, 0, 0 );
6433 case float_round_to_odd
:
6434 return packFloat128(aSign
, 0x3FFF, 0, 0);
6436 case float_round_to_zero
:
6439 return packFloat128( aSign
, 0, 0, 0 );
6442 lastBitMask
<<= 0x402F - aExp
;
6443 roundBitsMask
= lastBitMask
- 1;
6446 switch (status
->float_rounding_mode
) {
6447 case float_round_nearest_even
:
6448 z
.high
+= lastBitMask
>>1;
6449 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6450 z
.high
&= ~ lastBitMask
;
6453 case float_round_ties_away
:
6454 z
.high
+= lastBitMask
>>1;
6456 case float_round_to_zero
:
6458 case float_round_up
:
6459 if (!extractFloat128Sign(z
)) {
6460 z
.high
|= ( a
.low
!= 0 );
6461 z
.high
+= roundBitsMask
;
6464 case float_round_down
:
6465 if (extractFloat128Sign(z
)) {
6466 z
.high
|= (a
.low
!= 0);
6467 z
.high
+= roundBitsMask
;
6470 case float_round_to_odd
:
6471 if ((z
.high
& lastBitMask
) == 0) {
6472 z
.high
|= (a
.low
!= 0);
6473 z
.high
+= roundBitsMask
;
6479 z
.high
&= ~ roundBitsMask
;
6481 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6482 status
->float_exception_flags
|= float_flag_inexact
;
6488 /*----------------------------------------------------------------------------
6489 | Returns the result of adding the absolute values of the quadruple-precision
6490 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6491 | before being returned. `zSign' is ignored if the result is a NaN.
6492 | The addition is performed according to the IEC/IEEE Standard for Binary
6493 | Floating-Point Arithmetic.
6494 *----------------------------------------------------------------------------*/
6496 static float128
addFloat128Sigs(float128 a
, float128 b
, bool zSign
,
6497 float_status
*status
)
6499 int32_t aExp
, bExp
, zExp
;
6500 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6503 aSig1
= extractFloat128Frac1( a
);
6504 aSig0
= extractFloat128Frac0( a
);
6505 aExp
= extractFloat128Exp( a
);
6506 bSig1
= extractFloat128Frac1( b
);
6507 bSig0
= extractFloat128Frac0( b
);
6508 bExp
= extractFloat128Exp( b
);
6509 expDiff
= aExp
- bExp
;
6510 if ( 0 < expDiff
) {
6511 if ( aExp
== 0x7FFF ) {
6512 if (aSig0
| aSig1
) {
6513 return propagateFloat128NaN(a
, b
, status
);
6521 bSig0
|= UINT64_C(0x0001000000000000);
6523 shift128ExtraRightJamming(
6524 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6527 else if ( expDiff
< 0 ) {
6528 if ( bExp
== 0x7FFF ) {
6529 if (bSig0
| bSig1
) {
6530 return propagateFloat128NaN(a
, b
, status
);
6532 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6538 aSig0
|= UINT64_C(0x0001000000000000);
6540 shift128ExtraRightJamming(
6541 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6545 if ( aExp
== 0x7FFF ) {
6546 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6547 return propagateFloat128NaN(a
, b
, status
);
6551 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6553 if (status
->flush_to_zero
) {
6554 if (zSig0
| zSig1
) {
6555 float_raise(float_flag_output_denormal
, status
);
6557 return packFloat128(zSign
, 0, 0, 0);
6559 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6562 zSig0
|= UINT64_C(0x0002000000000000);
6566 aSig0
|= UINT64_C(0x0001000000000000);
6567 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6569 if ( zSig0
< UINT64_C(0x0002000000000000) ) goto roundAndPack
;
6572 shift128ExtraRightJamming(
6573 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6575 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6579 /*----------------------------------------------------------------------------
6580 | Returns the result of subtracting the absolute values of the quadruple-
6581 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6582 | difference is negated before being returned. `zSign' is ignored if the
6583 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6584 | Standard for Binary Floating-Point Arithmetic.
6585 *----------------------------------------------------------------------------*/
6587 static float128
subFloat128Sigs(float128 a
, float128 b
, bool zSign
,
6588 float_status
*status
)
6590 int32_t aExp
, bExp
, zExp
;
6591 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6594 aSig1
= extractFloat128Frac1( a
);
6595 aSig0
= extractFloat128Frac0( a
);
6596 aExp
= extractFloat128Exp( a
);
6597 bSig1
= extractFloat128Frac1( b
);
6598 bSig0
= extractFloat128Frac0( b
);
6599 bExp
= extractFloat128Exp( b
);
6600 expDiff
= aExp
- bExp
;
6601 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6602 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6603 if ( 0 < expDiff
) goto aExpBigger
;
6604 if ( expDiff
< 0 ) goto bExpBigger
;
6605 if ( aExp
== 0x7FFF ) {
6606 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6607 return propagateFloat128NaN(a
, b
, status
);
6609 float_raise(float_flag_invalid
, status
);
6610 return float128_default_nan(status
);
6616 if ( bSig0
< aSig0
) goto aBigger
;
6617 if ( aSig0
< bSig0
) goto bBigger
;
6618 if ( bSig1
< aSig1
) goto aBigger
;
6619 if ( aSig1
< bSig1
) goto bBigger
;
6620 return packFloat128(status
->float_rounding_mode
== float_round_down
,
6623 if ( bExp
== 0x7FFF ) {
6624 if (bSig0
| bSig1
) {
6625 return propagateFloat128NaN(a
, b
, status
);
6627 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
6633 aSig0
|= UINT64_C(0x4000000000000000);
6635 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6636 bSig0
|= UINT64_C(0x4000000000000000);
6638 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6641 goto normalizeRoundAndPack
;
6643 if ( aExp
== 0x7FFF ) {
6644 if (aSig0
| aSig1
) {
6645 return propagateFloat128NaN(a
, b
, status
);
6653 bSig0
|= UINT64_C(0x4000000000000000);
6655 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
6656 aSig0
|= UINT64_C(0x4000000000000000);
6658 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6660 normalizeRoundAndPack
:
6662 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
6667 /*----------------------------------------------------------------------------
6668 | Returns the result of adding the quadruple-precision floating-point values
6669 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6670 | for Binary Floating-Point Arithmetic.
6671 *----------------------------------------------------------------------------*/
6673 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
6677 aSign
= extractFloat128Sign( a
);
6678 bSign
= extractFloat128Sign( b
);
6679 if ( aSign
== bSign
) {
6680 return addFloat128Sigs(a
, b
, aSign
, status
);
6683 return subFloat128Sigs(a
, b
, aSign
, status
);
6688 /*----------------------------------------------------------------------------
6689 | Returns the result of subtracting the quadruple-precision floating-point
6690 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6691 | Standard for Binary Floating-Point Arithmetic.
6692 *----------------------------------------------------------------------------*/
6694 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
6698 aSign
= extractFloat128Sign( a
);
6699 bSign
= extractFloat128Sign( b
);
6700 if ( aSign
== bSign
) {
6701 return subFloat128Sigs(a
, b
, aSign
, status
);
6704 return addFloat128Sigs(a
, b
, aSign
, status
);
6709 /*----------------------------------------------------------------------------
6710 | Returns the result of multiplying the quadruple-precision floating-point
6711 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6712 | Standard for Binary Floating-Point Arithmetic.
6713 *----------------------------------------------------------------------------*/
6715 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
6717 bool aSign
, bSign
, zSign
;
6718 int32_t aExp
, bExp
, zExp
;
6719 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
6721 aSig1
= extractFloat128Frac1( a
);
6722 aSig0
= extractFloat128Frac0( a
);
6723 aExp
= extractFloat128Exp( a
);
6724 aSign
= extractFloat128Sign( a
);
6725 bSig1
= extractFloat128Frac1( b
);
6726 bSig0
= extractFloat128Frac0( b
);
6727 bExp
= extractFloat128Exp( b
);
6728 bSign
= extractFloat128Sign( b
);
6729 zSign
= aSign
^ bSign
;
6730 if ( aExp
== 0x7FFF ) {
6731 if ( ( aSig0
| aSig1
)
6732 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6733 return propagateFloat128NaN(a
, b
, status
);
6735 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
6736 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6738 if ( bExp
== 0x7FFF ) {
6739 if (bSig0
| bSig1
) {
6740 return propagateFloat128NaN(a
, b
, status
);
6742 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6744 float_raise(float_flag_invalid
, status
);
6745 return float128_default_nan(status
);
6747 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6750 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6751 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6754 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6755 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6757 zExp
= aExp
+ bExp
- 0x4000;
6758 aSig0
|= UINT64_C(0x0001000000000000);
6759 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
6760 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
6761 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6762 zSig2
|= ( zSig3
!= 0 );
6763 if (UINT64_C( 0x0002000000000000) <= zSig0
) {
6764 shift128ExtraRightJamming(
6765 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6768 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6772 /*----------------------------------------------------------------------------
6773 | Returns the result of dividing the quadruple-precision floating-point value
6774 | `a' by the corresponding value `b'. The operation is performed according to
6775 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6776 *----------------------------------------------------------------------------*/
6778 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
6780 bool aSign
, bSign
, zSign
;
6781 int32_t aExp
, bExp
, zExp
;
6782 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6783 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6785 aSig1
= extractFloat128Frac1( a
);
6786 aSig0
= extractFloat128Frac0( a
);
6787 aExp
= extractFloat128Exp( a
);
6788 aSign
= extractFloat128Sign( a
);
6789 bSig1
= extractFloat128Frac1( b
);
6790 bSig0
= extractFloat128Frac0( b
);
6791 bExp
= extractFloat128Exp( b
);
6792 bSign
= extractFloat128Sign( b
);
6793 zSign
= aSign
^ bSign
;
6794 if ( aExp
== 0x7FFF ) {
6795 if (aSig0
| aSig1
) {
6796 return propagateFloat128NaN(a
, b
, status
);
6798 if ( bExp
== 0x7FFF ) {
6799 if (bSig0
| bSig1
) {
6800 return propagateFloat128NaN(a
, b
, status
);
6804 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6806 if ( bExp
== 0x7FFF ) {
6807 if (bSig0
| bSig1
) {
6808 return propagateFloat128NaN(a
, b
, status
);
6810 return packFloat128( zSign
, 0, 0, 0 );
6813 if ( ( bSig0
| bSig1
) == 0 ) {
6814 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6816 float_raise(float_flag_invalid
, status
);
6817 return float128_default_nan(status
);
6819 float_raise(float_flag_divbyzero
, status
);
6820 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6822 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6825 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6826 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6828 zExp
= aExp
- bExp
+ 0x3FFD;
6830 aSig0
| UINT64_C(0x0001000000000000), aSig1
, 15, &aSig0
, &aSig1
);
6832 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
6833 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
6834 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
6837 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6838 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
6839 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
6840 while ( (int64_t) rem0
< 0 ) {
6842 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
6844 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
6845 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
6846 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
6847 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
6848 while ( (int64_t) rem1
< 0 ) {
6850 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
6852 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6854 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
6855 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6859 /*----------------------------------------------------------------------------
6860 | Returns the remainder of the quadruple-precision floating-point value `a'
6861 | with respect to the corresponding value `b'. The operation is performed
6862 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6863 *----------------------------------------------------------------------------*/
6865 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
6868 int32_t aExp
, bExp
, expDiff
;
6869 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6870 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6873 aSig1
= extractFloat128Frac1( a
);
6874 aSig0
= extractFloat128Frac0( a
);
6875 aExp
= extractFloat128Exp( a
);
6876 aSign
= extractFloat128Sign( a
);
6877 bSig1
= extractFloat128Frac1( b
);
6878 bSig0
= extractFloat128Frac0( b
);
6879 bExp
= extractFloat128Exp( b
);
6880 if ( aExp
== 0x7FFF ) {
6881 if ( ( aSig0
| aSig1
)
6882 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6883 return propagateFloat128NaN(a
, b
, status
);
6887 if ( bExp
== 0x7FFF ) {
6888 if (bSig0
| bSig1
) {
6889 return propagateFloat128NaN(a
, b
, status
);
6894 if ( ( bSig0
| bSig1
) == 0 ) {
6896 float_raise(float_flag_invalid
, status
);
6897 return float128_default_nan(status
);
6899 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6902 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6903 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6905 expDiff
= aExp
- bExp
;
6906 if ( expDiff
< -1 ) return a
;
6908 aSig0
| UINT64_C(0x0001000000000000),
6910 15 - ( expDiff
< 0 ),
6915 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
6916 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6917 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6919 while ( 0 < expDiff
) {
6920 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6921 q
= ( 4 < q
) ? q
- 4 : 0;
6922 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6923 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6924 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6925 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6928 if ( -64 < expDiff
) {
6929 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6930 q
= ( 4 < q
) ? q
- 4 : 0;
6932 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6934 if ( expDiff
< 0 ) {
6935 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6938 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6940 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6941 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6944 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6945 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6948 alternateASig0
= aSig0
;
6949 alternateASig1
= aSig1
;
6951 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6952 } while ( 0 <= (int64_t) aSig0
);
6954 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6955 if ( ( sigMean0
< 0 )
6956 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6957 aSig0
= alternateASig0
;
6958 aSig1
= alternateASig1
;
6960 zSign
= ( (int64_t) aSig0
< 0 );
6961 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6962 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
6966 /*----------------------------------------------------------------------------
6967 | Returns the square root of the quadruple-precision floating-point value `a'.
6968 | The operation is performed according to the IEC/IEEE Standard for Binary
6969 | Floating-Point Arithmetic.
6970 *----------------------------------------------------------------------------*/
6972 float128
float128_sqrt(float128 a
, float_status
*status
)
6976 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6977 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6979 aSig1
= extractFloat128Frac1( a
);
6980 aSig0
= extractFloat128Frac0( a
);
6981 aExp
= extractFloat128Exp( a
);
6982 aSign
= extractFloat128Sign( a
);
6983 if ( aExp
== 0x7FFF ) {
6984 if (aSig0
| aSig1
) {
6985 return propagateFloat128NaN(a
, a
, status
);
6987 if ( ! aSign
) return a
;
6991 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6993 float_raise(float_flag_invalid
, status
);
6994 return float128_default_nan(status
);
6997 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6998 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7000 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7001 aSig0
|= UINT64_C(0x0001000000000000);
7002 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7003 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7004 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7005 doubleZSig0
= zSig0
<<1;
7006 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7007 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7008 while ( (int64_t) rem0
< 0 ) {
7011 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7013 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7014 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7015 if ( zSig1
== 0 ) zSig1
= 1;
7016 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7017 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7018 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7019 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7020 while ( (int64_t) rem1
< 0 ) {
7022 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7024 term2
|= doubleZSig0
;
7025 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7027 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7029 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7030 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7034 static inline FloatRelation
7035 floatx80_compare_internal(floatx80 a
, floatx80 b
, bool is_quiet
,
7036 float_status
*status
)
7040 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7041 float_raise(float_flag_invalid
, status
);
7042 return float_relation_unordered
;
7044 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7045 ( extractFloatx80Frac( a
)<<1 ) ) ||
7046 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7047 ( extractFloatx80Frac( b
)<<1 ) )) {
7049 floatx80_is_signaling_nan(a
, status
) ||
7050 floatx80_is_signaling_nan(b
, status
)) {
7051 float_raise(float_flag_invalid
, status
);
7053 return float_relation_unordered
;
7055 aSign
= extractFloatx80Sign( a
);
7056 bSign
= extractFloatx80Sign( b
);
7057 if ( aSign
!= bSign
) {
7059 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7060 ( ( a
.low
| b
.low
) == 0 ) ) {
7062 return float_relation_equal
;
7064 return 1 - (2 * aSign
);
7067 /* Normalize pseudo-denormals before comparison. */
7068 if ((a
.high
& 0x7fff) == 0 && a
.low
& UINT64_C(0x8000000000000000)) {
7071 if ((b
.high
& 0x7fff) == 0 && b
.low
& UINT64_C(0x8000000000000000)) {
7074 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7075 return float_relation_equal
;
7077 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7082 FloatRelation
floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7084 return floatx80_compare_internal(a
, b
, 0, status
);
7087 FloatRelation
floatx80_compare_quiet(floatx80 a
, floatx80 b
,
7088 float_status
*status
)
7090 return floatx80_compare_internal(a
, b
, 1, status
);
7093 static inline FloatRelation
7094 float128_compare_internal(float128 a
, float128 b
, bool is_quiet
,
7095 float_status
*status
)
7099 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7100 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7101 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7102 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7104 float128_is_signaling_nan(a
, status
) ||
7105 float128_is_signaling_nan(b
, status
)) {
7106 float_raise(float_flag_invalid
, status
);
7108 return float_relation_unordered
;
7110 aSign
= extractFloat128Sign( a
);
7111 bSign
= extractFloat128Sign( b
);
7112 if ( aSign
!= bSign
) {
7113 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7115 return float_relation_equal
;
7117 return 1 - (2 * aSign
);
7120 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7121 return float_relation_equal
;
7123 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7128 FloatRelation
float128_compare(float128 a
, float128 b
, float_status
*status
)
7130 return float128_compare_internal(a
, b
, 0, status
);
7133 FloatRelation
float128_compare_quiet(float128 a
, float128 b
,
7134 float_status
*status
)
7136 return float128_compare_internal(a
, b
, 1, status
);
7139 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7145 if (floatx80_invalid_encoding(a
)) {
7146 float_raise(float_flag_invalid
, status
);
7147 return floatx80_default_nan(status
);
7149 aSig
= extractFloatx80Frac( a
);
7150 aExp
= extractFloatx80Exp( a
);
7151 aSign
= extractFloatx80Sign( a
);
7153 if ( aExp
== 0x7FFF ) {
7155 return propagateFloatx80NaN(a
, a
, status
);
7169 } else if (n
< -0x10000) {
7174 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7175 aSign
, aExp
, aSig
, 0, status
);
7178 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7182 uint64_t aSig0
, aSig1
;
7184 aSig1
= extractFloat128Frac1( a
);
7185 aSig0
= extractFloat128Frac0( a
);
7186 aExp
= extractFloat128Exp( a
);
7187 aSign
= extractFloat128Sign( a
);
7188 if ( aExp
== 0x7FFF ) {
7189 if ( aSig0
| aSig1
) {
7190 return propagateFloat128NaN(a
, a
, status
);
7195 aSig0
|= UINT64_C(0x0001000000000000);
7196 } else if (aSig0
== 0 && aSig1
== 0) {
7204 } else if (n
< -0x10000) {
7209 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
7214 static void __attribute__((constructor
)) softfloat_init(void)
7216 union_float64 ua
, ub
, uc
, ur
;
7218 if (QEMU_NO_HARDFLOAT
) {
7222 * Test that the host's FMA is not obviously broken. For example,
7223 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7224 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7226 ua
.s
= 0x0020000000000001ULL
;
7227 ub
.s
= 0x3ca0000000000000ULL
;
7228 uc
.s
= 0x0020000000000000ULL
;
7229 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
7230 if (ur
.s
!= 0x0020000000000001ULL
) {
7231 force_soft_fma
= true;