4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
90 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 s->float_exception_flags |= float_flag_input_denormal; \
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck
, float32
)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck
, float64
)
141 #undef GEN_INPUT_FLUSH__NOCHECK
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
146 if (likely(!s->flush_inputs_to_zero)) { \
149 soft_t ## _input_flush__nocheck(a, s); \
152 GEN_INPUT_FLUSH1(float32_input_flush1
, float32
)
153 GEN_INPUT_FLUSH1(float64_input_flush1
, float64
)
154 #undef GEN_INPUT_FLUSH1
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
159 if (likely(!s->flush_inputs_to_zero)) { \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
166 GEN_INPUT_FLUSH2(float32_input_flush2
, float32
)
167 GEN_INPUT_FLUSH2(float64_input_flush2
, float64
)
168 #undef GEN_INPUT_FLUSH2
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
173 if (likely(!s->flush_inputs_to_zero)) { \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
181 GEN_INPUT_FLUSH3(float32_input_flush3
, float32
)
182 GEN_INPUT_FLUSH3(float64_input_flush3
, float64
)
183 #undef GEN_INPUT_FLUSH3
186 * Choose whether to use fpclassify or float32/64_* primitives in the generated
187 * hardfloat functions. Each combination of number of inputs and float size
188 * gets its own value.
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208 * float{32,64}_is_infinity when !USE_FP.
209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF 1
215 # define QEMU_HARDFLOAT_USE_ISINF 0
219 * Some targets clear the FP flags before most FP operations. This prevents
220 * the use of hardfloat, since hardfloat relies on the inexact flag being
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
235 static inline bool can_use_fpu(const float_status
*s
)
237 if (QEMU_NO_HARDFLOAT
) {
240 return likely(s
->float_exception_flags
& float_flag_inexact
&&
241 s
->float_rounding_mode
== float_round_nearest_even
);
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
266 typedef bool (*f32_check_fn
)(union_float32 a
, union_float32 b
);
267 typedef bool (*f64_check_fn
)(union_float64 a
, union_float64 b
);
269 typedef float32 (*soft_f32_op2_fn
)(float32 a
, float32 b
, float_status
*s
);
270 typedef float64 (*soft_f64_op2_fn
)(float64 a
, float64 b
, float_status
*s
);
271 typedef float (*hard_f32_op2_fn
)(float a
, float b
);
272 typedef double (*hard_f64_op2_fn
)(double a
, double b
);
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a
, union_float32 b
)
277 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
282 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
283 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
285 return float32_is_zero_or_normal(a
.s
) &&
286 float32_is_zero_or_normal(b
.s
);
289 static inline bool f64_is_zon2(union_float64 a
, union_float64 b
)
291 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
292 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
293 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
);
295 return float64_is_zero_or_normal(a
.s
) &&
296 float64_is_zero_or_normal(b
.s
);
299 /* 3-input is-zero-or-normal */
301 bool f32_is_zon3(union_float32 a
, union_float32 b
, union_float32 c
)
303 if (QEMU_HARDFLOAT_3F32_USE_FP
) {
304 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
305 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
306 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
308 return float32_is_zero_or_normal(a
.s
) &&
309 float32_is_zero_or_normal(b
.s
) &&
310 float32_is_zero_or_normal(c
.s
);
314 bool f64_is_zon3(union_float64 a
, union_float64 b
, union_float64 c
)
316 if (QEMU_HARDFLOAT_3F64_USE_FP
) {
317 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
318 (fpclassify(b
.h
) == FP_NORMAL
|| fpclassify(b
.h
) == FP_ZERO
) &&
319 (fpclassify(c
.h
) == FP_NORMAL
|| fpclassify(c
.h
) == FP_ZERO
);
321 return float64_is_zero_or_normal(a
.s
) &&
322 float64_is_zero_or_normal(b
.s
) &&
323 float64_is_zero_or_normal(c
.s
);
326 static inline bool f32_is_inf(union_float32 a
)
328 if (QEMU_HARDFLOAT_USE_ISINF
) {
331 return float32_is_infinity(a
.s
);
334 static inline bool f64_is_inf(union_float64 a
)
336 if (QEMU_HARDFLOAT_USE_ISINF
) {
339 return float64_is_infinity(a
.s
);
342 static inline float32
343 float32_gen2(float32 xa
, float32 xb
, float_status
*s
,
344 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
,
345 f32_check_fn pre
, f32_check_fn post
)
347 union_float32 ua
, ub
, ur
;
352 if (unlikely(!can_use_fpu(s
))) {
356 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
357 if (unlikely(!pre(ua
, ub
))) {
361 ur
.h
= hard(ua
.h
, ub
.h
);
362 if (unlikely(f32_is_inf(ur
))) {
363 s
->float_exception_flags
|= float_flag_overflow
;
364 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
) && post(ua
, ub
)) {
370 return soft(ua
.s
, ub
.s
, s
);
373 static inline float64
374 float64_gen2(float64 xa
, float64 xb
, float_status
*s
,
375 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
,
376 f64_check_fn pre
, f64_check_fn post
)
378 union_float64 ua
, ub
, ur
;
383 if (unlikely(!can_use_fpu(s
))) {
387 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
388 if (unlikely(!pre(ua
, ub
))) {
392 ur
.h
= hard(ua
.h
, ub
.h
);
393 if (unlikely(f64_is_inf(ur
))) {
394 s
->float_exception_flags
|= float_flag_overflow
;
395 } else if (unlikely(fabs(ur
.h
) <= DBL_MIN
) && post(ua
, ub
)) {
401 return soft(ua
.s
, ub
.s
, s
);
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
408 static inline uint32_t extractFloat32Frac(float32 a
)
410 return float32_val(a
) & 0x007FFFFF;
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
417 static inline int extractFloat32Exp(float32 a
)
419 return (float32_val(a
) >> 23) & 0xFF;
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
426 static inline bool extractFloat32Sign(float32 a
)
428 return float32_val(a
) >> 31;
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
435 static inline uint64_t extractFloat64Frac(float64 a
)
437 return float64_val(a
) & UINT64_C(0x000FFFFFFFFFFFFF);
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
444 static inline int extractFloat64Exp(float64 a
)
446 return (float64_val(a
) >> 52) & 0x7FF;
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
453 static inline bool extractFloat64Sign(float64 a
)
455 return float64_val(a
) >> 63;
459 * Classify a floating point number. Everything above float_class_qnan
460 * is a NaN so cls >= float_class_qnan is any NaN.
463 typedef enum __attribute__ ((__packed__
)) {
464 float_class_unclassified
,
468 float_class_qnan
, /* all NaNs from here */
472 /* Simple helpers for checking if, or what kind of, NaN we have */
473 static inline __attribute__((unused
)) bool is_nan(FloatClass c
)
475 return unlikely(c
>= float_class_qnan
);
478 static inline __attribute__((unused
)) bool is_snan(FloatClass c
)
480 return c
== float_class_snan
;
483 static inline __attribute__((unused
)) bool is_qnan(FloatClass c
)
485 return c
== float_class_qnan
;
489 * Structure holding all of the decomposed parts of a float. The
490 * exponent is unbiased and the fraction is normalized. All
491 * calculations are done with a 64 bit fraction and then rounded as
492 * appropriate for the final format.
494 * Thanks to the packed FloatClass a decent compiler should be able to
495 * fit the whole structure into registers and avoid using the stack
496 * for parameter passing.
506 #define DECOMPOSED_BINARY_POINT (64 - 2)
507 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
508 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
510 /* Structure holding all of the relevant parameters for a format.
511 * exp_size: the size of the exponent field
512 * exp_bias: the offset applied to the exponent field
513 * exp_max: the maximum normalised exponent
514 * frac_size: the size of the fraction field
515 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
516 * The following are computed based the size of fraction
517 * frac_lsb: least significant bit of fraction
518 * frac_lsbm1: the bit below the least significant bit (for rounding)
519 * round_mask/roundeven_mask: masks used for rounding
520 * The following optional modifiers are available:
521 * arm_althp: handle ARM Alternative Half Precision
532 uint64_t roundeven_mask
;
536 /* Expand fields based on the size of exponent and fraction */
537 #define FLOAT_PARAMS(E, F) \
539 .exp_bias = ((1 << E) - 1) >> 1, \
540 .exp_max = (1 << E) - 1, \
542 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
543 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
544 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
545 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
546 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
548 static const FloatFmt float16_params
= {
552 static const FloatFmt float16_params_ahp
= {
557 static const FloatFmt bfloat16_params
= {
561 static const FloatFmt float32_params
= {
565 static const FloatFmt float64_params
= {
569 /* Unpack a float to parts, but do not canonicalize. */
570 static inline FloatParts
unpack_raw(FloatFmt fmt
, uint64_t raw
)
572 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
574 return (FloatParts
) {
575 .cls
= float_class_unclassified
,
576 .sign
= extract64(raw
, sign_pos
, 1),
577 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
578 .frac
= extract64(raw
, 0, fmt
.frac_size
),
582 static inline FloatParts
float16_unpack_raw(float16 f
)
584 return unpack_raw(float16_params
, f
);
587 static inline FloatParts
bfloat16_unpack_raw(bfloat16 f
)
589 return unpack_raw(bfloat16_params
, f
);
592 static inline FloatParts
float32_unpack_raw(float32 f
)
594 return unpack_raw(float32_params
, f
);
597 static inline FloatParts
float64_unpack_raw(float64 f
)
599 return unpack_raw(float64_params
, f
);
602 /* Pack a float from parts, but do not canonicalize. */
603 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts p
)
605 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
606 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
607 return deposit64(ret
, sign_pos
, 1, p
.sign
);
610 static inline float16
float16_pack_raw(FloatParts p
)
612 return make_float16(pack_raw(float16_params
, p
));
615 static inline bfloat16
bfloat16_pack_raw(FloatParts p
)
617 return pack_raw(bfloat16_params
, p
);
620 static inline float32
float32_pack_raw(FloatParts p
)
622 return make_float32(pack_raw(float32_params
, p
));
625 static inline float64
float64_pack_raw(FloatParts p
)
627 return make_float64(pack_raw(float64_params
, p
));
630 /*----------------------------------------------------------------------------
631 | Functions and definitions to determine: (1) whether tininess for underflow
632 | is detected before or after rounding by default, (2) what (if anything)
633 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
634 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
635 | are propagated from function inputs to output. These details are target-
637 *----------------------------------------------------------------------------*/
638 #include "softfloat-specialize.c.inc"
640 /* Canonicalize EXP and FRAC, setting CLS. */
641 static FloatParts
sf_canonicalize(FloatParts part
, const FloatFmt
*parm
,
642 float_status
*status
)
644 if (part
.exp
== parm
->exp_max
&& !parm
->arm_althp
) {
645 if (part
.frac
== 0) {
646 part
.cls
= float_class_inf
;
648 part
.frac
<<= parm
->frac_shift
;
649 part
.cls
= (parts_is_snan_frac(part
.frac
, status
)
650 ? float_class_snan
: float_class_qnan
);
652 } else if (part
.exp
== 0) {
653 if (likely(part
.frac
== 0)) {
654 part
.cls
= float_class_zero
;
655 } else if (status
->flush_inputs_to_zero
) {
656 float_raise(float_flag_input_denormal
, status
);
657 part
.cls
= float_class_zero
;
660 int shift
= clz64(part
.frac
) - 1;
661 part
.cls
= float_class_normal
;
662 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
666 part
.cls
= float_class_normal
;
667 part
.exp
-= parm
->exp_bias
;
668 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
673 /* Round and uncanonicalize a floating-point number by parts. There
674 * are FRAC_SHIFT bits that may require rounding at the bottom of the
675 * fraction; these bits will be removed. The exponent will be biased
676 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
679 static FloatParts
round_canonical(FloatParts p
, float_status
*s
,
680 const FloatFmt
*parm
)
682 const uint64_t frac_lsb
= parm
->frac_lsb
;
683 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
684 const uint64_t round_mask
= parm
->round_mask
;
685 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
686 const int exp_max
= parm
->exp_max
;
687 const int frac_shift
= parm
->frac_shift
;
696 case float_class_normal
:
697 switch (s
->float_rounding_mode
) {
698 case float_round_nearest_even
:
699 overflow_norm
= false;
700 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
702 case float_round_ties_away
:
703 overflow_norm
= false;
706 case float_round_to_zero
:
707 overflow_norm
= true;
711 inc
= p
.sign
? 0 : round_mask
;
712 overflow_norm
= p
.sign
;
714 case float_round_down
:
715 inc
= p
.sign
? round_mask
: 0;
716 overflow_norm
= !p
.sign
;
718 case float_round_to_odd
:
719 overflow_norm
= true;
720 inc
= frac
& frac_lsb
? 0 : round_mask
;
723 g_assert_not_reached();
726 exp
+= parm
->exp_bias
;
727 if (likely(exp
> 0)) {
728 if (frac
& round_mask
) {
729 flags
|= float_flag_inexact
;
731 if (frac
& DECOMPOSED_OVERFLOW_BIT
) {
738 if (parm
->arm_althp
) {
739 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
740 if (unlikely(exp
> exp_max
)) {
741 /* Overflow. Return the maximum normal. */
742 flags
= float_flag_invalid
;
746 } else if (unlikely(exp
>= exp_max
)) {
747 flags
|= float_flag_overflow
| float_flag_inexact
;
752 p
.cls
= float_class_inf
;
756 } else if (s
->flush_to_zero
) {
757 flags
|= float_flag_output_denormal
;
758 p
.cls
= float_class_zero
;
761 bool is_tiny
= s
->tininess_before_rounding
763 || !((frac
+ inc
) & DECOMPOSED_OVERFLOW_BIT
);
765 shift64RightJamming(frac
, 1 - exp
, &frac
);
766 if (frac
& round_mask
) {
767 /* Need to recompute round-to-even. */
768 switch (s
->float_rounding_mode
) {
769 case float_round_nearest_even
:
770 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
773 case float_round_to_odd
:
774 inc
= frac
& frac_lsb
? 0 : round_mask
;
779 flags
|= float_flag_inexact
;
783 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
786 if (is_tiny
&& (flags
& float_flag_inexact
)) {
787 flags
|= float_flag_underflow
;
789 if (exp
== 0 && frac
== 0) {
790 p
.cls
= float_class_zero
;
795 case float_class_zero
:
801 case float_class_inf
:
803 assert(!parm
->arm_althp
);
808 case float_class_qnan
:
809 case float_class_snan
:
810 assert(!parm
->arm_althp
);
812 frac
>>= parm
->frac_shift
;
816 g_assert_not_reached();
819 float_raise(flags
, s
);
825 /* Explicit FloatFmt version */
826 static FloatParts
float16a_unpack_canonical(float16 f
, float_status
*s
,
827 const FloatFmt
*params
)
829 return sf_canonicalize(float16_unpack_raw(f
), params
, s
);
832 static FloatParts
float16_unpack_canonical(float16 f
, float_status
*s
)
834 return float16a_unpack_canonical(f
, s
, &float16_params
);
837 static FloatParts
bfloat16_unpack_canonical(bfloat16 f
, float_status
*s
)
839 return sf_canonicalize(bfloat16_unpack_raw(f
), &bfloat16_params
, s
);
842 static float16
float16a_round_pack_canonical(FloatParts p
, float_status
*s
,
843 const FloatFmt
*params
)
845 return float16_pack_raw(round_canonical(p
, s
, params
));
848 static float16
float16_round_pack_canonical(FloatParts p
, float_status
*s
)
850 return float16a_round_pack_canonical(p
, s
, &float16_params
);
853 static bfloat16
bfloat16_round_pack_canonical(FloatParts p
, float_status
*s
)
855 return bfloat16_pack_raw(round_canonical(p
, s
, &bfloat16_params
));
858 static FloatParts
float32_unpack_canonical(float32 f
, float_status
*s
)
860 return sf_canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
863 static float32
float32_round_pack_canonical(FloatParts p
, float_status
*s
)
865 return float32_pack_raw(round_canonical(p
, s
, &float32_params
));
868 static FloatParts
float64_unpack_canonical(float64 f
, float_status
*s
)
870 return sf_canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
873 static float64
float64_round_pack_canonical(FloatParts p
, float_status
*s
)
875 return float64_pack_raw(round_canonical(p
, s
, &float64_params
));
878 static FloatParts
return_nan(FloatParts a
, float_status
*s
)
881 case float_class_snan
:
882 s
->float_exception_flags
|= float_flag_invalid
;
883 a
= parts_silence_nan(a
, s
);
885 case float_class_qnan
:
886 if (s
->default_nan_mode
) {
887 return parts_default_nan(s
);
892 g_assert_not_reached();
897 static FloatParts
pick_nan(FloatParts a
, FloatParts b
, float_status
*s
)
899 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
900 s
->float_exception_flags
|= float_flag_invalid
;
903 if (s
->default_nan_mode
) {
904 return parts_default_nan(s
);
906 if (pickNaN(a
.cls
, b
.cls
,
908 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
), s
)) {
911 if (is_snan(a
.cls
)) {
912 return parts_silence_nan(a
, s
);
918 static FloatParts
pick_nan_muladd(FloatParts a
, FloatParts b
, FloatParts c
,
919 bool inf_zero
, float_status
*s
)
923 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
924 s
->float_exception_flags
|= float_flag_invalid
;
927 which
= pickNaNMulAdd(a
.cls
, b
.cls
, c
.cls
, inf_zero
, s
);
929 if (s
->default_nan_mode
) {
930 /* Note that this check is after pickNaNMulAdd so that function
931 * has an opportunity to set the Invalid flag.
946 return parts_default_nan(s
);
948 g_assert_not_reached();
951 if (is_snan(a
.cls
)) {
952 return parts_silence_nan(a
, s
);
958 * Returns the result of adding or subtracting the values of the
959 * floating-point values `a' and `b'. The operation is performed
960 * according to the IEC/IEEE Standard for Binary Floating-Point
964 static FloatParts
addsub_floats(FloatParts a
, FloatParts b
, bool subtract
,
967 bool a_sign
= a
.sign
;
968 bool b_sign
= b
.sign
^ subtract
;
970 if (a_sign
!= b_sign
) {
973 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
974 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
975 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
976 a
.frac
= a
.frac
- b
.frac
;
978 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
979 a
.frac
= b
.frac
- a
.frac
;
985 a
.cls
= float_class_zero
;
986 a
.sign
= s
->float_rounding_mode
== float_round_down
;
988 int shift
= clz64(a
.frac
) - 1;
989 a
.frac
= a
.frac
<< shift
;
990 a
.exp
= a
.exp
- shift
;
995 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
996 return pick_nan(a
, b
, s
);
998 if (a
.cls
== float_class_inf
) {
999 if (b
.cls
== float_class_inf
) {
1000 float_raise(float_flag_invalid
, s
);
1001 return parts_default_nan(s
);
1005 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
1006 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1009 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
1010 b
.sign
= a_sign
^ 1;
1013 if (b
.cls
== float_class_zero
) {
1018 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1019 if (a
.exp
> b
.exp
) {
1020 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
1021 } else if (a
.exp
< b
.exp
) {
1022 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
1026 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
1027 shift64RightJamming(a
.frac
, 1, &a
.frac
);
1032 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1033 return pick_nan(a
, b
, s
);
1035 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1038 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1043 g_assert_not_reached();
1047 * Returns the result of adding or subtracting the floating-point
1048 * values `a' and `b'. The operation is performed according to the
1049 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1052 float16 QEMU_FLATTEN
float16_add(float16 a
, float16 b
, float_status
*status
)
1054 FloatParts pa
= float16_unpack_canonical(a
, status
);
1055 FloatParts pb
= float16_unpack_canonical(b
, status
);
1056 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
1058 return float16_round_pack_canonical(pr
, status
);
1061 float16 QEMU_FLATTEN
float16_sub(float16 a
, float16 b
, float_status
*status
)
1063 FloatParts pa
= float16_unpack_canonical(a
, status
);
1064 FloatParts pb
= float16_unpack_canonical(b
, status
);
1065 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
1067 return float16_round_pack_canonical(pr
, status
);
1070 static float32 QEMU_SOFTFLOAT_ATTR
1071 soft_f32_addsub(float32 a
, float32 b
, bool subtract
, float_status
*status
)
1073 FloatParts pa
= float32_unpack_canonical(a
, status
);
1074 FloatParts pb
= float32_unpack_canonical(b
, status
);
1075 FloatParts pr
= addsub_floats(pa
, pb
, subtract
, status
);
1077 return float32_round_pack_canonical(pr
, status
);
1080 static inline float32
soft_f32_add(float32 a
, float32 b
, float_status
*status
)
1082 return soft_f32_addsub(a
, b
, false, status
);
1085 static inline float32
soft_f32_sub(float32 a
, float32 b
, float_status
*status
)
1087 return soft_f32_addsub(a
, b
, true, status
);
1090 static float64 QEMU_SOFTFLOAT_ATTR
1091 soft_f64_addsub(float64 a
, float64 b
, bool subtract
, float_status
*status
)
1093 FloatParts pa
= float64_unpack_canonical(a
, status
);
1094 FloatParts pb
= float64_unpack_canonical(b
, status
);
1095 FloatParts pr
= addsub_floats(pa
, pb
, subtract
, status
);
1097 return float64_round_pack_canonical(pr
, status
);
1100 static inline float64
soft_f64_add(float64 a
, float64 b
, float_status
*status
)
1102 return soft_f64_addsub(a
, b
, false, status
);
1105 static inline float64
soft_f64_sub(float64 a
, float64 b
, float_status
*status
)
1107 return soft_f64_addsub(a
, b
, true, status
);
1110 static float hard_f32_add(float a
, float b
)
1115 static float hard_f32_sub(float a
, float b
)
1120 static double hard_f64_add(double a
, double b
)
1125 static double hard_f64_sub(double a
, double b
)
1130 static bool f32_addsubmul_post(union_float32 a
, union_float32 b
)
1132 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1133 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1135 return !(float32_is_zero(a
.s
) && float32_is_zero(b
.s
));
1138 static bool f64_addsubmul_post(union_float64 a
, union_float64 b
)
1140 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1141 return !(fpclassify(a
.h
) == FP_ZERO
&& fpclassify(b
.h
) == FP_ZERO
);
1143 return !(float64_is_zero(a
.s
) && float64_is_zero(b
.s
));
1147 static float32
float32_addsub(float32 a
, float32 b
, float_status
*s
,
1148 hard_f32_op2_fn hard
, soft_f32_op2_fn soft
)
1150 return float32_gen2(a
, b
, s
, hard
, soft
,
1151 f32_is_zon2
, f32_addsubmul_post
);
1154 static float64
float64_addsub(float64 a
, float64 b
, float_status
*s
,
1155 hard_f64_op2_fn hard
, soft_f64_op2_fn soft
)
1157 return float64_gen2(a
, b
, s
, hard
, soft
,
1158 f64_is_zon2
, f64_addsubmul_post
);
1161 float32 QEMU_FLATTEN
1162 float32_add(float32 a
, float32 b
, float_status
*s
)
1164 return float32_addsub(a
, b
, s
, hard_f32_add
, soft_f32_add
);
1167 float32 QEMU_FLATTEN
1168 float32_sub(float32 a
, float32 b
, float_status
*s
)
1170 return float32_addsub(a
, b
, s
, hard_f32_sub
, soft_f32_sub
);
1173 float64 QEMU_FLATTEN
1174 float64_add(float64 a
, float64 b
, float_status
*s
)
1176 return float64_addsub(a
, b
, s
, hard_f64_add
, soft_f64_add
);
1179 float64 QEMU_FLATTEN
1180 float64_sub(float64 a
, float64 b
, float_status
*s
)
1182 return float64_addsub(a
, b
, s
, hard_f64_sub
, soft_f64_sub
);
1186 * Returns the result of adding or subtracting the bfloat16
1187 * values `a' and `b'.
1189 bfloat16 QEMU_FLATTEN
bfloat16_add(bfloat16 a
, bfloat16 b
, float_status
*status
)
1191 FloatParts pa
= bfloat16_unpack_canonical(a
, status
);
1192 FloatParts pb
= bfloat16_unpack_canonical(b
, status
);
1193 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
1195 return bfloat16_round_pack_canonical(pr
, status
);
1198 bfloat16 QEMU_FLATTEN
bfloat16_sub(bfloat16 a
, bfloat16 b
, float_status
*status
)
1200 FloatParts pa
= bfloat16_unpack_canonical(a
, status
);
1201 FloatParts pb
= bfloat16_unpack_canonical(b
, status
);
1202 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
1204 return bfloat16_round_pack_canonical(pr
, status
);
1208 * Returns the result of multiplying the floating-point values `a' and
1209 * `b'. The operation is performed according to the IEC/IEEE Standard
1210 * for Binary Floating-Point Arithmetic.
1213 static FloatParts
mul_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1215 bool sign
= a
.sign
^ b
.sign
;
1217 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1219 int exp
= a
.exp
+ b
.exp
;
1221 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1222 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1223 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1224 shift64RightJamming(lo
, 1, &lo
);
1234 /* handle all the NaN cases */
1235 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1236 return pick_nan(a
, b
, s
);
1238 /* Inf * Zero == NaN */
1239 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
1240 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
1241 s
->float_exception_flags
|= float_flag_invalid
;
1242 return parts_default_nan(s
);
1244 /* Multiply by 0 or Inf */
1245 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1249 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
1253 g_assert_not_reached();
1256 float16 QEMU_FLATTEN
float16_mul(float16 a
, float16 b
, float_status
*status
)
1258 FloatParts pa
= float16_unpack_canonical(a
, status
);
1259 FloatParts pb
= float16_unpack_canonical(b
, status
);
1260 FloatParts pr
= mul_floats(pa
, pb
, status
);
1262 return float16_round_pack_canonical(pr
, status
);
1265 static float32 QEMU_SOFTFLOAT_ATTR
1266 soft_f32_mul(float32 a
, float32 b
, float_status
*status
)
1268 FloatParts pa
= float32_unpack_canonical(a
, status
);
1269 FloatParts pb
= float32_unpack_canonical(b
, status
);
1270 FloatParts pr
= mul_floats(pa
, pb
, status
);
1272 return float32_round_pack_canonical(pr
, status
);
1275 static float64 QEMU_SOFTFLOAT_ATTR
1276 soft_f64_mul(float64 a
, float64 b
, float_status
*status
)
1278 FloatParts pa
= float64_unpack_canonical(a
, status
);
1279 FloatParts pb
= float64_unpack_canonical(b
, status
);
1280 FloatParts pr
= mul_floats(pa
, pb
, status
);
1282 return float64_round_pack_canonical(pr
, status
);
1285 static float hard_f32_mul(float a
, float b
)
1290 static double hard_f64_mul(double a
, double b
)
1295 float32 QEMU_FLATTEN
1296 float32_mul(float32 a
, float32 b
, float_status
*s
)
1298 return float32_gen2(a
, b
, s
, hard_f32_mul
, soft_f32_mul
,
1299 f32_is_zon2
, f32_addsubmul_post
);
1302 float64 QEMU_FLATTEN
1303 float64_mul(float64 a
, float64 b
, float_status
*s
)
1305 return float64_gen2(a
, b
, s
, hard_f64_mul
, soft_f64_mul
,
1306 f64_is_zon2
, f64_addsubmul_post
);
1310 * Returns the result of multiplying the bfloat16
1311 * values `a' and `b'.
1314 bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a
, bfloat16 b
, float_status
*status
)
1316 FloatParts pa
= bfloat16_unpack_canonical(a
, status
);
1317 FloatParts pb
= bfloat16_unpack_canonical(b
, status
);
1318 FloatParts pr
= mul_floats(pa
, pb
, status
);
1320 return bfloat16_round_pack_canonical(pr
, status
);
1324 * Returns the result of multiplying the floating-point values `a' and
1325 * `b' then adding 'c', with no intermediate rounding step after the
1326 * multiplication. The operation is performed according to the
1327 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1328 * The flags argument allows the caller to select negation of the
1329 * addend, the intermediate product, or the final result. (The
1330 * difference between this and having the caller do a separate
1331 * negation is that negating externally will flip the sign bit on
1335 static FloatParts
muladd_floats(FloatParts a
, FloatParts b
, FloatParts c
,
1336 int flags
, float_status
*s
)
1338 bool inf_zero
= ((1 << a
.cls
) | (1 << b
.cls
)) ==
1339 ((1 << float_class_inf
) | (1 << float_class_zero
));
1341 bool sign_flip
= flags
& float_muladd_negate_result
;
1346 /* It is implementation-defined whether the cases of (0,inf,qnan)
1347 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1348 * they return if they do), so we have to hand this information
1349 * off to the target-specific pick-a-NaN routine.
1351 if (is_nan(a
.cls
) || is_nan(b
.cls
) || is_nan(c
.cls
)) {
1352 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
1356 s
->float_exception_flags
|= float_flag_invalid
;
1357 return parts_default_nan(s
);
1360 if (flags
& float_muladd_negate_c
) {
1364 p_sign
= a
.sign
^ b
.sign
;
1366 if (flags
& float_muladd_negate_product
) {
1370 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_inf
) {
1371 p_class
= float_class_inf
;
1372 } else if (a
.cls
== float_class_zero
|| b
.cls
== float_class_zero
) {
1373 p_class
= float_class_zero
;
1375 p_class
= float_class_normal
;
1378 if (c
.cls
== float_class_inf
) {
1379 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
1380 s
->float_exception_flags
|= float_flag_invalid
;
1381 return parts_default_nan(s
);
1383 a
.cls
= float_class_inf
;
1384 a
.sign
= c
.sign
^ sign_flip
;
1389 if (p_class
== float_class_inf
) {
1390 a
.cls
= float_class_inf
;
1391 a
.sign
= p_sign
^ sign_flip
;
1395 if (p_class
== float_class_zero
) {
1396 if (c
.cls
== float_class_zero
) {
1397 if (p_sign
!= c
.sign
) {
1398 p_sign
= s
->float_rounding_mode
== float_round_down
;
1401 } else if (flags
& float_muladd_halve_result
) {
1404 c
.sign
^= sign_flip
;
1408 /* a & b should be normals now... */
1409 assert(a
.cls
== float_class_normal
&&
1410 b
.cls
== float_class_normal
);
1412 p_exp
= a
.exp
+ b
.exp
;
1414 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1417 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
1418 /* binary point now at bit 124 */
1420 /* check for overflow */
1421 if (hi
& (1ULL << (DECOMPOSED_BINARY_POINT
* 2 + 1 - 64))) {
1422 shift128RightJamming(hi
, lo
, 1, &hi
, &lo
);
1427 if (c
.cls
== float_class_zero
) {
1428 /* move binary point back to 62 */
1429 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1431 int exp_diff
= p_exp
- c
.exp
;
1432 if (p_sign
== c
.sign
) {
1434 if (exp_diff
<= 0) {
1435 shift128RightJamming(hi
, lo
,
1436 DECOMPOSED_BINARY_POINT
- exp_diff
,
1441 uint64_t c_hi
, c_lo
;
1442 /* shift c to the same binary point as the product (124) */
1445 shift128RightJamming(c_hi
, c_lo
,
1448 add128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1449 /* move binary point back to 62 */
1450 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1453 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1454 shift64RightJamming(lo
, 1, &lo
);
1460 uint64_t c_hi
, c_lo
;
1461 /* make C binary point match product at bit 124 */
1465 if (exp_diff
<= 0) {
1466 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1469 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1470 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1472 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1477 shift128RightJamming(c_hi
, c_lo
,
1480 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1483 if (hi
== 0 && lo
== 0) {
1484 a
.cls
= float_class_zero
;
1485 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1486 a
.sign
^= sign_flip
;
1493 shift
= clz64(lo
) + 64;
1495 /* Normalizing to a binary point of 124 is the
1496 correct adjust for the exponent. However since we're
1497 shifting, we might as well put the binary point back
1498 at 62 where we really want it. Therefore shift as
1499 if we're leaving 1 bit at the top of the word, but
1500 adjust the exponent as if we're leaving 3 bits. */
1503 lo
= lo
<< (shift
- 64);
1505 hi
= (hi
<< shift
) | (lo
>> (64 - shift
));
1506 lo
= hi
| ((lo
<< shift
) != 0);
1513 if (flags
& float_muladd_halve_result
) {
1517 /* finally prepare our result */
1518 a
.cls
= float_class_normal
;
1519 a
.sign
= p_sign
^ sign_flip
;
1526 float16 QEMU_FLATTEN
float16_muladd(float16 a
, float16 b
, float16 c
,
1527 int flags
, float_status
*status
)
1529 FloatParts pa
= float16_unpack_canonical(a
, status
);
1530 FloatParts pb
= float16_unpack_canonical(b
, status
);
1531 FloatParts pc
= float16_unpack_canonical(c
, status
);
1532 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1534 return float16_round_pack_canonical(pr
, status
);
1537 static float32 QEMU_SOFTFLOAT_ATTR
1538 soft_f32_muladd(float32 a
, float32 b
, float32 c
, int flags
,
1539 float_status
*status
)
1541 FloatParts pa
= float32_unpack_canonical(a
, status
);
1542 FloatParts pb
= float32_unpack_canonical(b
, status
);
1543 FloatParts pc
= float32_unpack_canonical(c
, status
);
1544 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1546 return float32_round_pack_canonical(pr
, status
);
1549 static float64 QEMU_SOFTFLOAT_ATTR
1550 soft_f64_muladd(float64 a
, float64 b
, float64 c
, int flags
,
1551 float_status
*status
)
1553 FloatParts pa
= float64_unpack_canonical(a
, status
);
1554 FloatParts pb
= float64_unpack_canonical(b
, status
);
1555 FloatParts pc
= float64_unpack_canonical(c
, status
);
1556 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1558 return float64_round_pack_canonical(pr
, status
);
1561 static bool force_soft_fma
;
1563 float32 QEMU_FLATTEN
1564 float32_muladd(float32 xa
, float32 xb
, float32 xc
, int flags
, float_status
*s
)
1566 union_float32 ua
, ub
, uc
, ur
;
1572 if (unlikely(!can_use_fpu(s
))) {
1575 if (unlikely(flags
& float_muladd_halve_result
)) {
1579 float32_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1580 if (unlikely(!f32_is_zon3(ua
, ub
, uc
))) {
1584 if (unlikely(force_soft_fma
)) {
1589 * When (a || b) == 0, there's no need to check for under/over flow,
1590 * since we know the addend is (normal || 0) and the product is 0.
1592 if (float32_is_zero(ua
.s
) || float32_is_zero(ub
.s
)) {
1596 prod_sign
= float32_is_neg(ua
.s
) ^ float32_is_neg(ub
.s
);
1597 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1598 up
.s
= float32_set_sign(float32_zero
, prod_sign
);
1600 if (flags
& float_muladd_negate_c
) {
1605 union_float32 ua_orig
= ua
;
1606 union_float32 uc_orig
= uc
;
1608 if (flags
& float_muladd_negate_product
) {
1611 if (flags
& float_muladd_negate_c
) {
1615 ur
.h
= fmaf(ua
.h
, ub
.h
, uc
.h
);
1617 if (unlikely(f32_is_inf(ur
))) {
1618 s
->float_exception_flags
|= float_flag_overflow
;
1619 } else if (unlikely(fabsf(ur
.h
) <= FLT_MIN
)) {
1625 if (flags
& float_muladd_negate_result
) {
1626 return float32_chs(ur
.s
);
1631 return soft_f32_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1634 float64 QEMU_FLATTEN
1635 float64_muladd(float64 xa
, float64 xb
, float64 xc
, int flags
, float_status
*s
)
1637 union_float64 ua
, ub
, uc
, ur
;
1643 if (unlikely(!can_use_fpu(s
))) {
1646 if (unlikely(flags
& float_muladd_halve_result
)) {
1650 float64_input_flush3(&ua
.s
, &ub
.s
, &uc
.s
, s
);
1651 if (unlikely(!f64_is_zon3(ua
, ub
, uc
))) {
1655 if (unlikely(force_soft_fma
)) {
1660 * When (a || b) == 0, there's no need to check for under/over flow,
1661 * since we know the addend is (normal || 0) and the product is 0.
1663 if (float64_is_zero(ua
.s
) || float64_is_zero(ub
.s
)) {
1667 prod_sign
= float64_is_neg(ua
.s
) ^ float64_is_neg(ub
.s
);
1668 prod_sign
^= !!(flags
& float_muladd_negate_product
);
1669 up
.s
= float64_set_sign(float64_zero
, prod_sign
);
1671 if (flags
& float_muladd_negate_c
) {
1676 union_float64 ua_orig
= ua
;
1677 union_float64 uc_orig
= uc
;
1679 if (flags
& float_muladd_negate_product
) {
1682 if (flags
& float_muladd_negate_c
) {
1686 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
1688 if (unlikely(f64_is_inf(ur
))) {
1689 s
->float_exception_flags
|= float_flag_overflow
;
1690 } else if (unlikely(fabs(ur
.h
) <= FLT_MIN
)) {
1696 if (flags
& float_muladd_negate_result
) {
1697 return float64_chs(ur
.s
);
1702 return soft_f64_muladd(ua
.s
, ub
.s
, uc
.s
, flags
, s
);
1706 * Returns the result of multiplying the bfloat16 values `a'
1707 * and `b' then adding 'c', with no intermediate rounding step after the
1711 bfloat16 QEMU_FLATTEN
bfloat16_muladd(bfloat16 a
, bfloat16 b
, bfloat16 c
,
1712 int flags
, float_status
*status
)
1714 FloatParts pa
= bfloat16_unpack_canonical(a
, status
);
1715 FloatParts pb
= bfloat16_unpack_canonical(b
, status
);
1716 FloatParts pc
= bfloat16_unpack_canonical(c
, status
);
1717 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1719 return bfloat16_round_pack_canonical(pr
, status
);
1723 * Returns the result of dividing the floating-point value `a' by the
1724 * corresponding value `b'. The operation is performed according to
1725 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1728 static FloatParts
div_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1730 bool sign
= a
.sign
^ b
.sign
;
1732 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1733 uint64_t n0
, n1
, q
, r
;
1734 int exp
= a
.exp
- b
.exp
;
1737 * We want a 2*N / N-bit division to produce exactly an N-bit
1738 * result, so that we do not lose any precision and so that we
1739 * do not have to renormalize afterward. If A.frac < B.frac,
1740 * then division would produce an (N-1)-bit result; shift A left
1741 * by one to produce the an N-bit result, and decrement the
1742 * exponent to match.
1744 * The udiv_qrnnd algorithm that we're using requires normalization,
1745 * i.e. the msb of the denominator must be set. Since we know that
1746 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1747 * by one (more), and the remainder must be shifted right by one.
1749 if (a
.frac
< b
.frac
) {
1751 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 2, &n1
, &n0
);
1753 shift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1, &n1
, &n0
);
1755 q
= udiv_qrnnd(&r
, n1
, n0
, b
.frac
<< 1);
1758 * Set lsb if there is a remainder, to set inexact.
1759 * As mentioned above, to find the actual value of the remainder we
1760 * would need to shift right, but (1) we are only concerned about
1761 * non-zero-ness, and (2) the remainder will always be even because
1762 * both inputs to the division primitive are even.
1764 a
.frac
= q
| (r
!= 0);
1769 /* handle all the NaN cases */
1770 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1771 return pick_nan(a
, b
, s
);
1773 /* 0/0 or Inf/Inf */
1776 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1777 s
->float_exception_flags
|= float_flag_invalid
;
1778 return parts_default_nan(s
);
1780 /* Inf / x or 0 / x */
1781 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1786 if (b
.cls
== float_class_zero
) {
1787 s
->float_exception_flags
|= float_flag_divbyzero
;
1788 a
.cls
= float_class_inf
;
1793 if (b
.cls
== float_class_inf
) {
1794 a
.cls
= float_class_zero
;
1798 g_assert_not_reached();
1801 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1803 FloatParts pa
= float16_unpack_canonical(a
, status
);
1804 FloatParts pb
= float16_unpack_canonical(b
, status
);
1805 FloatParts pr
= div_floats(pa
, pb
, status
);
1807 return float16_round_pack_canonical(pr
, status
);
1810 static float32 QEMU_SOFTFLOAT_ATTR
1811 soft_f32_div(float32 a
, float32 b
, float_status
*status
)
1813 FloatParts pa
= float32_unpack_canonical(a
, status
);
1814 FloatParts pb
= float32_unpack_canonical(b
, status
);
1815 FloatParts pr
= div_floats(pa
, pb
, status
);
1817 return float32_round_pack_canonical(pr
, status
);
1820 static float64 QEMU_SOFTFLOAT_ATTR
1821 soft_f64_div(float64 a
, float64 b
, float_status
*status
)
1823 FloatParts pa
= float64_unpack_canonical(a
, status
);
1824 FloatParts pb
= float64_unpack_canonical(b
, status
);
1825 FloatParts pr
= div_floats(pa
, pb
, status
);
1827 return float64_round_pack_canonical(pr
, status
);
1830 static float hard_f32_div(float a
, float b
)
1835 static double hard_f64_div(double a
, double b
)
1840 static bool f32_div_pre(union_float32 a
, union_float32 b
)
1842 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1843 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1844 fpclassify(b
.h
) == FP_NORMAL
;
1846 return float32_is_zero_or_normal(a
.s
) && float32_is_normal(b
.s
);
1849 static bool f64_div_pre(union_float64 a
, union_float64 b
)
1851 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1852 return (fpclassify(a
.h
) == FP_NORMAL
|| fpclassify(a
.h
) == FP_ZERO
) &&
1853 fpclassify(b
.h
) == FP_NORMAL
;
1855 return float64_is_zero_or_normal(a
.s
) && float64_is_normal(b
.s
);
1858 static bool f32_div_post(union_float32 a
, union_float32 b
)
1860 if (QEMU_HARDFLOAT_2F32_USE_FP
) {
1861 return fpclassify(a
.h
) != FP_ZERO
;
1863 return !float32_is_zero(a
.s
);
1866 static bool f64_div_post(union_float64 a
, union_float64 b
)
1868 if (QEMU_HARDFLOAT_2F64_USE_FP
) {
1869 return fpclassify(a
.h
) != FP_ZERO
;
1871 return !float64_is_zero(a
.s
);
1874 float32 QEMU_FLATTEN
1875 float32_div(float32 a
, float32 b
, float_status
*s
)
1877 return float32_gen2(a
, b
, s
, hard_f32_div
, soft_f32_div
,
1878 f32_div_pre
, f32_div_post
);
1881 float64 QEMU_FLATTEN
1882 float64_div(float64 a
, float64 b
, float_status
*s
)
1884 return float64_gen2(a
, b
, s
, hard_f64_div
, soft_f64_div
,
1885 f64_div_pre
, f64_div_post
);
1889 * Returns the result of dividing the bfloat16
1890 * value `a' by the corresponding value `b'.
1893 bfloat16
bfloat16_div(bfloat16 a
, bfloat16 b
, float_status
*status
)
1895 FloatParts pa
= bfloat16_unpack_canonical(a
, status
);
1896 FloatParts pb
= bfloat16_unpack_canonical(b
, status
);
1897 FloatParts pr
= div_floats(pa
, pb
, status
);
1899 return bfloat16_round_pack_canonical(pr
, status
);
1903 * Float to Float conversions
1905 * Returns the result of converting one float format to another. The
1906 * conversion is performed according to the IEC/IEEE Standard for
1907 * Binary Floating-Point Arithmetic.
1909 * The float_to_float helper only needs to take care of raising
1910 * invalid exceptions and handling the conversion on NaNs.
1913 static FloatParts
float_to_float(FloatParts a
, const FloatFmt
*dstf
,
1916 if (dstf
->arm_althp
) {
1918 case float_class_qnan
:
1919 case float_class_snan
:
1920 /* There is no NaN in the destination format. Raise Invalid
1921 * and return a zero with the sign of the input NaN.
1923 s
->float_exception_flags
|= float_flag_invalid
;
1924 a
.cls
= float_class_zero
;
1929 case float_class_inf
:
1930 /* There is no Inf in the destination format. Raise Invalid
1931 * and return the maximum normal with the correct sign.
1933 s
->float_exception_flags
|= float_flag_invalid
;
1934 a
.cls
= float_class_normal
;
1935 a
.exp
= dstf
->exp_max
;
1936 a
.frac
= ((1ull << dstf
->frac_size
) - 1) << dstf
->frac_shift
;
1942 } else if (is_nan(a
.cls
)) {
1943 if (is_snan(a
.cls
)) {
1944 s
->float_exception_flags
|= float_flag_invalid
;
1945 a
= parts_silence_nan(a
, s
);
1947 if (s
->default_nan_mode
) {
1948 return parts_default_nan(s
);
1954 float32
float16_to_float32(float16 a
, bool ieee
, float_status
*s
)
1956 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1957 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1958 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
1959 return float32_round_pack_canonical(pr
, s
);
1962 float64
float16_to_float64(float16 a
, bool ieee
, float_status
*s
)
1964 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1965 FloatParts p
= float16a_unpack_canonical(a
, s
, fmt16
);
1966 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1967 return float64_round_pack_canonical(pr
, s
);
1970 float16
float32_to_float16(float32 a
, bool ieee
, float_status
*s
)
1972 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
1973 FloatParts p
= float32_unpack_canonical(a
, s
);
1974 FloatParts pr
= float_to_float(p
, fmt16
, s
);
1975 return float16a_round_pack_canonical(pr
, s
, fmt16
);
1978 static float64 QEMU_SOFTFLOAT_ATTR
1979 soft_float32_to_float64(float32 a
, float_status
*s
)
1981 FloatParts p
= float32_unpack_canonical(a
, s
);
1982 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
1983 return float64_round_pack_canonical(pr
, s
);
1986 float64
float32_to_float64(float32 a
, float_status
*s
)
1988 if (likely(float32_is_normal(a
))) {
1989 /* Widening conversion can never produce inexact results. */
1995 } else if (float32_is_zero(a
)) {
1996 return float64_set_sign(float64_zero
, float32_is_neg(a
));
1998 return soft_float32_to_float64(a
, s
);
2002 float16
float64_to_float16(float64 a
, bool ieee
, float_status
*s
)
2004 const FloatFmt
*fmt16
= ieee
? &float16_params
: &float16_params_ahp
;
2005 FloatParts p
= float64_unpack_canonical(a
, s
);
2006 FloatParts pr
= float_to_float(p
, fmt16
, s
);
2007 return float16a_round_pack_canonical(pr
, s
, fmt16
);
2010 float32
float64_to_float32(float64 a
, float_status
*s
)
2012 FloatParts p
= float64_unpack_canonical(a
, s
);
2013 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
2014 return float32_round_pack_canonical(pr
, s
);
2017 float32
bfloat16_to_float32(bfloat16 a
, float_status
*s
)
2019 FloatParts p
= bfloat16_unpack_canonical(a
, s
);
2020 FloatParts pr
= float_to_float(p
, &float32_params
, s
);
2021 return float32_round_pack_canonical(pr
, s
);
2024 float64
bfloat16_to_float64(bfloat16 a
, float_status
*s
)
2026 FloatParts p
= bfloat16_unpack_canonical(a
, s
);
2027 FloatParts pr
= float_to_float(p
, &float64_params
, s
);
2028 return float64_round_pack_canonical(pr
, s
);
2031 bfloat16
float32_to_bfloat16(float32 a
, float_status
*s
)
2033 FloatParts p
= float32_unpack_canonical(a
, s
);
2034 FloatParts pr
= float_to_float(p
, &bfloat16_params
, s
);
2035 return bfloat16_round_pack_canonical(pr
, s
);
2038 bfloat16
float64_to_bfloat16(float64 a
, float_status
*s
)
2040 FloatParts p
= float64_unpack_canonical(a
, s
);
2041 FloatParts pr
= float_to_float(p
, &bfloat16_params
, s
);
2042 return bfloat16_round_pack_canonical(pr
, s
);
2046 * Rounds the floating-point value `a' to an integer, and returns the
2047 * result as a floating-point value. The operation is performed
2048 * according to the IEC/IEEE Standard for Binary Floating-Point
2052 static FloatParts
round_to_int(FloatParts a
, FloatRoundMode rmode
,
2053 int scale
, float_status
*s
)
2056 case float_class_qnan
:
2057 case float_class_snan
:
2058 return return_nan(a
, s
);
2060 case float_class_zero
:
2061 case float_class_inf
:
2062 /* already "integral" */
2065 case float_class_normal
:
2066 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2069 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
2070 /* already integral */
2075 /* all fractional */
2076 s
->float_exception_flags
|= float_flag_inexact
;
2078 case float_round_nearest_even
:
2079 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
2081 case float_round_ties_away
:
2082 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
2084 case float_round_to_zero
:
2087 case float_round_up
:
2090 case float_round_down
:
2093 case float_round_to_odd
:
2097 g_assert_not_reached();
2101 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
2104 a
.cls
= float_class_zero
;
2107 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
2108 uint64_t frac_lsbm1
= frac_lsb
>> 1;
2109 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
2110 uint64_t rnd_mask
= rnd_even_mask
>> 1;
2114 case float_round_nearest_even
:
2115 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
2117 case float_round_ties_away
:
2120 case float_round_to_zero
:
2123 case float_round_up
:
2124 inc
= a
.sign
? 0 : rnd_mask
;
2126 case float_round_down
:
2127 inc
= a
.sign
? rnd_mask
: 0;
2129 case float_round_to_odd
:
2130 inc
= a
.frac
& frac_lsb
? 0 : rnd_mask
;
2133 g_assert_not_reached();
2136 if (a
.frac
& rnd_mask
) {
2137 s
->float_exception_flags
|= float_flag_inexact
;
2139 a
.frac
&= ~rnd_mask
;
2140 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
2148 g_assert_not_reached();
2153 float16
float16_round_to_int(float16 a
, float_status
*s
)
2155 FloatParts pa
= float16_unpack_canonical(a
, s
);
2156 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2157 return float16_round_pack_canonical(pr
, s
);
2160 float32
float32_round_to_int(float32 a
, float_status
*s
)
2162 FloatParts pa
= float32_unpack_canonical(a
, s
);
2163 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2164 return float32_round_pack_canonical(pr
, s
);
2167 float64
float64_round_to_int(float64 a
, float_status
*s
)
2169 FloatParts pa
= float64_unpack_canonical(a
, s
);
2170 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2171 return float64_round_pack_canonical(pr
, s
);
2175 * Rounds the bfloat16 value `a' to an integer, and returns the
2176 * result as a bfloat16 value.
2179 bfloat16
bfloat16_round_to_int(bfloat16 a
, float_status
*s
)
2181 FloatParts pa
= bfloat16_unpack_canonical(a
, s
);
2182 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, 0, s
);
2183 return bfloat16_round_pack_canonical(pr
, s
);
2187 * Returns the result of converting the floating-point value `a' to
2188 * the two's complement integer format. The conversion is performed
2189 * according to the IEC/IEEE Standard for Binary Floating-Point
2190 * Arithmetic---which means in particular that the conversion is
2191 * rounded according to the current rounding mode. If `a' is a NaN,
2192 * the largest positive integer is returned. Otherwise, if the
2193 * conversion overflows, the largest integer with the same sign as `a'
2197 static int64_t round_to_int_and_pack(FloatParts in
, FloatRoundMode rmode
,
2198 int scale
, int64_t min
, int64_t max
,
2202 int orig_flags
= get_float_exception_flags(s
);
2203 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
2206 case float_class_snan
:
2207 case float_class_qnan
:
2208 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2210 case float_class_inf
:
2211 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2212 return p
.sign
? min
: max
;
2213 case float_class_zero
:
2215 case float_class_normal
:
2216 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
2217 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2218 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
2219 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
2224 if (r
<= -(uint64_t) min
) {
2227 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2234 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2239 g_assert_not_reached();
2243 int8_t float16_to_int8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2246 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2247 rmode
, scale
, INT8_MIN
, INT8_MAX
, s
);
2250 int16_t float16_to_int16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2253 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2254 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2257 int32_t float16_to_int32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2260 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2261 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2264 int64_t float16_to_int64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2267 return round_to_int_and_pack(float16_unpack_canonical(a
, s
),
2268 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2271 int16_t float32_to_int16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2274 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2275 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2278 int32_t float32_to_int32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2281 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2282 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2285 int64_t float32_to_int64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2288 return round_to_int_and_pack(float32_unpack_canonical(a
, s
),
2289 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2292 int16_t float64_to_int16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2295 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2296 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2299 int32_t float64_to_int32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2302 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2303 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2306 int64_t float64_to_int64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2309 return round_to_int_and_pack(float64_unpack_canonical(a
, s
),
2310 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2313 int8_t float16_to_int8(float16 a
, float_status
*s
)
2315 return float16_to_int8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2318 int16_t float16_to_int16(float16 a
, float_status
*s
)
2320 return float16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2323 int32_t float16_to_int32(float16 a
, float_status
*s
)
2325 return float16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2328 int64_t float16_to_int64(float16 a
, float_status
*s
)
2330 return float16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2333 int16_t float32_to_int16(float32 a
, float_status
*s
)
2335 return float32_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2338 int32_t float32_to_int32(float32 a
, float_status
*s
)
2340 return float32_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2343 int64_t float32_to_int64(float32 a
, float_status
*s
)
2345 return float32_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2348 int16_t float64_to_int16(float64 a
, float_status
*s
)
2350 return float64_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2353 int32_t float64_to_int32(float64 a
, float_status
*s
)
2355 return float64_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2358 int64_t float64_to_int64(float64 a
, float_status
*s
)
2360 return float64_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2363 int16_t float16_to_int16_round_to_zero(float16 a
, float_status
*s
)
2365 return float16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2368 int32_t float16_to_int32_round_to_zero(float16 a
, float_status
*s
)
2370 return float16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2373 int64_t float16_to_int64_round_to_zero(float16 a
, float_status
*s
)
2375 return float16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2378 int16_t float32_to_int16_round_to_zero(float32 a
, float_status
*s
)
2380 return float32_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2383 int32_t float32_to_int32_round_to_zero(float32 a
, float_status
*s
)
2385 return float32_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2388 int64_t float32_to_int64_round_to_zero(float32 a
, float_status
*s
)
2390 return float32_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2393 int16_t float64_to_int16_round_to_zero(float64 a
, float_status
*s
)
2395 return float64_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2398 int32_t float64_to_int32_round_to_zero(float64 a
, float_status
*s
)
2400 return float64_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2403 int64_t float64_to_int64_round_to_zero(float64 a
, float_status
*s
)
2405 return float64_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2409 * Returns the result of converting the floating-point value `a' to
2410 * the two's complement integer format.
2413 int16_t bfloat16_to_int16_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2416 return round_to_int_and_pack(bfloat16_unpack_canonical(a
, s
),
2417 rmode
, scale
, INT16_MIN
, INT16_MAX
, s
);
2420 int32_t bfloat16_to_int32_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2423 return round_to_int_and_pack(bfloat16_unpack_canonical(a
, s
),
2424 rmode
, scale
, INT32_MIN
, INT32_MAX
, s
);
2427 int64_t bfloat16_to_int64_scalbn(bfloat16 a
, FloatRoundMode rmode
, int scale
,
2430 return round_to_int_and_pack(bfloat16_unpack_canonical(a
, s
),
2431 rmode
, scale
, INT64_MIN
, INT64_MAX
, s
);
2434 int16_t bfloat16_to_int16(bfloat16 a
, float_status
*s
)
2436 return bfloat16_to_int16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2439 int32_t bfloat16_to_int32(bfloat16 a
, float_status
*s
)
2441 return bfloat16_to_int32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2444 int64_t bfloat16_to_int64(bfloat16 a
, float_status
*s
)
2446 return bfloat16_to_int64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2449 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a
, float_status
*s
)
2451 return bfloat16_to_int16_scalbn(a
, float_round_to_zero
, 0, s
);
2454 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a
, float_status
*s
)
2456 return bfloat16_to_int32_scalbn(a
, float_round_to_zero
, 0, s
);
2459 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a
, float_status
*s
)
2461 return bfloat16_to_int64_scalbn(a
, float_round_to_zero
, 0, s
);
2465 * Returns the result of converting the floating-point value `a' to
2466 * the unsigned integer format. The conversion is performed according
2467 * to the IEC/IEEE Standard for Binary Floating-Point
2468 * Arithmetic---which means in particular that the conversion is
2469 * rounded according to the current rounding mode. If `a' is a NaN,
2470 * the largest unsigned integer is returned. Otherwise, if the
2471 * conversion overflows, the largest unsigned integer is returned. If
2472 * the 'a' is negative, the result is rounded and zero is returned;
2473 * values that do not round to zero will raise the inexact exception
2477 static uint64_t round_to_uint_and_pack(FloatParts in
, FloatRoundMode rmode
,
2478 int scale
, uint64_t max
,
2481 int orig_flags
= get_float_exception_flags(s
);
2482 FloatParts p
= round_to_int(in
, rmode
, scale
, s
);
2486 case float_class_snan
:
2487 case float_class_qnan
:
2488 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2490 case float_class_inf
:
2491 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2492 return p
.sign
? 0 : max
;
2493 case float_class_zero
:
2495 case float_class_normal
:
2497 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2501 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
2502 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
2503 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
2504 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
2506 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2510 /* For uint64 this will never trip, but if p.exp is too large
2511 * to shift a decomposed fraction we shall have exited via the
2515 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
2520 g_assert_not_reached();
2524 uint8_t float16_to_uint8_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2527 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2528 rmode
, scale
, UINT8_MAX
, s
);
2531 uint16_t float16_to_uint16_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2534 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2535 rmode
, scale
, UINT16_MAX
, s
);
2538 uint32_t float16_to_uint32_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2541 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2542 rmode
, scale
, UINT32_MAX
, s
);
2545 uint64_t float16_to_uint64_scalbn(float16 a
, FloatRoundMode rmode
, int scale
,
2548 return round_to_uint_and_pack(float16_unpack_canonical(a
, s
),
2549 rmode
, scale
, UINT64_MAX
, s
);
2552 uint16_t float32_to_uint16_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2555 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2556 rmode
, scale
, UINT16_MAX
, s
);
2559 uint32_t float32_to_uint32_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2562 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2563 rmode
, scale
, UINT32_MAX
, s
);
2566 uint64_t float32_to_uint64_scalbn(float32 a
, FloatRoundMode rmode
, int scale
,
2569 return round_to_uint_and_pack(float32_unpack_canonical(a
, s
),
2570 rmode
, scale
, UINT64_MAX
, s
);
2573 uint16_t float64_to_uint16_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2576 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2577 rmode
, scale
, UINT16_MAX
, s
);
2580 uint32_t float64_to_uint32_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2583 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2584 rmode
, scale
, UINT32_MAX
, s
);
2587 uint64_t float64_to_uint64_scalbn(float64 a
, FloatRoundMode rmode
, int scale
,
2590 return round_to_uint_and_pack(float64_unpack_canonical(a
, s
),
2591 rmode
, scale
, UINT64_MAX
, s
);
2594 uint8_t float16_to_uint8(float16 a
, float_status
*s
)
2596 return float16_to_uint8_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2599 uint16_t float16_to_uint16(float16 a
, float_status
*s
)
2601 return float16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2604 uint32_t float16_to_uint32(float16 a
, float_status
*s
)
2606 return float16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2609 uint64_t float16_to_uint64(float16 a
, float_status
*s
)
2611 return float16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2614 uint16_t float32_to_uint16(float32 a
, float_status
*s
)
2616 return float32_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2619 uint32_t float32_to_uint32(float32 a
, float_status
*s
)
2621 return float32_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2624 uint64_t float32_to_uint64(float32 a
, float_status
*s
)
2626 return float32_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2629 uint16_t float64_to_uint16(float64 a
, float_status
*s
)
2631 return float64_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2634 uint32_t float64_to_uint32(float64 a
, float_status
*s
)
2636 return float64_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2639 uint64_t float64_to_uint64(float64 a
, float_status
*s
)
2641 return float64_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2644 uint16_t float16_to_uint16_round_to_zero(float16 a
, float_status
*s
)
2646 return float16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2649 uint32_t float16_to_uint32_round_to_zero(float16 a
, float_status
*s
)
2651 return float16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2654 uint64_t float16_to_uint64_round_to_zero(float16 a
, float_status
*s
)
2656 return float16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2659 uint16_t float32_to_uint16_round_to_zero(float32 a
, float_status
*s
)
2661 return float32_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2664 uint32_t float32_to_uint32_round_to_zero(float32 a
, float_status
*s
)
2666 return float32_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2669 uint64_t float32_to_uint64_round_to_zero(float32 a
, float_status
*s
)
2671 return float32_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2674 uint16_t float64_to_uint16_round_to_zero(float64 a
, float_status
*s
)
2676 return float64_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2679 uint32_t float64_to_uint32_round_to_zero(float64 a
, float_status
*s
)
2681 return float64_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2684 uint64_t float64_to_uint64_round_to_zero(float64 a
, float_status
*s
)
2686 return float64_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2690 * Returns the result of converting the bfloat16 value `a' to
2691 * the unsigned integer format.
2694 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2695 int scale
, float_status
*s
)
2697 return round_to_uint_and_pack(bfloat16_unpack_canonical(a
, s
),
2698 rmode
, scale
, UINT16_MAX
, s
);
2701 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2702 int scale
, float_status
*s
)
2704 return round_to_uint_and_pack(bfloat16_unpack_canonical(a
, s
),
2705 rmode
, scale
, UINT32_MAX
, s
);
2708 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a
, FloatRoundMode rmode
,
2709 int scale
, float_status
*s
)
2711 return round_to_uint_and_pack(bfloat16_unpack_canonical(a
, s
),
2712 rmode
, scale
, UINT64_MAX
, s
);
2715 uint16_t bfloat16_to_uint16(bfloat16 a
, float_status
*s
)
2717 return bfloat16_to_uint16_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2720 uint32_t bfloat16_to_uint32(bfloat16 a
, float_status
*s
)
2722 return bfloat16_to_uint32_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2725 uint64_t bfloat16_to_uint64(bfloat16 a
, float_status
*s
)
2727 return bfloat16_to_uint64_scalbn(a
, s
->float_rounding_mode
, 0, s
);
2730 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a
, float_status
*s
)
2732 return bfloat16_to_uint16_scalbn(a
, float_round_to_zero
, 0, s
);
2735 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a
, float_status
*s
)
2737 return bfloat16_to_uint32_scalbn(a
, float_round_to_zero
, 0, s
);
2740 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a
, float_status
*s
)
2742 return bfloat16_to_uint64_scalbn(a
, float_round_to_zero
, 0, s
);
2746 * Integer to float conversions
2748 * Returns the result of converting the two's complement integer `a'
2749 * to the floating-point format. The conversion is performed according
2750 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2753 static FloatParts
int_to_float(int64_t a
, int scale
, float_status
*status
)
2755 FloatParts r
= { .sign
= false };
2758 r
.cls
= float_class_zero
;
2763 r
.cls
= float_class_normal
;
2768 shift
= clz64(f
) - 1;
2769 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2771 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2772 r
.frac
= (shift
< 0 ? DECOMPOSED_IMPLICIT_BIT
: f
<< shift
);
2778 float16
int64_to_float16_scalbn(int64_t a
, int scale
, float_status
*status
)
2780 FloatParts pa
= int_to_float(a
, scale
, status
);
2781 return float16_round_pack_canonical(pa
, status
);
2784 float16
int32_to_float16_scalbn(int32_t a
, int scale
, float_status
*status
)
2786 return int64_to_float16_scalbn(a
, scale
, status
);
2789 float16
int16_to_float16_scalbn(int16_t a
, int scale
, float_status
*status
)
2791 return int64_to_float16_scalbn(a
, scale
, status
);
2794 float16
int64_to_float16(int64_t a
, float_status
*status
)
2796 return int64_to_float16_scalbn(a
, 0, status
);
2799 float16
int32_to_float16(int32_t a
, float_status
*status
)
2801 return int64_to_float16_scalbn(a
, 0, status
);
2804 float16
int16_to_float16(int16_t a
, float_status
*status
)
2806 return int64_to_float16_scalbn(a
, 0, status
);
2809 float16
int8_to_float16(int8_t a
, float_status
*status
)
2811 return int64_to_float16_scalbn(a
, 0, status
);
2814 float32
int64_to_float32_scalbn(int64_t a
, int scale
, float_status
*status
)
2816 FloatParts pa
= int_to_float(a
, scale
, status
);
2817 return float32_round_pack_canonical(pa
, status
);
2820 float32
int32_to_float32_scalbn(int32_t a
, int scale
, float_status
*status
)
2822 return int64_to_float32_scalbn(a
, scale
, status
);
2825 float32
int16_to_float32_scalbn(int16_t a
, int scale
, float_status
*status
)
2827 return int64_to_float32_scalbn(a
, scale
, status
);
2830 float32
int64_to_float32(int64_t a
, float_status
*status
)
2832 return int64_to_float32_scalbn(a
, 0, status
);
2835 float32
int32_to_float32(int32_t a
, float_status
*status
)
2837 return int64_to_float32_scalbn(a
, 0, status
);
2840 float32
int16_to_float32(int16_t a
, float_status
*status
)
2842 return int64_to_float32_scalbn(a
, 0, status
);
2845 float64
int64_to_float64_scalbn(int64_t a
, int scale
, float_status
*status
)
2847 FloatParts pa
= int_to_float(a
, scale
, status
);
2848 return float64_round_pack_canonical(pa
, status
);
2851 float64
int32_to_float64_scalbn(int32_t a
, int scale
, float_status
*status
)
2853 return int64_to_float64_scalbn(a
, scale
, status
);
2856 float64
int16_to_float64_scalbn(int16_t a
, int scale
, float_status
*status
)
2858 return int64_to_float64_scalbn(a
, scale
, status
);
2861 float64
int64_to_float64(int64_t a
, float_status
*status
)
2863 return int64_to_float64_scalbn(a
, 0, status
);
2866 float64
int32_to_float64(int32_t a
, float_status
*status
)
2868 return int64_to_float64_scalbn(a
, 0, status
);
2871 float64
int16_to_float64(int16_t a
, float_status
*status
)
2873 return int64_to_float64_scalbn(a
, 0, status
);
2877 * Returns the result of converting the two's complement integer `a'
2878 * to the bfloat16 format.
2881 bfloat16
int64_to_bfloat16_scalbn(int64_t a
, int scale
, float_status
*status
)
2883 FloatParts pa
= int_to_float(a
, scale
, status
);
2884 return bfloat16_round_pack_canonical(pa
, status
);
2887 bfloat16
int32_to_bfloat16_scalbn(int32_t a
, int scale
, float_status
*status
)
2889 return int64_to_bfloat16_scalbn(a
, scale
, status
);
2892 bfloat16
int16_to_bfloat16_scalbn(int16_t a
, int scale
, float_status
*status
)
2894 return int64_to_bfloat16_scalbn(a
, scale
, status
);
2897 bfloat16
int64_to_bfloat16(int64_t a
, float_status
*status
)
2899 return int64_to_bfloat16_scalbn(a
, 0, status
);
2902 bfloat16
int32_to_bfloat16(int32_t a
, float_status
*status
)
2904 return int64_to_bfloat16_scalbn(a
, 0, status
);
2907 bfloat16
int16_to_bfloat16(int16_t a
, float_status
*status
)
2909 return int64_to_bfloat16_scalbn(a
, 0, status
);
2913 * Unsigned Integer to float conversions
2915 * Returns the result of converting the unsigned integer `a' to the
2916 * floating-point format. The conversion is performed according to the
2917 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2920 static FloatParts
uint_to_float(uint64_t a
, int scale
, float_status
*status
)
2922 FloatParts r
= { .sign
= false };
2925 r
.cls
= float_class_zero
;
2927 scale
= MIN(MAX(scale
, -0x10000), 0x10000);
2928 r
.cls
= float_class_normal
;
2929 if ((int64_t)a
< 0) {
2930 r
.exp
= DECOMPOSED_BINARY_POINT
+ 1 + scale
;
2931 shift64RightJamming(a
, 1, &a
);
2934 int shift
= clz64(a
) - 1;
2935 r
.exp
= DECOMPOSED_BINARY_POINT
- shift
+ scale
;
2936 r
.frac
= a
<< shift
;
2943 float16
uint64_to_float16_scalbn(uint64_t a
, int scale
, float_status
*status
)
2945 FloatParts pa
= uint_to_float(a
, scale
, status
);
2946 return float16_round_pack_canonical(pa
, status
);
2949 float16
uint32_to_float16_scalbn(uint32_t a
, int scale
, float_status
*status
)
2951 return uint64_to_float16_scalbn(a
, scale
, status
);
2954 float16
uint16_to_float16_scalbn(uint16_t a
, int scale
, float_status
*status
)
2956 return uint64_to_float16_scalbn(a
, scale
, status
);
2959 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
2961 return uint64_to_float16_scalbn(a
, 0, status
);
2964 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
2966 return uint64_to_float16_scalbn(a
, 0, status
);
2969 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
2971 return uint64_to_float16_scalbn(a
, 0, status
);
2974 float16
uint8_to_float16(uint8_t a
, float_status
*status
)
2976 return uint64_to_float16_scalbn(a
, 0, status
);
2979 float32
uint64_to_float32_scalbn(uint64_t a
, int scale
, float_status
*status
)
2981 FloatParts pa
= uint_to_float(a
, scale
, status
);
2982 return float32_round_pack_canonical(pa
, status
);
2985 float32
uint32_to_float32_scalbn(uint32_t a
, int scale
, float_status
*status
)
2987 return uint64_to_float32_scalbn(a
, scale
, status
);
2990 float32
uint16_to_float32_scalbn(uint16_t a
, int scale
, float_status
*status
)
2992 return uint64_to_float32_scalbn(a
, scale
, status
);
2995 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
2997 return uint64_to_float32_scalbn(a
, 0, status
);
3000 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
3002 return uint64_to_float32_scalbn(a
, 0, status
);
3005 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
3007 return uint64_to_float32_scalbn(a
, 0, status
);
3010 float64
uint64_to_float64_scalbn(uint64_t a
, int scale
, float_status
*status
)
3012 FloatParts pa
= uint_to_float(a
, scale
, status
);
3013 return float64_round_pack_canonical(pa
, status
);
3016 float64
uint32_to_float64_scalbn(uint32_t a
, int scale
, float_status
*status
)
3018 return uint64_to_float64_scalbn(a
, scale
, status
);
3021 float64
uint16_to_float64_scalbn(uint16_t a
, int scale
, float_status
*status
)
3023 return uint64_to_float64_scalbn(a
, scale
, status
);
3026 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
3028 return uint64_to_float64_scalbn(a
, 0, status
);
3031 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
3033 return uint64_to_float64_scalbn(a
, 0, status
);
3036 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
3038 return uint64_to_float64_scalbn(a
, 0, status
);
3042 * Returns the result of converting the unsigned integer `a' to the
3046 bfloat16
uint64_to_bfloat16_scalbn(uint64_t a
, int scale
, float_status
*status
)
3048 FloatParts pa
= uint_to_float(a
, scale
, status
);
3049 return bfloat16_round_pack_canonical(pa
, status
);
3052 bfloat16
uint32_to_bfloat16_scalbn(uint32_t a
, int scale
, float_status
*status
)
3054 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3057 bfloat16
uint16_to_bfloat16_scalbn(uint16_t a
, int scale
, float_status
*status
)
3059 return uint64_to_bfloat16_scalbn(a
, scale
, status
);
3062 bfloat16
uint64_to_bfloat16(uint64_t a
, float_status
*status
)
3064 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3067 bfloat16
uint32_to_bfloat16(uint32_t a
, float_status
*status
)
3069 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3072 bfloat16
uint16_to_bfloat16(uint16_t a
, float_status
*status
)
3074 return uint64_to_bfloat16_scalbn(a
, 0, status
);
3078 /* min() and max() functions. These can't be implemented as
3079 * 'compare and pick one input' because that would mishandle
3080 * NaNs and +0 vs -0.
3082 * minnum() and maxnum() functions. These are similar to the min()
3083 * and max() functions but if one of the arguments is a QNaN and
3084 * the other is numerical then the numerical argument is returned.
3085 * SNaNs will get quietened before being returned.
3086 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3087 * and maxNum() operations. min() and max() are the typical min/max
3088 * semantics provided by many CPUs which predate that specification.
3090 * minnummag() and maxnummag() functions correspond to minNumMag()
3091 * and minNumMag() from the IEEE-754 2008.
3093 static FloatParts
minmax_floats(FloatParts a
, FloatParts b
, bool ismin
,
3094 bool ieee
, bool ismag
, float_status
*s
)
3096 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
3098 /* Takes two floating-point values `a' and `b', one of
3099 * which is a NaN, and returns the appropriate NaN
3100 * result. If either `a' or `b' is a signaling NaN,
3101 * the invalid exception is raised.
3103 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
3104 return pick_nan(a
, b
, s
);
3105 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
3107 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
3111 return pick_nan(a
, b
, s
);
3116 case float_class_normal
:
3119 case float_class_inf
:
3122 case float_class_zero
:
3126 g_assert_not_reached();
3130 case float_class_normal
:
3133 case float_class_inf
:
3136 case float_class_zero
:
3140 g_assert_not_reached();
3144 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
3145 bool a_less
= a_exp
< b_exp
;
3146 if (a_exp
== b_exp
) {
3147 a_less
= a
.frac
< b
.frac
;
3149 return a_less
^ ismin
? b
: a
;
3152 if (a
.sign
== b
.sign
) {
3153 bool a_less
= a_exp
< b_exp
;
3154 if (a_exp
== b_exp
) {
3155 a_less
= a
.frac
< b
.frac
;
3157 return a
.sign
^ a_less
^ ismin
? b
: a
;
3159 return a
.sign
^ ismin
? b
: a
;
3164 #define MINMAX(sz, name, ismin, isiee, ismag) \
3165 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
3168 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
3169 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
3170 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3172 return float ## sz ## _round_pack_canonical(pr, s); \
3175 MINMAX(16, min
, true, false, false)
3176 MINMAX(16, minnum
, true, true, false)
3177 MINMAX(16, minnummag
, true, true, true)
3178 MINMAX(16, max
, false, false, false)
3179 MINMAX(16, maxnum
, false, true, false)
3180 MINMAX(16, maxnummag
, false, true, true)
3182 MINMAX(32, min
, true, false, false)
3183 MINMAX(32, minnum
, true, true, false)
3184 MINMAX(32, minnummag
, true, true, true)
3185 MINMAX(32, max
, false, false, false)
3186 MINMAX(32, maxnum
, false, true, false)
3187 MINMAX(32, maxnummag
, false, true, true)
3189 MINMAX(64, min
, true, false, false)
3190 MINMAX(64, minnum
, true, true, false)
3191 MINMAX(64, minnummag
, true, true, true)
3192 MINMAX(64, max
, false, false, false)
3193 MINMAX(64, maxnum
, false, true, false)
3194 MINMAX(64, maxnummag
, false, true, true)
3198 #define BF16_MINMAX(name, ismin, isiee, ismag) \
3199 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \
3201 FloatParts pa = bfloat16_unpack_canonical(a, s); \
3202 FloatParts pb = bfloat16_unpack_canonical(b, s); \
3203 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
3205 return bfloat16_round_pack_canonical(pr, s); \
3208 BF16_MINMAX(min
, true, false, false)
3209 BF16_MINMAX(minnum
, true, true, false)
3210 BF16_MINMAX(minnummag
, true, true, true)
3211 BF16_MINMAX(max
, false, false, false)
3212 BF16_MINMAX(maxnum
, false, true, false)
3213 BF16_MINMAX(maxnummag
, false, true, true)
3217 /* Floating point compare */
3218 static FloatRelation
compare_floats(FloatParts a
, FloatParts b
, bool is_quiet
,
3221 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
3223 a
.cls
== float_class_snan
||
3224 b
.cls
== float_class_snan
) {
3225 s
->float_exception_flags
|= float_flag_invalid
;
3227 return float_relation_unordered
;
3230 if (a
.cls
== float_class_zero
) {
3231 if (b
.cls
== float_class_zero
) {
3232 return float_relation_equal
;
3234 return b
.sign
? float_relation_greater
: float_relation_less
;
3235 } else if (b
.cls
== float_class_zero
) {
3236 return a
.sign
? float_relation_less
: float_relation_greater
;
3239 /* The only really important thing about infinity is its sign. If
3240 * both are infinities the sign marks the smallest of the two.
3242 if (a
.cls
== float_class_inf
) {
3243 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
3244 return float_relation_equal
;
3246 return a
.sign
? float_relation_less
: float_relation_greater
;
3247 } else if (b
.cls
== float_class_inf
) {
3248 return b
.sign
? float_relation_greater
: float_relation_less
;
3251 if (a
.sign
!= b
.sign
) {
3252 return a
.sign
? float_relation_less
: float_relation_greater
;
3255 if (a
.exp
== b
.exp
) {
3256 if (a
.frac
== b
.frac
) {
3257 return float_relation_equal
;
3260 return a
.frac
> b
.frac
?
3261 float_relation_less
: float_relation_greater
;
3263 return a
.frac
> b
.frac
?
3264 float_relation_greater
: float_relation_less
;
3268 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
3270 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
3275 #define COMPARE(name, attr, sz) \
3277 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
3279 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
3280 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
3281 return compare_floats(pa, pb, is_quiet, s); \
3284 COMPARE(soft_f16_compare
, QEMU_FLATTEN
, 16)
3285 COMPARE(soft_f32_compare
, QEMU_SOFTFLOAT_ATTR
, 32)
3286 COMPARE(soft_f64_compare
, QEMU_SOFTFLOAT_ATTR
, 64)
3290 FloatRelation
float16_compare(float16 a
, float16 b
, float_status
*s
)
3292 return soft_f16_compare(a
, b
, false, s
);
3295 FloatRelation
float16_compare_quiet(float16 a
, float16 b
, float_status
*s
)
3297 return soft_f16_compare(a
, b
, true, s
);
3300 static FloatRelation QEMU_FLATTEN
3301 f32_compare(float32 xa
, float32 xb
, bool is_quiet
, float_status
*s
)
3303 union_float32 ua
, ub
;
3308 if (QEMU_NO_HARDFLOAT
) {
3312 float32_input_flush2(&ua
.s
, &ub
.s
, s
);
3313 if (isgreaterequal(ua
.h
, ub
.h
)) {
3314 if (isgreater(ua
.h
, ub
.h
)) {
3315 return float_relation_greater
;
3317 return float_relation_equal
;
3319 if (likely(isless(ua
.h
, ub
.h
))) {
3320 return float_relation_less
;
3322 /* The only condition remaining is unordered.
3323 * Fall through to set flags.
3326 return soft_f32_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3329 FloatRelation
float32_compare(float32 a
, float32 b
, float_status
*s
)
3331 return f32_compare(a
, b
, false, s
);
3334 FloatRelation
float32_compare_quiet(float32 a
, float32 b
, float_status
*s
)
3336 return f32_compare(a
, b
, true, s
);
3339 static FloatRelation QEMU_FLATTEN
3340 f64_compare(float64 xa
, float64 xb
, bool is_quiet
, float_status
*s
)
3342 union_float64 ua
, ub
;
3347 if (QEMU_NO_HARDFLOAT
) {
3351 float64_input_flush2(&ua
.s
, &ub
.s
, s
);
3352 if (isgreaterequal(ua
.h
, ub
.h
)) {
3353 if (isgreater(ua
.h
, ub
.h
)) {
3354 return float_relation_greater
;
3356 return float_relation_equal
;
3358 if (likely(isless(ua
.h
, ub
.h
))) {
3359 return float_relation_less
;
3361 /* The only condition remaining is unordered.
3362 * Fall through to set flags.
3365 return soft_f64_compare(ua
.s
, ub
.s
, is_quiet
, s
);
3368 FloatRelation
float64_compare(float64 a
, float64 b
, float_status
*s
)
3370 return f64_compare(a
, b
, false, s
);
3373 FloatRelation
float64_compare_quiet(float64 a
, float64 b
, float_status
*s
)
3375 return f64_compare(a
, b
, true, s
);
3378 static FloatRelation QEMU_FLATTEN
3379 soft_bf16_compare(bfloat16 a
, bfloat16 b
, bool is_quiet
, float_status
*s
)
3381 FloatParts pa
= bfloat16_unpack_canonical(a
, s
);
3382 FloatParts pb
= bfloat16_unpack_canonical(b
, s
);
3383 return compare_floats(pa
, pb
, is_quiet
, s
);
3386 FloatRelation
bfloat16_compare(bfloat16 a
, bfloat16 b
, float_status
*s
)
3388 return soft_bf16_compare(a
, b
, false, s
);
3391 FloatRelation
bfloat16_compare_quiet(bfloat16 a
, bfloat16 b
, float_status
*s
)
3393 return soft_bf16_compare(a
, b
, true, s
);
3396 /* Multiply A by 2 raised to the power N. */
3397 static FloatParts
scalbn_decomposed(FloatParts a
, int n
, float_status
*s
)
3399 if (unlikely(is_nan(a
.cls
))) {
3400 return return_nan(a
, s
);
3402 if (a
.cls
== float_class_normal
) {
3403 /* The largest float type (even though not supported by FloatParts)
3404 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3405 * still allows rounding to infinity, without allowing overflow
3406 * within the int32_t that backs FloatParts.exp.
3408 n
= MIN(MAX(n
, -0x10000), 0x10000);
3414 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
3416 FloatParts pa
= float16_unpack_canonical(a
, status
);
3417 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3418 return float16_round_pack_canonical(pr
, status
);
3421 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
3423 FloatParts pa
= float32_unpack_canonical(a
, status
);
3424 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3425 return float32_round_pack_canonical(pr
, status
);
3428 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
3430 FloatParts pa
= float64_unpack_canonical(a
, status
);
3431 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3432 return float64_round_pack_canonical(pr
, status
);
3435 bfloat16
bfloat16_scalbn(bfloat16 a
, int n
, float_status
*status
)
3437 FloatParts pa
= bfloat16_unpack_canonical(a
, status
);
3438 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
3439 return bfloat16_round_pack_canonical(pr
, status
);
3445 * The old softfloat code did an approximation step before zeroing in
3446 * on the final result. However for simpleness we just compute the
3447 * square root by iterating down from the implicit bit to enough extra
3448 * bits to ensure we get a correctly rounded result.
3450 * This does mean however the calculation is slower than before,
3451 * especially for 64 bit floats.
3454 static FloatParts
sqrt_float(FloatParts a
, float_status
*s
, const FloatFmt
*p
)
3456 uint64_t a_frac
, r_frac
, s_frac
;
3459 if (is_nan(a
.cls
)) {
3460 return return_nan(a
, s
);
3462 if (a
.cls
== float_class_zero
) {
3463 return a
; /* sqrt(+-0) = +-0 */
3466 s
->float_exception_flags
|= float_flag_invalid
;
3467 return parts_default_nan(s
);
3469 if (a
.cls
== float_class_inf
) {
3470 return a
; /* sqrt(+inf) = +inf */
3473 assert(a
.cls
== float_class_normal
);
3475 /* We need two overflow bits at the top. Adding room for that is a
3476 * right shift. If the exponent is odd, we can discard the low bit
3477 * by multiplying the fraction by 2; that's a left shift. Combine
3478 * those and we shift right if the exponent is even.
3486 /* Bit-by-bit computation of sqrt. */
3490 /* Iterate from implicit bit down to the 3 extra bits to compute a
3491 * properly rounded result. Remember we've inserted one more bit
3492 * at the top, so these positions are one less.
3494 bit
= DECOMPOSED_BINARY_POINT
- 1;
3495 last_bit
= MAX(p
->frac_shift
- 4, 0);
3497 uint64_t q
= 1ULL << bit
;
3498 uint64_t t_frac
= s_frac
+ q
;
3499 if (t_frac
<= a_frac
) {
3500 s_frac
= t_frac
+ q
;
3505 } while (--bit
>= last_bit
);
3507 /* Undo the right shift done above. If there is any remaining
3508 * fraction, the result is inexact. Set the sticky bit.
3510 a
.frac
= (r_frac
<< 1) + (a_frac
!= 0);
3515 float16 QEMU_FLATTEN
float16_sqrt(float16 a
, float_status
*status
)
3517 FloatParts pa
= float16_unpack_canonical(a
, status
);
3518 FloatParts pr
= sqrt_float(pa
, status
, &float16_params
);
3519 return float16_round_pack_canonical(pr
, status
);
3522 static float32 QEMU_SOFTFLOAT_ATTR
3523 soft_f32_sqrt(float32 a
, float_status
*status
)
3525 FloatParts pa
= float32_unpack_canonical(a
, status
);
3526 FloatParts pr
= sqrt_float(pa
, status
, &float32_params
);
3527 return float32_round_pack_canonical(pr
, status
);
3530 static float64 QEMU_SOFTFLOAT_ATTR
3531 soft_f64_sqrt(float64 a
, float_status
*status
)
3533 FloatParts pa
= float64_unpack_canonical(a
, status
);
3534 FloatParts pr
= sqrt_float(pa
, status
, &float64_params
);
3535 return float64_round_pack_canonical(pr
, status
);
3538 float32 QEMU_FLATTEN
float32_sqrt(float32 xa
, float_status
*s
)
3540 union_float32 ua
, ur
;
3543 if (unlikely(!can_use_fpu(s
))) {
3547 float32_input_flush1(&ua
.s
, s
);
3548 if (QEMU_HARDFLOAT_1F32_USE_FP
) {
3549 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3550 fpclassify(ua
.h
) == FP_ZERO
) ||
3554 } else if (unlikely(!float32_is_zero_or_normal(ua
.s
) ||
3555 float32_is_neg(ua
.s
))) {
3562 return soft_f32_sqrt(ua
.s
, s
);
3565 float64 QEMU_FLATTEN
float64_sqrt(float64 xa
, float_status
*s
)
3567 union_float64 ua
, ur
;
3570 if (unlikely(!can_use_fpu(s
))) {
3574 float64_input_flush1(&ua
.s
, s
);
3575 if (QEMU_HARDFLOAT_1F64_USE_FP
) {
3576 if (unlikely(!(fpclassify(ua
.h
) == FP_NORMAL
||
3577 fpclassify(ua
.h
) == FP_ZERO
) ||
3581 } else if (unlikely(!float64_is_zero_or_normal(ua
.s
) ||
3582 float64_is_neg(ua
.s
))) {
3589 return soft_f64_sqrt(ua
.s
, s
);
3592 bfloat16 QEMU_FLATTEN
bfloat16_sqrt(bfloat16 a
, float_status
*status
)
3594 FloatParts pa
= bfloat16_unpack_canonical(a
, status
);
3595 FloatParts pr
= sqrt_float(pa
, status
, &bfloat16_params
);
3596 return bfloat16_round_pack_canonical(pr
, status
);
3599 /*----------------------------------------------------------------------------
3600 | The pattern for a default generated NaN.
3601 *----------------------------------------------------------------------------*/
3603 float16
float16_default_nan(float_status
*status
)
3605 FloatParts p
= parts_default_nan(status
);
3606 p
.frac
>>= float16_params
.frac_shift
;
3607 return float16_pack_raw(p
);
3610 float32
float32_default_nan(float_status
*status
)
3612 FloatParts p
= parts_default_nan(status
);
3613 p
.frac
>>= float32_params
.frac_shift
;
3614 return float32_pack_raw(p
);
3617 float64
float64_default_nan(float_status
*status
)
3619 FloatParts p
= parts_default_nan(status
);
3620 p
.frac
>>= float64_params
.frac_shift
;
3621 return float64_pack_raw(p
);
3624 float128
float128_default_nan(float_status
*status
)
3626 FloatParts p
= parts_default_nan(status
);
3629 /* Extrapolate from the choices made by parts_default_nan to fill
3630 * in the quad-floating format. If the low bit is set, assume we
3631 * want to set all non-snan bits.
3633 r
.low
= -(p
.frac
& 1);
3634 r
.high
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- 48);
3635 r
.high
|= UINT64_C(0x7FFF000000000000);
3636 r
.high
|= (uint64_t)p
.sign
<< 63;
3641 bfloat16
bfloat16_default_nan(float_status
*status
)
3643 FloatParts p
= parts_default_nan(status
);
3644 p
.frac
>>= bfloat16_params
.frac_shift
;
3645 return bfloat16_pack_raw(p
);
3648 /*----------------------------------------------------------------------------
3649 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3650 *----------------------------------------------------------------------------*/
3652 float16
float16_silence_nan(float16 a
, float_status
*status
)
3654 FloatParts p
= float16_unpack_raw(a
);
3655 p
.frac
<<= float16_params
.frac_shift
;
3656 p
= parts_silence_nan(p
, status
);
3657 p
.frac
>>= float16_params
.frac_shift
;
3658 return float16_pack_raw(p
);
3661 float32
float32_silence_nan(float32 a
, float_status
*status
)
3663 FloatParts p
= float32_unpack_raw(a
);
3664 p
.frac
<<= float32_params
.frac_shift
;
3665 p
= parts_silence_nan(p
, status
);
3666 p
.frac
>>= float32_params
.frac_shift
;
3667 return float32_pack_raw(p
);
3670 float64
float64_silence_nan(float64 a
, float_status
*status
)
3672 FloatParts p
= float64_unpack_raw(a
);
3673 p
.frac
<<= float64_params
.frac_shift
;
3674 p
= parts_silence_nan(p
, status
);
3675 p
.frac
>>= float64_params
.frac_shift
;
3676 return float64_pack_raw(p
);
3679 bfloat16
bfloat16_silence_nan(bfloat16 a
, float_status
*status
)
3681 FloatParts p
= bfloat16_unpack_raw(a
);
3682 p
.frac
<<= bfloat16_params
.frac_shift
;
3683 p
= parts_silence_nan(p
, status
);
3684 p
.frac
>>= bfloat16_params
.frac_shift
;
3685 return bfloat16_pack_raw(p
);
3688 /*----------------------------------------------------------------------------
3689 | If `a' is denormal and we are in flush-to-zero mode then set the
3690 | input-denormal exception and return zero. Otherwise just return the value.
3691 *----------------------------------------------------------------------------*/
3693 static bool parts_squash_denormal(FloatParts p
, float_status
*status
)
3695 if (p
.exp
== 0 && p
.frac
!= 0) {
3696 float_raise(float_flag_input_denormal
, status
);
3703 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3705 if (status
->flush_inputs_to_zero
) {
3706 FloatParts p
= float16_unpack_raw(a
);
3707 if (parts_squash_denormal(p
, status
)) {
3708 return float16_set_sign(float16_zero
, p
.sign
);
3714 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
3716 if (status
->flush_inputs_to_zero
) {
3717 FloatParts p
= float32_unpack_raw(a
);
3718 if (parts_squash_denormal(p
, status
)) {
3719 return float32_set_sign(float32_zero
, p
.sign
);
3725 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
3727 if (status
->flush_inputs_to_zero
) {
3728 FloatParts p
= float64_unpack_raw(a
);
3729 if (parts_squash_denormal(p
, status
)) {
3730 return float64_set_sign(float64_zero
, p
.sign
);
3736 bfloat16
bfloat16_squash_input_denormal(bfloat16 a
, float_status
*status
)
3738 if (status
->flush_inputs_to_zero
) {
3739 FloatParts p
= bfloat16_unpack_raw(a
);
3740 if (parts_squash_denormal(p
, status
)) {
3741 return bfloat16_set_sign(bfloat16_zero
, p
.sign
);
3747 /*----------------------------------------------------------------------------
3748 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3749 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3750 | input. If `zSign' is 1, the input is negated before being converted to an
3751 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3752 | is simply rounded to an integer, with the inexact exception raised if the
3753 | input cannot be represented exactly as an integer. However, if the fixed-
3754 | point input is too large, the invalid exception is raised and the largest
3755 | positive or negative integer is returned.
3756 *----------------------------------------------------------------------------*/
3758 static int32_t roundAndPackInt32(bool zSign
, uint64_t absZ
,
3759 float_status
*status
)
3761 int8_t roundingMode
;
3762 bool roundNearestEven
;
3763 int8_t roundIncrement
, roundBits
;
3766 roundingMode
= status
->float_rounding_mode
;
3767 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3768 switch (roundingMode
) {
3769 case float_round_nearest_even
:
3770 case float_round_ties_away
:
3771 roundIncrement
= 0x40;
3773 case float_round_to_zero
:
3776 case float_round_up
:
3777 roundIncrement
= zSign
? 0 : 0x7f;
3779 case float_round_down
:
3780 roundIncrement
= zSign
? 0x7f : 0;
3782 case float_round_to_odd
:
3783 roundIncrement
= absZ
& 0x80 ? 0 : 0x7f;
3788 roundBits
= absZ
& 0x7F;
3789 absZ
= ( absZ
+ roundIncrement
)>>7;
3790 if (!(roundBits
^ 0x40) && roundNearestEven
) {
3794 if ( zSign
) z
= - z
;
3795 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
3796 float_raise(float_flag_invalid
, status
);
3797 return zSign
? INT32_MIN
: INT32_MAX
;
3800 status
->float_exception_flags
|= float_flag_inexact
;
3806 /*----------------------------------------------------------------------------
3807 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3808 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3809 | and returns the properly rounded 64-bit integer corresponding to the input.
3810 | If `zSign' is 1, the input is negated before being converted to an integer.
3811 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3812 | the inexact exception raised if the input cannot be represented exactly as
3813 | an integer. However, if the fixed-point input is too large, the invalid
3814 | exception is raised and the largest positive or negative integer is
3816 *----------------------------------------------------------------------------*/
3818 static int64_t roundAndPackInt64(bool zSign
, uint64_t absZ0
, uint64_t absZ1
,
3819 float_status
*status
)
3821 int8_t roundingMode
;
3822 bool roundNearestEven
, increment
;
3825 roundingMode
= status
->float_rounding_mode
;
3826 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3827 switch (roundingMode
) {
3828 case float_round_nearest_even
:
3829 case float_round_ties_away
:
3830 increment
= ((int64_t) absZ1
< 0);
3832 case float_round_to_zero
:
3835 case float_round_up
:
3836 increment
= !zSign
&& absZ1
;
3838 case float_round_down
:
3839 increment
= zSign
&& absZ1
;
3841 case float_round_to_odd
:
3842 increment
= !(absZ0
& 1) && absZ1
;
3849 if ( absZ0
== 0 ) goto overflow
;
3850 if (!(absZ1
<< 1) && roundNearestEven
) {
3855 if ( zSign
) z
= - z
;
3856 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
3858 float_raise(float_flag_invalid
, status
);
3859 return zSign
? INT64_MIN
: INT64_MAX
;
3862 status
->float_exception_flags
|= float_flag_inexact
;
3868 /*----------------------------------------------------------------------------
3869 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3870 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3871 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3872 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3873 | with the inexact exception raised if the input cannot be represented exactly
3874 | as an integer. However, if the fixed-point input is too large, the invalid
3875 | exception is raised and the largest unsigned integer is returned.
3876 *----------------------------------------------------------------------------*/
3878 static int64_t roundAndPackUint64(bool zSign
, uint64_t absZ0
,
3879 uint64_t absZ1
, float_status
*status
)
3881 int8_t roundingMode
;
3882 bool roundNearestEven
, increment
;
3884 roundingMode
= status
->float_rounding_mode
;
3885 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
3886 switch (roundingMode
) {
3887 case float_round_nearest_even
:
3888 case float_round_ties_away
:
3889 increment
= ((int64_t)absZ1
< 0);
3891 case float_round_to_zero
:
3894 case float_round_up
:
3895 increment
= !zSign
&& absZ1
;
3897 case float_round_down
:
3898 increment
= zSign
&& absZ1
;
3900 case float_round_to_odd
:
3901 increment
= !(absZ0
& 1) && absZ1
;
3909 float_raise(float_flag_invalid
, status
);
3912 if (!(absZ1
<< 1) && roundNearestEven
) {
3917 if (zSign
&& absZ0
) {
3918 float_raise(float_flag_invalid
, status
);
3923 status
->float_exception_flags
|= float_flag_inexact
;
3928 /*----------------------------------------------------------------------------
3929 | Normalizes the subnormal single-precision floating-point value represented
3930 | by the denormalized significand `aSig'. The normalized exponent and
3931 | significand are stored at the locations pointed to by `zExpPtr' and
3932 | `zSigPtr', respectively.
3933 *----------------------------------------------------------------------------*/
3936 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
3940 shiftCount
= clz32(aSig
) - 8;
3941 *zSigPtr
= aSig
<<shiftCount
;
3942 *zExpPtr
= 1 - shiftCount
;
3946 /*----------------------------------------------------------------------------
3947 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3948 | and significand `zSig', and returns the proper single-precision floating-
3949 | point value corresponding to the abstract input. Ordinarily, the abstract
3950 | value is simply rounded and packed into the single-precision format, with
3951 | the inexact exception raised if the abstract input cannot be represented
3952 | exactly. However, if the abstract value is too large, the overflow and
3953 | inexact exceptions are raised and an infinity or maximal finite value is
3954 | returned. If the abstract value is too small, the input value is rounded to
3955 | a subnormal number, and the underflow and inexact exceptions are raised if
3956 | the abstract input cannot be represented exactly as a subnormal single-
3957 | precision floating-point number.
3958 | The input significand `zSig' has its binary point between bits 30
3959 | and 29, which is 7 bits to the left of the usual location. This shifted
3960 | significand must be normalized or smaller. If `zSig' is not normalized,
3961 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3962 | and it must not require rounding. In the usual case that `zSig' is
3963 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3964 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3965 | Binary Floating-Point Arithmetic.
3966 *----------------------------------------------------------------------------*/
3968 static float32
roundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
3969 float_status
*status
)
3971 int8_t roundingMode
;
3972 bool roundNearestEven
;
3973 int8_t roundIncrement
, roundBits
;
3976 roundingMode
= status
->float_rounding_mode
;
3977 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
3978 switch (roundingMode
) {
3979 case float_round_nearest_even
:
3980 case float_round_ties_away
:
3981 roundIncrement
= 0x40;
3983 case float_round_to_zero
:
3986 case float_round_up
:
3987 roundIncrement
= zSign
? 0 : 0x7f;
3989 case float_round_down
:
3990 roundIncrement
= zSign
? 0x7f : 0;
3992 case float_round_to_odd
:
3993 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
3999 roundBits
= zSig
& 0x7F;
4000 if ( 0xFD <= (uint16_t) zExp
) {
4001 if ( ( 0xFD < zExp
)
4002 || ( ( zExp
== 0xFD )
4003 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
4005 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4006 roundIncrement
!= 0;
4007 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4008 return packFloat32(zSign
, 0xFF, -!overflow_to_inf
);
4011 if (status
->flush_to_zero
) {
4012 float_raise(float_flag_output_denormal
, status
);
4013 return packFloat32(zSign
, 0, 0);
4015 isTiny
= status
->tininess_before_rounding
4017 || (zSig
+ roundIncrement
< 0x80000000);
4018 shift32RightJamming( zSig
, - zExp
, &zSig
);
4020 roundBits
= zSig
& 0x7F;
4021 if (isTiny
&& roundBits
) {
4022 float_raise(float_flag_underflow
, status
);
4024 if (roundingMode
== float_round_to_odd
) {
4026 * For round-to-odd case, the roundIncrement depends on
4027 * zSig which just changed.
4029 roundIncrement
= zSig
& 0x80 ? 0 : 0x7f;
4034 status
->float_exception_flags
|= float_flag_inexact
;
4036 zSig
= ( zSig
+ roundIncrement
)>>7;
4037 if (!(roundBits
^ 0x40) && roundNearestEven
) {
4040 if ( zSig
== 0 ) zExp
= 0;
4041 return packFloat32( zSign
, zExp
, zSig
);
4045 /*----------------------------------------------------------------------------
4046 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4047 | and significand `zSig', and returns the proper single-precision floating-
4048 | point value corresponding to the abstract input. This routine is just like
4049 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4050 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4051 | floating-point exponent.
4052 *----------------------------------------------------------------------------*/
4055 normalizeRoundAndPackFloat32(bool zSign
, int zExp
, uint32_t zSig
,
4056 float_status
*status
)
4060 shiftCount
= clz32(zSig
) - 1;
4061 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4066 /*----------------------------------------------------------------------------
4067 | Normalizes the subnormal double-precision floating-point value represented
4068 | by the denormalized significand `aSig'. The normalized exponent and
4069 | significand are stored at the locations pointed to by `zExpPtr' and
4070 | `zSigPtr', respectively.
4071 *----------------------------------------------------------------------------*/
4074 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
4078 shiftCount
= clz64(aSig
) - 11;
4079 *zSigPtr
= aSig
<<shiftCount
;
4080 *zExpPtr
= 1 - shiftCount
;
4084 /*----------------------------------------------------------------------------
4085 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4086 | double-precision floating-point value, returning the result. After being
4087 | shifted into the proper positions, the three fields are simply added
4088 | together to form the result. This means that any integer portion of `zSig'
4089 | will be added into the exponent. Since a properly normalized significand
4090 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4091 | than the desired result exponent whenever `zSig' is a complete, normalized
4093 *----------------------------------------------------------------------------*/
4095 static inline float64
packFloat64(bool zSign
, int zExp
, uint64_t zSig
)
4098 return make_float64(
4099 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
4103 /*----------------------------------------------------------------------------
4104 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4105 | and significand `zSig', and returns the proper double-precision floating-
4106 | point value corresponding to the abstract input. Ordinarily, the abstract
4107 | value is simply rounded and packed into the double-precision format, with
4108 | the inexact exception raised if the abstract input cannot be represented
4109 | exactly. However, if the abstract value is too large, the overflow and
4110 | inexact exceptions are raised and an infinity or maximal finite value is
4111 | returned. If the abstract value is too small, the input value is rounded to
4112 | a subnormal number, and the underflow and inexact exceptions are raised if
4113 | the abstract input cannot be represented exactly as a subnormal double-
4114 | precision floating-point number.
4115 | The input significand `zSig' has its binary point between bits 62
4116 | and 61, which is 10 bits to the left of the usual location. This shifted
4117 | significand must be normalized or smaller. If `zSig' is not normalized,
4118 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4119 | and it must not require rounding. In the usual case that `zSig' is
4120 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4121 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4122 | Binary Floating-Point Arithmetic.
4123 *----------------------------------------------------------------------------*/
4125 static float64
roundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4126 float_status
*status
)
4128 int8_t roundingMode
;
4129 bool roundNearestEven
;
4130 int roundIncrement
, roundBits
;
4133 roundingMode
= status
->float_rounding_mode
;
4134 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4135 switch (roundingMode
) {
4136 case float_round_nearest_even
:
4137 case float_round_ties_away
:
4138 roundIncrement
= 0x200;
4140 case float_round_to_zero
:
4143 case float_round_up
:
4144 roundIncrement
= zSign
? 0 : 0x3ff;
4146 case float_round_down
:
4147 roundIncrement
= zSign
? 0x3ff : 0;
4149 case float_round_to_odd
:
4150 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4155 roundBits
= zSig
& 0x3FF;
4156 if ( 0x7FD <= (uint16_t) zExp
) {
4157 if ( ( 0x7FD < zExp
)
4158 || ( ( zExp
== 0x7FD )
4159 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
4161 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
4162 roundIncrement
!= 0;
4163 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4164 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
4167 if (status
->flush_to_zero
) {
4168 float_raise(float_flag_output_denormal
, status
);
4169 return packFloat64(zSign
, 0, 0);
4171 isTiny
= status
->tininess_before_rounding
4173 || (zSig
+ roundIncrement
< UINT64_C(0x8000000000000000));
4174 shift64RightJamming( zSig
, - zExp
, &zSig
);
4176 roundBits
= zSig
& 0x3FF;
4177 if (isTiny
&& roundBits
) {
4178 float_raise(float_flag_underflow
, status
);
4180 if (roundingMode
== float_round_to_odd
) {
4182 * For round-to-odd case, the roundIncrement depends on
4183 * zSig which just changed.
4185 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
4190 status
->float_exception_flags
|= float_flag_inexact
;
4192 zSig
= ( zSig
+ roundIncrement
)>>10;
4193 if (!(roundBits
^ 0x200) && roundNearestEven
) {
4196 if ( zSig
== 0 ) zExp
= 0;
4197 return packFloat64( zSign
, zExp
, zSig
);
4201 /*----------------------------------------------------------------------------
4202 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4203 | and significand `zSig', and returns the proper double-precision floating-
4204 | point value corresponding to the abstract input. This routine is just like
4205 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4206 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4207 | floating-point exponent.
4208 *----------------------------------------------------------------------------*/
4211 normalizeRoundAndPackFloat64(bool zSign
, int zExp
, uint64_t zSig
,
4212 float_status
*status
)
4216 shiftCount
= clz64(zSig
) - 1;
4217 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
4222 /*----------------------------------------------------------------------------
4223 | Normalizes the subnormal extended double-precision floating-point value
4224 | represented by the denormalized significand `aSig'. The normalized exponent
4225 | and significand are stored at the locations pointed to by `zExpPtr' and
4226 | `zSigPtr', respectively.
4227 *----------------------------------------------------------------------------*/
4229 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
4234 shiftCount
= clz64(aSig
);
4235 *zSigPtr
= aSig
<<shiftCount
;
4236 *zExpPtr
= 1 - shiftCount
;
4239 /*----------------------------------------------------------------------------
4240 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4241 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4242 | and returns the proper extended double-precision floating-point value
4243 | corresponding to the abstract input. Ordinarily, the abstract value is
4244 | rounded and packed into the extended double-precision format, with the
4245 | inexact exception raised if the abstract input cannot be represented
4246 | exactly. However, if the abstract value is too large, the overflow and
4247 | inexact exceptions are raised and an infinity or maximal finite value is
4248 | returned. If the abstract value is too small, the input value is rounded to
4249 | a subnormal number, and the underflow and inexact exceptions are raised if
4250 | the abstract input cannot be represented exactly as a subnormal extended
4251 | double-precision floating-point number.
4252 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
4253 | number of bits as single or double precision, respectively. Otherwise, the
4254 | result is rounded to the full precision of the extended double-precision
4256 | The input significand must be normalized or smaller. If the input
4257 | significand is not normalized, `zExp' must be 0; in that case, the result
4258 | returned is a subnormal number, and it must not require rounding. The
4259 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4260 | Floating-Point Arithmetic.
4261 *----------------------------------------------------------------------------*/
4263 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, bool zSign
,
4264 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
4265 float_status
*status
)
4267 int8_t roundingMode
;
4268 bool roundNearestEven
, increment
, isTiny
;
4269 int64_t roundIncrement
, roundMask
, roundBits
;
4271 roundingMode
= status
->float_rounding_mode
;
4272 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4273 if ( roundingPrecision
== 80 ) goto precision80
;
4274 if ( roundingPrecision
== 64 ) {
4275 roundIncrement
= UINT64_C(0x0000000000000400);
4276 roundMask
= UINT64_C(0x00000000000007FF);
4278 else if ( roundingPrecision
== 32 ) {
4279 roundIncrement
= UINT64_C(0x0000008000000000);
4280 roundMask
= UINT64_C(0x000000FFFFFFFFFF);
4285 zSig0
|= ( zSig1
!= 0 );
4286 switch (roundingMode
) {
4287 case float_round_nearest_even
:
4288 case float_round_ties_away
:
4290 case float_round_to_zero
:
4293 case float_round_up
:
4294 roundIncrement
= zSign
? 0 : roundMask
;
4296 case float_round_down
:
4297 roundIncrement
= zSign
? roundMask
: 0;
4302 roundBits
= zSig0
& roundMask
;
4303 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4304 if ( ( 0x7FFE < zExp
)
4305 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
4310 if (status
->flush_to_zero
) {
4311 float_raise(float_flag_output_denormal
, status
);
4312 return packFloatx80(zSign
, 0, 0);
4314 isTiny
= status
->tininess_before_rounding
4316 || (zSig0
<= zSig0
+ roundIncrement
);
4317 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
4319 roundBits
= zSig0
& roundMask
;
4320 if (isTiny
&& roundBits
) {
4321 float_raise(float_flag_underflow
, status
);
4324 status
->float_exception_flags
|= float_flag_inexact
;
4326 zSig0
+= roundIncrement
;
4327 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4328 roundIncrement
= roundMask
+ 1;
4329 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4330 roundMask
|= roundIncrement
;
4332 zSig0
&= ~ roundMask
;
4333 return packFloatx80( zSign
, zExp
, zSig0
);
4337 status
->float_exception_flags
|= float_flag_inexact
;
4339 zSig0
+= roundIncrement
;
4340 if ( zSig0
< roundIncrement
) {
4342 zSig0
= UINT64_C(0x8000000000000000);
4344 roundIncrement
= roundMask
+ 1;
4345 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
4346 roundMask
|= roundIncrement
;
4348 zSig0
&= ~ roundMask
;
4349 if ( zSig0
== 0 ) zExp
= 0;
4350 return packFloatx80( zSign
, zExp
, zSig0
);
4352 switch (roundingMode
) {
4353 case float_round_nearest_even
:
4354 case float_round_ties_away
:
4355 increment
= ((int64_t)zSig1
< 0);
4357 case float_round_to_zero
:
4360 case float_round_up
:
4361 increment
= !zSign
&& zSig1
;
4363 case float_round_down
:
4364 increment
= zSign
&& zSig1
;
4369 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
4370 if ( ( 0x7FFE < zExp
)
4371 || ( ( zExp
== 0x7FFE )
4372 && ( zSig0
== UINT64_C(0xFFFFFFFFFFFFFFFF) )
4378 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4379 if ( ( roundingMode
== float_round_to_zero
)
4380 || ( zSign
&& ( roundingMode
== float_round_up
) )
4381 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4383 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
4385 return packFloatx80(zSign
,
4386 floatx80_infinity_high
,
4387 floatx80_infinity_low
);
4390 isTiny
= status
->tininess_before_rounding
4393 || (zSig0
< UINT64_C(0xFFFFFFFFFFFFFFFF));
4394 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
4396 if (isTiny
&& zSig1
) {
4397 float_raise(float_flag_underflow
, status
);
4400 status
->float_exception_flags
|= float_flag_inexact
;
4402 switch (roundingMode
) {
4403 case float_round_nearest_even
:
4404 case float_round_ties_away
:
4405 increment
= ((int64_t)zSig1
< 0);
4407 case float_round_to_zero
:
4410 case float_round_up
:
4411 increment
= !zSign
&& zSig1
;
4413 case float_round_down
:
4414 increment
= zSign
&& zSig1
;
4421 if (!(zSig1
<< 1) && roundNearestEven
) {
4424 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
4426 return packFloatx80( zSign
, zExp
, zSig0
);
4430 status
->float_exception_flags
|= float_flag_inexact
;
4436 zSig0
= UINT64_C(0x8000000000000000);
4439 if (!(zSig1
<< 1) && roundNearestEven
) {
4445 if ( zSig0
== 0 ) zExp
= 0;
4447 return packFloatx80( zSign
, zExp
, zSig0
);
4451 /*----------------------------------------------------------------------------
4452 | Takes an abstract floating-point value having sign `zSign', exponent
4453 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4454 | and returns the proper extended double-precision floating-point value
4455 | corresponding to the abstract input. This routine is just like
4456 | `roundAndPackFloatx80' except that the input significand does not have to be
4458 *----------------------------------------------------------------------------*/
4460 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
4461 bool zSign
, int32_t zExp
,
4462 uint64_t zSig0
, uint64_t zSig1
,
4463 float_status
*status
)
4472 shiftCount
= clz64(zSig0
);
4473 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4475 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
4476 zSig0
, zSig1
, status
);
4480 /*----------------------------------------------------------------------------
4481 | Returns the least-significant 64 fraction bits of the quadruple-precision
4482 | floating-point value `a'.
4483 *----------------------------------------------------------------------------*/
4485 static inline uint64_t extractFloat128Frac1( float128 a
)
4492 /*----------------------------------------------------------------------------
4493 | Returns the most-significant 48 fraction bits of the quadruple-precision
4494 | floating-point value `a'.
4495 *----------------------------------------------------------------------------*/
4497 static inline uint64_t extractFloat128Frac0( float128 a
)
4500 return a
.high
& UINT64_C(0x0000FFFFFFFFFFFF);
4504 /*----------------------------------------------------------------------------
4505 | Returns the exponent bits of the quadruple-precision floating-point value
4507 *----------------------------------------------------------------------------*/
4509 static inline int32_t extractFloat128Exp( float128 a
)
4512 return ( a
.high
>>48 ) & 0x7FFF;
4516 /*----------------------------------------------------------------------------
4517 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4518 *----------------------------------------------------------------------------*/
4520 static inline bool extractFloat128Sign(float128 a
)
4522 return a
.high
>> 63;
4525 /*----------------------------------------------------------------------------
4526 | Normalizes the subnormal quadruple-precision floating-point value
4527 | represented by the denormalized significand formed by the concatenation of
4528 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4529 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4530 | significand are stored at the location pointed to by `zSig0Ptr', and the
4531 | least significant 64 bits of the normalized significand are stored at the
4532 | location pointed to by `zSig1Ptr'.
4533 *----------------------------------------------------------------------------*/
4536 normalizeFloat128Subnormal(
4547 shiftCount
= clz64(aSig1
) - 15;
4548 if ( shiftCount
< 0 ) {
4549 *zSig0Ptr
= aSig1
>>( - shiftCount
);
4550 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
4553 *zSig0Ptr
= aSig1
<<shiftCount
;
4556 *zExpPtr
= - shiftCount
- 63;
4559 shiftCount
= clz64(aSig0
) - 15;
4560 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
4561 *zExpPtr
= 1 - shiftCount
;
4566 /*----------------------------------------------------------------------------
4567 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4568 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4569 | floating-point value, returning the result. After being shifted into the
4570 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4571 | added together to form the most significant 32 bits of the result. This
4572 | means that any integer portion of `zSig0' will be added into the exponent.
4573 | Since a properly normalized significand will have an integer portion equal
4574 | to 1, the `zExp' input should be 1 less than the desired result exponent
4575 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4577 *----------------------------------------------------------------------------*/
4579 static inline float128
4580 packFloat128(bool zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
4585 z
.high
= ((uint64_t)zSign
<< 63) + ((uint64_t)zExp
<< 48) + zSig0
;
4589 /*----------------------------------------------------------------------------
4590 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4591 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4592 | and `zSig2', and returns the proper quadruple-precision floating-point value
4593 | corresponding to the abstract input. Ordinarily, the abstract value is
4594 | simply rounded and packed into the quadruple-precision format, with the
4595 | inexact exception raised if the abstract input cannot be represented
4596 | exactly. However, if the abstract value is too large, the overflow and
4597 | inexact exceptions are raised and an infinity or maximal finite value is
4598 | returned. If the abstract value is too small, the input value is rounded to
4599 | a subnormal number, and the underflow and inexact exceptions are raised if
4600 | the abstract input cannot be represented exactly as a subnormal quadruple-
4601 | precision floating-point number.
4602 | The input significand must be normalized or smaller. If the input
4603 | significand is not normalized, `zExp' must be 0; in that case, the result
4604 | returned is a subnormal number, and it must not require rounding. In the
4605 | usual case that the input significand is normalized, `zExp' must be 1 less
4606 | than the ``true'' floating-point exponent. The handling of underflow and
4607 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4608 *----------------------------------------------------------------------------*/
4610 static float128
roundAndPackFloat128(bool zSign
, int32_t zExp
,
4611 uint64_t zSig0
, uint64_t zSig1
,
4612 uint64_t zSig2
, float_status
*status
)
4614 int8_t roundingMode
;
4615 bool roundNearestEven
, increment
, isTiny
;
4617 roundingMode
= status
->float_rounding_mode
;
4618 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
4619 switch (roundingMode
) {
4620 case float_round_nearest_even
:
4621 case float_round_ties_away
:
4622 increment
= ((int64_t)zSig2
< 0);
4624 case float_round_to_zero
:
4627 case float_round_up
:
4628 increment
= !zSign
&& zSig2
;
4630 case float_round_down
:
4631 increment
= zSign
&& zSig2
;
4633 case float_round_to_odd
:
4634 increment
= !(zSig1
& 0x1) && zSig2
;
4639 if ( 0x7FFD <= (uint32_t) zExp
) {
4640 if ( ( 0x7FFD < zExp
)
4641 || ( ( zExp
== 0x7FFD )
4643 UINT64_C(0x0001FFFFFFFFFFFF),
4644 UINT64_C(0xFFFFFFFFFFFFFFFF),
4651 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
4652 if ( ( roundingMode
== float_round_to_zero
)
4653 || ( zSign
&& ( roundingMode
== float_round_up
) )
4654 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
4655 || (roundingMode
== float_round_to_odd
)
4661 UINT64_C(0x0000FFFFFFFFFFFF),
4662 UINT64_C(0xFFFFFFFFFFFFFFFF)
4665 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4668 if (status
->flush_to_zero
) {
4669 float_raise(float_flag_output_denormal
, status
);
4670 return packFloat128(zSign
, 0, 0, 0);
4672 isTiny
= status
->tininess_before_rounding
4675 || lt128(zSig0
, zSig1
,
4676 UINT64_C(0x0001FFFFFFFFFFFF),
4677 UINT64_C(0xFFFFFFFFFFFFFFFF));
4678 shift128ExtraRightJamming(
4679 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
4681 if (isTiny
&& zSig2
) {
4682 float_raise(float_flag_underflow
, status
);
4684 switch (roundingMode
) {
4685 case float_round_nearest_even
:
4686 case float_round_ties_away
:
4687 increment
= ((int64_t)zSig2
< 0);
4689 case float_round_to_zero
:
4692 case float_round_up
:
4693 increment
= !zSign
&& zSig2
;
4695 case float_round_down
:
4696 increment
= zSign
&& zSig2
;
4698 case float_round_to_odd
:
4699 increment
= !(zSig1
& 0x1) && zSig2
;
4707 status
->float_exception_flags
|= float_flag_inexact
;
4710 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
4711 if ((zSig2
+ zSig2
== 0) && roundNearestEven
) {
4716 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
4718 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4722 /*----------------------------------------------------------------------------
4723 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4724 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4725 | returns the proper quadruple-precision floating-point value corresponding
4726 | to the abstract input. This routine is just like `roundAndPackFloat128'
4727 | except that the input significand has fewer bits and does not have to be
4728 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4730 *----------------------------------------------------------------------------*/
4732 static float128
normalizeRoundAndPackFloat128(bool zSign
, int32_t zExp
,
4733 uint64_t zSig0
, uint64_t zSig1
,
4734 float_status
*status
)
4744 shiftCount
= clz64(zSig0
) - 15;
4745 if ( 0 <= shiftCount
) {
4747 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4750 shift128ExtraRightJamming(
4751 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
4754 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
4759 /*----------------------------------------------------------------------------
4760 | Returns the result of converting the 32-bit two's complement integer `a'
4761 | to the extended double-precision floating-point format. The conversion
4762 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4764 *----------------------------------------------------------------------------*/
4766 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
4773 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4775 absA
= zSign
? - a
: a
;
4776 shiftCount
= clz32(absA
) + 32;
4778 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
4782 /*----------------------------------------------------------------------------
4783 | Returns the result of converting the 32-bit two's complement integer `a' to
4784 | the quadruple-precision floating-point format. The conversion is performed
4785 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4786 *----------------------------------------------------------------------------*/
4788 float128
int32_to_float128(int32_t a
, float_status
*status
)
4795 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4797 absA
= zSign
? - a
: a
;
4798 shiftCount
= clz32(absA
) + 17;
4800 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
4804 /*----------------------------------------------------------------------------
4805 | Returns the result of converting the 64-bit two's complement integer `a'
4806 | to the extended double-precision floating-point format. The conversion
4807 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4809 *----------------------------------------------------------------------------*/
4811 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
4817 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
4819 absA
= zSign
? - a
: a
;
4820 shiftCount
= clz64(absA
);
4821 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
4825 /*----------------------------------------------------------------------------
4826 | Returns the result of converting the 64-bit two's complement integer `a' to
4827 | the quadruple-precision floating-point format. The conversion is performed
4828 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4829 *----------------------------------------------------------------------------*/
4831 float128
int64_to_float128(int64_t a
, float_status
*status
)
4837 uint64_t zSig0
, zSig1
;
4839 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
4841 absA
= zSign
? - a
: a
;
4842 shiftCount
= clz64(absA
) + 49;
4843 zExp
= 0x406E - shiftCount
;
4844 if ( 64 <= shiftCount
) {
4853 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
4854 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
4858 /*----------------------------------------------------------------------------
4859 | Returns the result of converting the 64-bit unsigned integer `a'
4860 | to the quadruple-precision floating-point format. The conversion is performed
4861 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4862 *----------------------------------------------------------------------------*/
4864 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
4867 return float128_zero
;
4869 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
4872 /*----------------------------------------------------------------------------
4873 | Returns the result of converting the single-precision floating-point value
4874 | `a' to the extended double-precision floating-point format. The conversion
4875 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4877 *----------------------------------------------------------------------------*/
4879 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
4885 a
= float32_squash_input_denormal(a
, status
);
4886 aSig
= extractFloat32Frac( a
);
4887 aExp
= extractFloat32Exp( a
);
4888 aSign
= extractFloat32Sign( a
);
4889 if ( aExp
== 0xFF ) {
4891 floatx80 res
= commonNaNToFloatx80(float32ToCommonNaN(a
, status
),
4893 return floatx80_silence_nan(res
, status
);
4895 return packFloatx80(aSign
,
4896 floatx80_infinity_high
,
4897 floatx80_infinity_low
);
4900 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4901 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4904 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
4908 /*----------------------------------------------------------------------------
4909 | Returns the result of converting the single-precision floating-point value
4910 | `a' to the double-precision floating-point format. The conversion is
4911 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4913 *----------------------------------------------------------------------------*/
4915 float128
float32_to_float128(float32 a
, float_status
*status
)
4921 a
= float32_squash_input_denormal(a
, status
);
4922 aSig
= extractFloat32Frac( a
);
4923 aExp
= extractFloat32Exp( a
);
4924 aSign
= extractFloat32Sign( a
);
4925 if ( aExp
== 0xFF ) {
4927 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
4929 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4932 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4933 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4936 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
4940 /*----------------------------------------------------------------------------
4941 | Returns the remainder of the single-precision floating-point value `a'
4942 | with respect to the corresponding value `b'. The operation is performed
4943 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4944 *----------------------------------------------------------------------------*/
4946 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
4949 int aExp
, bExp
, expDiff
;
4950 uint32_t aSig
, bSig
;
4952 uint64_t aSig64
, bSig64
, q64
;
4953 uint32_t alternateASig
;
4955 a
= float32_squash_input_denormal(a
, status
);
4956 b
= float32_squash_input_denormal(b
, status
);
4958 aSig
= extractFloat32Frac( a
);
4959 aExp
= extractFloat32Exp( a
);
4960 aSign
= extractFloat32Sign( a
);
4961 bSig
= extractFloat32Frac( b
);
4962 bExp
= extractFloat32Exp( b
);
4963 if ( aExp
== 0xFF ) {
4964 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
4965 return propagateFloat32NaN(a
, b
, status
);
4967 float_raise(float_flag_invalid
, status
);
4968 return float32_default_nan(status
);
4970 if ( bExp
== 0xFF ) {
4972 return propagateFloat32NaN(a
, b
, status
);
4978 float_raise(float_flag_invalid
, status
);
4979 return float32_default_nan(status
);
4981 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
4984 if ( aSig
== 0 ) return a
;
4985 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
4987 expDiff
= aExp
- bExp
;
4990 if ( expDiff
< 32 ) {
4993 if ( expDiff
< 0 ) {
4994 if ( expDiff
< -1 ) return a
;
4997 q
= ( bSig
<= aSig
);
4998 if ( q
) aSig
-= bSig
;
4999 if ( 0 < expDiff
) {
5000 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
5003 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5011 if ( bSig
<= aSig
) aSig
-= bSig
;
5012 aSig64
= ( (uint64_t) aSig
)<<40;
5013 bSig64
= ( (uint64_t) bSig
)<<40;
5015 while ( 0 < expDiff
) {
5016 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5017 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5018 aSig64
= - ( ( bSig
* q64
)<<38 );
5022 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
5023 q64
= ( 2 < q64
) ? q64
- 2 : 0;
5024 q
= q64
>>( 64 - expDiff
);
5026 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
5029 alternateASig
= aSig
;
5032 } while ( 0 <= (int32_t) aSig
);
5033 sigMean
= aSig
+ alternateASig
;
5034 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5035 aSig
= alternateASig
;
5037 zSign
= ( (int32_t) aSig
< 0 );
5038 if ( zSign
) aSig
= - aSig
;
5039 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
5044 /*----------------------------------------------------------------------------
5045 | Returns the binary exponential of the single-precision floating-point value
5046 | `a'. The operation is performed according to the IEC/IEEE Standard for
5047 | Binary Floating-Point Arithmetic.
5049 | Uses the following identities:
5051 | 1. -------------------------------------------------------------------------
5055 | 2. -------------------------------------------------------------------------
5058 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5060 *----------------------------------------------------------------------------*/
5062 static const float64 float32_exp2_coefficients
[15] =
5064 const_float64( 0x3ff0000000000000ll
), /* 1 */
5065 const_float64( 0x3fe0000000000000ll
), /* 2 */
5066 const_float64( 0x3fc5555555555555ll
), /* 3 */
5067 const_float64( 0x3fa5555555555555ll
), /* 4 */
5068 const_float64( 0x3f81111111111111ll
), /* 5 */
5069 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
5070 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
5071 const_float64( 0x3efa01a01a01a01all
), /* 8 */
5072 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
5073 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
5074 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
5075 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
5076 const_float64( 0x3de6124613a86d09ll
), /* 13 */
5077 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
5078 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
5081 float32
float32_exp2(float32 a
, float_status
*status
)
5088 a
= float32_squash_input_denormal(a
, status
);
5090 aSig
= extractFloat32Frac( a
);
5091 aExp
= extractFloat32Exp( a
);
5092 aSign
= extractFloat32Sign( a
);
5094 if ( aExp
== 0xFF) {
5096 return propagateFloat32NaN(a
, float32_zero
, status
);
5098 return (aSign
) ? float32_zero
: a
;
5101 if (aSig
== 0) return float32_one
;
5104 float_raise(float_flag_inexact
, status
);
5106 /* ******************************* */
5107 /* using float64 for approximation */
5108 /* ******************************* */
5109 x
= float32_to_float64(a
, status
);
5110 x
= float64_mul(x
, float64_ln2
, status
);
5114 for (i
= 0 ; i
< 15 ; i
++) {
5117 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
5118 r
= float64_add(r
, f
, status
);
5120 xn
= float64_mul(xn
, x
, status
);
5123 return float64_to_float32(r
, status
);
5126 /*----------------------------------------------------------------------------
5127 | Returns the binary log of the single-precision floating-point value `a'.
5128 | The operation is performed according to the IEC/IEEE Standard for Binary
5129 | Floating-Point Arithmetic.
5130 *----------------------------------------------------------------------------*/
5131 float32
float32_log2(float32 a
, float_status
*status
)
5135 uint32_t aSig
, zSig
, i
;
5137 a
= float32_squash_input_denormal(a
, status
);
5138 aSig
= extractFloat32Frac( a
);
5139 aExp
= extractFloat32Exp( a
);
5140 aSign
= extractFloat32Sign( a
);
5143 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
5144 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
5147 float_raise(float_flag_invalid
, status
);
5148 return float32_default_nan(status
);
5150 if ( aExp
== 0xFF ) {
5152 return propagateFloat32NaN(a
, float32_zero
, status
);
5162 for (i
= 1 << 22; i
> 0; i
>>= 1) {
5163 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
5164 if ( aSig
& 0x01000000 ) {
5173 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
5176 /*----------------------------------------------------------------------------
5177 | Returns the result of converting the double-precision floating-point value
5178 | `a' to the extended double-precision floating-point format. The conversion
5179 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5181 *----------------------------------------------------------------------------*/
5183 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
5189 a
= float64_squash_input_denormal(a
, status
);
5190 aSig
= extractFloat64Frac( a
);
5191 aExp
= extractFloat64Exp( a
);
5192 aSign
= extractFloat64Sign( a
);
5193 if ( aExp
== 0x7FF ) {
5195 floatx80 res
= commonNaNToFloatx80(float64ToCommonNaN(a
, status
),
5197 return floatx80_silence_nan(res
, status
);
5199 return packFloatx80(aSign
,
5200 floatx80_infinity_high
,
5201 floatx80_infinity_low
);
5204 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
5205 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5209 aSign
, aExp
+ 0x3C00, (aSig
| UINT64_C(0x0010000000000000)) << 11);
5213 /*----------------------------------------------------------------------------
5214 | Returns the result of converting the double-precision floating-point value
5215 | `a' to the quadruple-precision floating-point format. The conversion is
5216 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5218 *----------------------------------------------------------------------------*/
5220 float128
float64_to_float128(float64 a
, float_status
*status
)
5224 uint64_t aSig
, zSig0
, zSig1
;
5226 a
= float64_squash_input_denormal(a
, status
);
5227 aSig
= extractFloat64Frac( a
);
5228 aExp
= extractFloat64Exp( a
);
5229 aSign
= extractFloat64Sign( a
);
5230 if ( aExp
== 0x7FF ) {
5232 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
5234 return packFloat128( aSign
, 0x7FFF, 0, 0 );
5237 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
5238 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5241 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
5242 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
5247 /*----------------------------------------------------------------------------
5248 | Returns the remainder of the double-precision floating-point value `a'
5249 | with respect to the corresponding value `b'. The operation is performed
5250 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5251 *----------------------------------------------------------------------------*/
5253 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
5256 int aExp
, bExp
, expDiff
;
5257 uint64_t aSig
, bSig
;
5258 uint64_t q
, alternateASig
;
5261 a
= float64_squash_input_denormal(a
, status
);
5262 b
= float64_squash_input_denormal(b
, status
);
5263 aSig
= extractFloat64Frac( a
);
5264 aExp
= extractFloat64Exp( a
);
5265 aSign
= extractFloat64Sign( a
);
5266 bSig
= extractFloat64Frac( b
);
5267 bExp
= extractFloat64Exp( b
);
5268 if ( aExp
== 0x7FF ) {
5269 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
5270 return propagateFloat64NaN(a
, b
, status
);
5272 float_raise(float_flag_invalid
, status
);
5273 return float64_default_nan(status
);
5275 if ( bExp
== 0x7FF ) {
5277 return propagateFloat64NaN(a
, b
, status
);
5283 float_raise(float_flag_invalid
, status
);
5284 return float64_default_nan(status
);
5286 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
5289 if ( aSig
== 0 ) return a
;
5290 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5292 expDiff
= aExp
- bExp
;
5293 aSig
= (aSig
| UINT64_C(0x0010000000000000)) << 11;
5294 bSig
= (bSig
| UINT64_C(0x0010000000000000)) << 11;
5295 if ( expDiff
< 0 ) {
5296 if ( expDiff
< -1 ) return a
;
5299 q
= ( bSig
<= aSig
);
5300 if ( q
) aSig
-= bSig
;
5302 while ( 0 < expDiff
) {
5303 q
= estimateDiv128To64( aSig
, 0, bSig
);
5304 q
= ( 2 < q
) ? q
- 2 : 0;
5305 aSig
= - ( ( bSig
>>2 ) * q
);
5309 if ( 0 < expDiff
) {
5310 q
= estimateDiv128To64( aSig
, 0, bSig
);
5311 q
= ( 2 < q
) ? q
- 2 : 0;
5314 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
5321 alternateASig
= aSig
;
5324 } while ( 0 <= (int64_t) aSig
);
5325 sigMean
= aSig
+ alternateASig
;
5326 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
5327 aSig
= alternateASig
;
5329 zSign
= ( (int64_t) aSig
< 0 );
5330 if ( zSign
) aSig
= - aSig
;
5331 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
5335 /*----------------------------------------------------------------------------
5336 | Returns the binary log of the double-precision floating-point value `a'.
5337 | The operation is performed according to the IEC/IEEE Standard for Binary
5338 | Floating-Point Arithmetic.
5339 *----------------------------------------------------------------------------*/
5340 float64
float64_log2(float64 a
, float_status
*status
)
5344 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
5345 a
= float64_squash_input_denormal(a
, status
);
5347 aSig
= extractFloat64Frac( a
);
5348 aExp
= extractFloat64Exp( a
);
5349 aSign
= extractFloat64Sign( a
);
5352 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
5353 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
5356 float_raise(float_flag_invalid
, status
);
5357 return float64_default_nan(status
);
5359 if ( aExp
== 0x7FF ) {
5361 return propagateFloat64NaN(a
, float64_zero
, status
);
5367 aSig
|= UINT64_C(0x0010000000000000);
5369 zSig
= (uint64_t)aExp
<< 52;
5370 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
5371 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
5372 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
5373 if ( aSig
& UINT64_C(0x0020000000000000) ) {
5381 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
5384 /*----------------------------------------------------------------------------
5385 | Returns the result of converting the extended double-precision floating-
5386 | point value `a' to the 32-bit two's complement integer format. The
5387 | conversion is performed according to the IEC/IEEE Standard for Binary
5388 | Floating-Point Arithmetic---which means in particular that the conversion
5389 | is rounded according to the current rounding mode. If `a' is a NaN, the
5390 | largest positive integer is returned. Otherwise, if the conversion
5391 | overflows, the largest integer with the same sign as `a' is returned.
5392 *----------------------------------------------------------------------------*/
5394 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
5397 int32_t aExp
, shiftCount
;
5400 if (floatx80_invalid_encoding(a
)) {
5401 float_raise(float_flag_invalid
, status
);
5404 aSig
= extractFloatx80Frac( a
);
5405 aExp
= extractFloatx80Exp( a
);
5406 aSign
= extractFloatx80Sign( a
);
5407 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5408 shiftCount
= 0x4037 - aExp
;
5409 if ( shiftCount
<= 0 ) shiftCount
= 1;
5410 shift64RightJamming( aSig
, shiftCount
, &aSig
);
5411 return roundAndPackInt32(aSign
, aSig
, status
);
5415 /*----------------------------------------------------------------------------
5416 | Returns the result of converting the extended double-precision floating-
5417 | point value `a' to the 32-bit two's complement integer format. The
5418 | conversion is performed according to the IEC/IEEE Standard for Binary
5419 | Floating-Point Arithmetic, except that the conversion is always rounded
5420 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5421 | Otherwise, if the conversion overflows, the largest integer with the same
5422 | sign as `a' is returned.
5423 *----------------------------------------------------------------------------*/
5425 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
5428 int32_t aExp
, shiftCount
;
5429 uint64_t aSig
, savedASig
;
5432 if (floatx80_invalid_encoding(a
)) {
5433 float_raise(float_flag_invalid
, status
);
5436 aSig
= extractFloatx80Frac( a
);
5437 aExp
= extractFloatx80Exp( a
);
5438 aSign
= extractFloatx80Sign( a
);
5439 if ( 0x401E < aExp
) {
5440 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
5443 else if ( aExp
< 0x3FFF ) {
5445 status
->float_exception_flags
|= float_flag_inexact
;
5449 shiftCount
= 0x403E - aExp
;
5451 aSig
>>= shiftCount
;
5453 if ( aSign
) z
= - z
;
5454 if ( ( z
< 0 ) ^ aSign
) {
5456 float_raise(float_flag_invalid
, status
);
5457 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5459 if ( ( aSig
<<shiftCount
) != savedASig
) {
5460 status
->float_exception_flags
|= float_flag_inexact
;
5466 /*----------------------------------------------------------------------------
5467 | Returns the result of converting the extended double-precision floating-
5468 | point value `a' to the 64-bit two's complement integer format. The
5469 | conversion is performed according to the IEC/IEEE Standard for Binary
5470 | Floating-Point Arithmetic---which means in particular that the conversion
5471 | is rounded according to the current rounding mode. If `a' is a NaN,
5472 | the largest positive integer is returned. Otherwise, if the conversion
5473 | overflows, the largest integer with the same sign as `a' is returned.
5474 *----------------------------------------------------------------------------*/
5476 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
5479 int32_t aExp
, shiftCount
;
5480 uint64_t aSig
, aSigExtra
;
5482 if (floatx80_invalid_encoding(a
)) {
5483 float_raise(float_flag_invalid
, status
);
5486 aSig
= extractFloatx80Frac( a
);
5487 aExp
= extractFloatx80Exp( a
);
5488 aSign
= extractFloatx80Sign( a
);
5489 shiftCount
= 0x403E - aExp
;
5490 if ( shiftCount
<= 0 ) {
5492 float_raise(float_flag_invalid
, status
);
5493 if (!aSign
|| floatx80_is_any_nan(a
)) {
5501 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
5503 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
5507 /*----------------------------------------------------------------------------
5508 | Returns the result of converting the extended double-precision floating-
5509 | point value `a' to the 64-bit two's complement integer format. The
5510 | conversion is performed according to the IEC/IEEE Standard for Binary
5511 | Floating-Point Arithmetic, except that the conversion is always rounded
5512 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5513 | Otherwise, if the conversion overflows, the largest integer with the same
5514 | sign as `a' is returned.
5515 *----------------------------------------------------------------------------*/
5517 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
5520 int32_t aExp
, shiftCount
;
5524 if (floatx80_invalid_encoding(a
)) {
5525 float_raise(float_flag_invalid
, status
);
5528 aSig
= extractFloatx80Frac( a
);
5529 aExp
= extractFloatx80Exp( a
);
5530 aSign
= extractFloatx80Sign( a
);
5531 shiftCount
= aExp
- 0x403E;
5532 if ( 0 <= shiftCount
) {
5533 aSig
&= UINT64_C(0x7FFFFFFFFFFFFFFF);
5534 if ( ( a
.high
!= 0xC03E ) || aSig
) {
5535 float_raise(float_flag_invalid
, status
);
5536 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
5542 else if ( aExp
< 0x3FFF ) {
5544 status
->float_exception_flags
|= float_flag_inexact
;
5548 z
= aSig
>>( - shiftCount
);
5549 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
5550 status
->float_exception_flags
|= float_flag_inexact
;
5552 if ( aSign
) z
= - z
;
5557 /*----------------------------------------------------------------------------
5558 | Returns the result of converting the extended double-precision floating-
5559 | point value `a' to the single-precision floating-point format. The
5560 | conversion is performed according to the IEC/IEEE Standard for Binary
5561 | Floating-Point Arithmetic.
5562 *----------------------------------------------------------------------------*/
5564 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
5570 if (floatx80_invalid_encoding(a
)) {
5571 float_raise(float_flag_invalid
, status
);
5572 return float32_default_nan(status
);
5574 aSig
= extractFloatx80Frac( a
);
5575 aExp
= extractFloatx80Exp( a
);
5576 aSign
= extractFloatx80Sign( a
);
5577 if ( aExp
== 0x7FFF ) {
5578 if ( (uint64_t) ( aSig
<<1 ) ) {
5579 float32 res
= commonNaNToFloat32(floatx80ToCommonNaN(a
, status
),
5581 return float32_silence_nan(res
, status
);
5583 return packFloat32( aSign
, 0xFF, 0 );
5585 shift64RightJamming( aSig
, 33, &aSig
);
5586 if ( aExp
|| aSig
) aExp
-= 0x3F81;
5587 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
5591 /*----------------------------------------------------------------------------
5592 | Returns the result of converting the extended double-precision floating-
5593 | point value `a' to the double-precision floating-point format. The
5594 | conversion is performed according to the IEC/IEEE Standard for Binary
5595 | Floating-Point Arithmetic.
5596 *----------------------------------------------------------------------------*/
5598 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
5602 uint64_t aSig
, zSig
;
5604 if (floatx80_invalid_encoding(a
)) {
5605 float_raise(float_flag_invalid
, status
);
5606 return float64_default_nan(status
);
5608 aSig
= extractFloatx80Frac( a
);
5609 aExp
= extractFloatx80Exp( a
);
5610 aSign
= extractFloatx80Sign( a
);
5611 if ( aExp
== 0x7FFF ) {
5612 if ( (uint64_t) ( aSig
<<1 ) ) {
5613 float64 res
= commonNaNToFloat64(floatx80ToCommonNaN(a
, status
),
5615 return float64_silence_nan(res
, status
);
5617 return packFloat64( aSign
, 0x7FF, 0 );
5619 shift64RightJamming( aSig
, 1, &zSig
);
5620 if ( aExp
|| aSig
) aExp
-= 0x3C01;
5621 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
5625 /*----------------------------------------------------------------------------
5626 | Returns the result of converting the extended double-precision floating-
5627 | point value `a' to the quadruple-precision floating-point format. The
5628 | conversion is performed according to the IEC/IEEE Standard for Binary
5629 | Floating-Point Arithmetic.
5630 *----------------------------------------------------------------------------*/
5632 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
5636 uint64_t aSig
, zSig0
, zSig1
;
5638 if (floatx80_invalid_encoding(a
)) {
5639 float_raise(float_flag_invalid
, status
);
5640 return float128_default_nan(status
);
5642 aSig
= extractFloatx80Frac( a
);
5643 aExp
= extractFloatx80Exp( a
);
5644 aSign
= extractFloatx80Sign( a
);
5645 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
5646 float128 res
= commonNaNToFloat128(floatx80ToCommonNaN(a
, status
),
5648 return float128_silence_nan(res
, status
);
5650 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
5651 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
5655 /*----------------------------------------------------------------------------
5656 | Rounds the extended double-precision floating-point value `a'
5657 | to the precision provided by floatx80_rounding_precision and returns the
5658 | result as an extended double-precision floating-point value.
5659 | The operation is performed according to the IEC/IEEE Standard for Binary
5660 | Floating-Point Arithmetic.
5661 *----------------------------------------------------------------------------*/
5663 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
5665 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5666 extractFloatx80Sign(a
),
5667 extractFloatx80Exp(a
),
5668 extractFloatx80Frac(a
), 0, status
);
5671 /*----------------------------------------------------------------------------
5672 | Rounds the extended double-precision floating-point value `a' to an integer,
5673 | and returns the result as an extended quadruple-precision floating-point
5674 | value. The operation is performed according to the IEC/IEEE Standard for
5675 | Binary Floating-Point Arithmetic.
5676 *----------------------------------------------------------------------------*/
5678 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
5682 uint64_t lastBitMask
, roundBitsMask
;
5685 if (floatx80_invalid_encoding(a
)) {
5686 float_raise(float_flag_invalid
, status
);
5687 return floatx80_default_nan(status
);
5689 aExp
= extractFloatx80Exp( a
);
5690 if ( 0x403E <= aExp
) {
5691 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
5692 return propagateFloatx80NaN(a
, a
, status
);
5696 if ( aExp
< 0x3FFF ) {
5698 && ( (uint64_t) ( extractFloatx80Frac( a
) ) == 0 ) ) {
5701 status
->float_exception_flags
|= float_flag_inexact
;
5702 aSign
= extractFloatx80Sign( a
);
5703 switch (status
->float_rounding_mode
) {
5704 case float_round_nearest_even
:
5705 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
5708 packFloatx80( aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5711 case float_round_ties_away
:
5712 if (aExp
== 0x3FFE) {
5713 return packFloatx80(aSign
, 0x3FFF, UINT64_C(0x8000000000000000));
5716 case float_round_down
:
5719 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5720 : packFloatx80( 0, 0, 0 );
5721 case float_round_up
:
5723 aSign
? packFloatx80( 1, 0, 0 )
5724 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5726 case float_round_to_zero
:
5729 g_assert_not_reached();
5731 return packFloatx80( aSign
, 0, 0 );
5734 lastBitMask
<<= 0x403E - aExp
;
5735 roundBitsMask
= lastBitMask
- 1;
5737 switch (status
->float_rounding_mode
) {
5738 case float_round_nearest_even
:
5739 z
.low
+= lastBitMask
>>1;
5740 if ((z
.low
& roundBitsMask
) == 0) {
5741 z
.low
&= ~lastBitMask
;
5744 case float_round_ties_away
:
5745 z
.low
+= lastBitMask
>> 1;
5747 case float_round_to_zero
:
5749 case float_round_up
:
5750 if (!extractFloatx80Sign(z
)) {
5751 z
.low
+= roundBitsMask
;
5754 case float_round_down
:
5755 if (extractFloatx80Sign(z
)) {
5756 z
.low
+= roundBitsMask
;
5762 z
.low
&= ~ roundBitsMask
;
5765 z
.low
= UINT64_C(0x8000000000000000);
5767 if (z
.low
!= a
.low
) {
5768 status
->float_exception_flags
|= float_flag_inexact
;
5774 /*----------------------------------------------------------------------------
5775 | Returns the result of adding the absolute values of the extended double-
5776 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5777 | negated before being returned. `zSign' is ignored if the result is a NaN.
5778 | The addition is performed according to the IEC/IEEE Standard for Binary
5779 | Floating-Point Arithmetic.
5780 *----------------------------------------------------------------------------*/
5782 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5783 float_status
*status
)
5785 int32_t aExp
, bExp
, zExp
;
5786 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5789 aSig
= extractFloatx80Frac( a
);
5790 aExp
= extractFloatx80Exp( a
);
5791 bSig
= extractFloatx80Frac( b
);
5792 bExp
= extractFloatx80Exp( b
);
5793 expDiff
= aExp
- bExp
;
5794 if ( 0 < expDiff
) {
5795 if ( aExp
== 0x7FFF ) {
5796 if ((uint64_t)(aSig
<< 1)) {
5797 return propagateFloatx80NaN(a
, b
, status
);
5801 if ( bExp
== 0 ) --expDiff
;
5802 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5805 else if ( expDiff
< 0 ) {
5806 if ( bExp
== 0x7FFF ) {
5807 if ((uint64_t)(bSig
<< 1)) {
5808 return propagateFloatx80NaN(a
, b
, status
);
5810 return packFloatx80(zSign
,
5811 floatx80_infinity_high
,
5812 floatx80_infinity_low
);
5814 if ( aExp
== 0 ) ++expDiff
;
5815 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5819 if ( aExp
== 0x7FFF ) {
5820 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5821 return propagateFloatx80NaN(a
, b
, status
);
5826 zSig0
= aSig
+ bSig
;
5828 if ((aSig
| bSig
) & UINT64_C(0x8000000000000000) && zSig0
< aSig
) {
5829 /* At least one of the values is a pseudo-denormal,
5830 * and there is a carry out of the result. */
5835 return packFloatx80(zSign
, 0, 0);
5837 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5843 zSig0
= aSig
+ bSig
;
5844 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5846 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5847 zSig0
|= UINT64_C(0x8000000000000000);
5850 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5851 zSign
, zExp
, zSig0
, zSig1
, status
);
5854 /*----------------------------------------------------------------------------
5855 | Returns the result of subtracting the absolute values of the extended
5856 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5857 | difference is negated before being returned. `zSign' is ignored if the
5858 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5859 | Standard for Binary Floating-Point Arithmetic.
5860 *----------------------------------------------------------------------------*/
5862 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, bool zSign
,
5863 float_status
*status
)
5865 int32_t aExp
, bExp
, zExp
;
5866 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5869 aSig
= extractFloatx80Frac( a
);
5870 aExp
= extractFloatx80Exp( a
);
5871 bSig
= extractFloatx80Frac( b
);
5872 bExp
= extractFloatx80Exp( b
);
5873 expDiff
= aExp
- bExp
;
5874 if ( 0 < expDiff
) goto aExpBigger
;
5875 if ( expDiff
< 0 ) goto bExpBigger
;
5876 if ( aExp
== 0x7FFF ) {
5877 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5878 return propagateFloatx80NaN(a
, b
, status
);
5880 float_raise(float_flag_invalid
, status
);
5881 return floatx80_default_nan(status
);
5888 if ( bSig
< aSig
) goto aBigger
;
5889 if ( aSig
< bSig
) goto bBigger
;
5890 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
5892 if ( bExp
== 0x7FFF ) {
5893 if ((uint64_t)(bSig
<< 1)) {
5894 return propagateFloatx80NaN(a
, b
, status
);
5896 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
5897 floatx80_infinity_low
);
5899 if ( aExp
== 0 ) ++expDiff
;
5900 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5902 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5905 goto normalizeRoundAndPack
;
5907 if ( aExp
== 0x7FFF ) {
5908 if ((uint64_t)(aSig
<< 1)) {
5909 return propagateFloatx80NaN(a
, b
, status
);
5913 if ( bExp
== 0 ) --expDiff
;
5914 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5916 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5918 normalizeRoundAndPack
:
5919 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
5920 zSign
, zExp
, zSig0
, zSig1
, status
);
5923 /*----------------------------------------------------------------------------
5924 | Returns the result of adding the extended double-precision floating-point
5925 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5926 | Standard for Binary Floating-Point Arithmetic.
5927 *----------------------------------------------------------------------------*/
5929 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
5933 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5934 float_raise(float_flag_invalid
, status
);
5935 return floatx80_default_nan(status
);
5937 aSign
= extractFloatx80Sign( a
);
5938 bSign
= extractFloatx80Sign( b
);
5939 if ( aSign
== bSign
) {
5940 return addFloatx80Sigs(a
, b
, aSign
, status
);
5943 return subFloatx80Sigs(a
, b
, aSign
, status
);
5948 /*----------------------------------------------------------------------------
5949 | Returns the result of subtracting the extended double-precision floating-
5950 | point values `a' and `b'. The operation is performed according to the
5951 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5952 *----------------------------------------------------------------------------*/
5954 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5958 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5959 float_raise(float_flag_invalid
, status
);
5960 return floatx80_default_nan(status
);
5962 aSign
= extractFloatx80Sign( a
);
5963 bSign
= extractFloatx80Sign( b
);
5964 if ( aSign
== bSign
) {
5965 return subFloatx80Sigs(a
, b
, aSign
, status
);
5968 return addFloatx80Sigs(a
, b
, aSign
, status
);
5973 /*----------------------------------------------------------------------------
5974 | Returns the result of multiplying the extended double-precision floating-
5975 | point values `a' and `b'. The operation is performed according to the
5976 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5977 *----------------------------------------------------------------------------*/
5979 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5981 bool aSign
, bSign
, zSign
;
5982 int32_t aExp
, bExp
, zExp
;
5983 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5985 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5986 float_raise(float_flag_invalid
, status
);
5987 return floatx80_default_nan(status
);
5989 aSig
= extractFloatx80Frac( a
);
5990 aExp
= extractFloatx80Exp( a
);
5991 aSign
= extractFloatx80Sign( a
);
5992 bSig
= extractFloatx80Frac( b
);
5993 bExp
= extractFloatx80Exp( b
);
5994 bSign
= extractFloatx80Sign( b
);
5995 zSign
= aSign
^ bSign
;
5996 if ( aExp
== 0x7FFF ) {
5997 if ( (uint64_t) ( aSig
<<1 )
5998 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5999 return propagateFloatx80NaN(a
, b
, status
);
6001 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
6002 return packFloatx80(zSign
, floatx80_infinity_high
,
6003 floatx80_infinity_low
);
6005 if ( bExp
== 0x7FFF ) {
6006 if ((uint64_t)(bSig
<< 1)) {
6007 return propagateFloatx80NaN(a
, b
, status
);
6009 if ( ( aExp
| aSig
) == 0 ) {
6011 float_raise(float_flag_invalid
, status
);
6012 return floatx80_default_nan(status
);
6014 return packFloatx80(zSign
, floatx80_infinity_high
,
6015 floatx80_infinity_low
);
6018 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6019 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6022 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6023 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6025 zExp
= aExp
+ bExp
- 0x3FFE;
6026 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
6027 if ( 0 < (int64_t) zSig0
) {
6028 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
6031 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6032 zSign
, zExp
, zSig0
, zSig1
, status
);
6035 /*----------------------------------------------------------------------------
6036 | Returns the result of dividing the extended double-precision floating-point
6037 | value `a' by the corresponding value `b'. The operation is performed
6038 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6039 *----------------------------------------------------------------------------*/
6041 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
6043 bool aSign
, bSign
, zSign
;
6044 int32_t aExp
, bExp
, zExp
;
6045 uint64_t aSig
, bSig
, zSig0
, zSig1
;
6046 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
6048 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6049 float_raise(float_flag_invalid
, status
);
6050 return floatx80_default_nan(status
);
6052 aSig
= extractFloatx80Frac( a
);
6053 aExp
= extractFloatx80Exp( a
);
6054 aSign
= extractFloatx80Sign( a
);
6055 bSig
= extractFloatx80Frac( b
);
6056 bExp
= extractFloatx80Exp( b
);
6057 bSign
= extractFloatx80Sign( b
);
6058 zSign
= aSign
^ bSign
;
6059 if ( aExp
== 0x7FFF ) {
6060 if ((uint64_t)(aSig
<< 1)) {
6061 return propagateFloatx80NaN(a
, b
, status
);
6063 if ( bExp
== 0x7FFF ) {
6064 if ((uint64_t)(bSig
<< 1)) {
6065 return propagateFloatx80NaN(a
, b
, status
);
6069 return packFloatx80(zSign
, floatx80_infinity_high
,
6070 floatx80_infinity_low
);
6072 if ( bExp
== 0x7FFF ) {
6073 if ((uint64_t)(bSig
<< 1)) {
6074 return propagateFloatx80NaN(a
, b
, status
);
6076 return packFloatx80( zSign
, 0, 0 );
6080 if ( ( aExp
| aSig
) == 0 ) {
6082 float_raise(float_flag_invalid
, status
);
6083 return floatx80_default_nan(status
);
6085 float_raise(float_flag_divbyzero
, status
);
6086 return packFloatx80(zSign
, floatx80_infinity_high
,
6087 floatx80_infinity_low
);
6089 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6092 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
6093 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
6095 zExp
= aExp
- bExp
+ 0x3FFE;
6097 if ( bSig
<= aSig
) {
6098 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
6101 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
6102 mul64To128( bSig
, zSig0
, &term0
, &term1
);
6103 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
6104 while ( (int64_t) rem0
< 0 ) {
6106 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
6108 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
6109 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
6110 mul64To128( bSig
, zSig1
, &term1
, &term2
);
6111 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6112 while ( (int64_t) rem1
< 0 ) {
6114 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
6116 zSig1
|= ( ( rem1
| rem2
) != 0 );
6118 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6119 zSign
, zExp
, zSig0
, zSig1
, status
);
6122 /*----------------------------------------------------------------------------
6123 | Returns the remainder of the extended double-precision floating-point value
6124 | `a' with respect to the corresponding value `b'. The operation is performed
6125 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6126 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6127 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of
6128 | the absolute value of the integer quotient.
6129 *----------------------------------------------------------------------------*/
6131 floatx80
floatx80_modrem(floatx80 a
, floatx80 b
, bool mod
, uint64_t *quotient
,
6132 float_status
*status
)
6135 int32_t aExp
, bExp
, expDiff
, aExpOrig
;
6136 uint64_t aSig0
, aSig1
, bSig
;
6137 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
6140 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6141 float_raise(float_flag_invalid
, status
);
6142 return floatx80_default_nan(status
);
6144 aSig0
= extractFloatx80Frac( a
);
6145 aExpOrig
= aExp
= extractFloatx80Exp( a
);
6146 aSign
= extractFloatx80Sign( a
);
6147 bSig
= extractFloatx80Frac( b
);
6148 bExp
= extractFloatx80Exp( b
);
6149 if ( aExp
== 0x7FFF ) {
6150 if ( (uint64_t) ( aSig0
<<1 )
6151 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
6152 return propagateFloatx80NaN(a
, b
, status
);
6156 if ( bExp
== 0x7FFF ) {
6157 if ((uint64_t)(bSig
<< 1)) {
6158 return propagateFloatx80NaN(a
, b
, status
);
6160 if (aExp
== 0 && aSig0
>> 63) {
6162 * Pseudo-denormal argument must be returned in normalized
6165 return packFloatx80(aSign
, 1, aSig0
);
6172 float_raise(float_flag_invalid
, status
);
6173 return floatx80_default_nan(status
);
6175 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
6178 if ( aSig0
== 0 ) return a
;
6179 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6182 expDiff
= aExp
- bExp
;
6184 if ( expDiff
< 0 ) {
6185 if ( mod
|| expDiff
< -1 ) {
6186 if (aExp
== 1 && aExpOrig
== 0) {
6188 * Pseudo-denormal argument must be returned in
6191 return packFloatx80(aSign
, aExp
, aSig0
);
6195 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
6198 *quotient
= q
= ( bSig
<= aSig0
);
6199 if ( q
) aSig0
-= bSig
;
6201 while ( 0 < expDiff
) {
6202 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6203 q
= ( 2 < q
) ? q
- 2 : 0;
6204 mul64To128( bSig
, q
, &term0
, &term1
);
6205 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6206 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
6212 if ( 0 < expDiff
) {
6213 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
6214 q
= ( 2 < q
) ? q
- 2 : 0;
6216 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
6217 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6218 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
6219 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
6221 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
6224 *quotient
<<= expDiff
;
6235 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
6236 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6237 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
6240 aSig0
= alternateASig0
;
6241 aSig1
= alternateASig1
;
6247 normalizeRoundAndPackFloatx80(
6248 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
6252 /*----------------------------------------------------------------------------
6253 | Returns the remainder of the extended double-precision floating-point value
6254 | `a' with respect to the corresponding value `b'. The operation is performed
6255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6256 *----------------------------------------------------------------------------*/
6258 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
6261 return floatx80_modrem(a
, b
, false, "ient
, status
);
6264 /*----------------------------------------------------------------------------
6265 | Returns the remainder of the extended double-precision floating-point value
6266 | `a' with respect to the corresponding value `b', with the quotient truncated
6268 *----------------------------------------------------------------------------*/
6270 floatx80
floatx80_mod(floatx80 a
, floatx80 b
, float_status
*status
)
6273 return floatx80_modrem(a
, b
, true, "ient
, status
);
6276 /*----------------------------------------------------------------------------
6277 | Returns the square root of the extended double-precision floating-point
6278 | value `a'. The operation is performed according to the IEC/IEEE Standard
6279 | for Binary Floating-Point Arithmetic.
6280 *----------------------------------------------------------------------------*/
6282 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
6286 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
6287 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6289 if (floatx80_invalid_encoding(a
)) {
6290 float_raise(float_flag_invalid
, status
);
6291 return floatx80_default_nan(status
);
6293 aSig0
= extractFloatx80Frac( a
);
6294 aExp
= extractFloatx80Exp( a
);
6295 aSign
= extractFloatx80Sign( a
);
6296 if ( aExp
== 0x7FFF ) {
6297 if ((uint64_t)(aSig0
<< 1)) {
6298 return propagateFloatx80NaN(a
, a
, status
);
6300 if ( ! aSign
) return a
;
6304 if ( ( aExp
| aSig0
) == 0 ) return a
;
6306 float_raise(float_flag_invalid
, status
);
6307 return floatx80_default_nan(status
);
6310 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
6311 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
6313 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
6314 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
6315 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
6316 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6317 doubleZSig0
= zSig0
<<1;
6318 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6319 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6320 while ( (int64_t) rem0
< 0 ) {
6323 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6325 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6326 if ( ( zSig1
& UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6327 if ( zSig1
== 0 ) zSig1
= 1;
6328 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6329 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6330 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6331 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6332 while ( (int64_t) rem1
< 0 ) {
6334 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6336 term2
|= doubleZSig0
;
6337 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6339 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6341 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
6342 zSig0
|= doubleZSig0
;
6343 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
6344 0, zExp
, zSig0
, zSig1
, status
);
6347 /*----------------------------------------------------------------------------
6348 | Returns the result of converting the quadruple-precision floating-point
6349 | value `a' to the 32-bit two's complement integer format. The conversion
6350 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6351 | Arithmetic---which means in particular that the conversion is rounded
6352 | according to the current rounding mode. If `a' is a NaN, the largest
6353 | positive integer is returned. Otherwise, if the conversion overflows, the
6354 | largest integer with the same sign as `a' is returned.
6355 *----------------------------------------------------------------------------*/
6357 int32_t float128_to_int32(float128 a
, float_status
*status
)
6360 int32_t aExp
, shiftCount
;
6361 uint64_t aSig0
, aSig1
;
6363 aSig1
= extractFloat128Frac1( a
);
6364 aSig0
= extractFloat128Frac0( a
);
6365 aExp
= extractFloat128Exp( a
);
6366 aSign
= extractFloat128Sign( a
);
6367 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
6368 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6369 aSig0
|= ( aSig1
!= 0 );
6370 shiftCount
= 0x4028 - aExp
;
6371 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
6372 return roundAndPackInt32(aSign
, aSig0
, status
);
6376 /*----------------------------------------------------------------------------
6377 | Returns the result of converting the quadruple-precision floating-point
6378 | value `a' to the 32-bit two's complement integer format. The conversion
6379 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6380 | Arithmetic, except that the conversion is always rounded toward zero. If
6381 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6382 | conversion overflows, the largest integer with the same sign as `a' is
6384 *----------------------------------------------------------------------------*/
6386 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
6389 int32_t aExp
, shiftCount
;
6390 uint64_t aSig0
, aSig1
, savedASig
;
6393 aSig1
= extractFloat128Frac1( a
);
6394 aSig0
= extractFloat128Frac0( a
);
6395 aExp
= extractFloat128Exp( a
);
6396 aSign
= extractFloat128Sign( a
);
6397 aSig0
|= ( aSig1
!= 0 );
6398 if ( 0x401E < aExp
) {
6399 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
6402 else if ( aExp
< 0x3FFF ) {
6403 if (aExp
|| aSig0
) {
6404 status
->float_exception_flags
|= float_flag_inexact
;
6408 aSig0
|= UINT64_C(0x0001000000000000);
6409 shiftCount
= 0x402F - aExp
;
6411 aSig0
>>= shiftCount
;
6413 if ( aSign
) z
= - z
;
6414 if ( ( z
< 0 ) ^ aSign
) {
6416 float_raise(float_flag_invalid
, status
);
6417 return aSign
? INT32_MIN
: INT32_MAX
;
6419 if ( ( aSig0
<<shiftCount
) != savedASig
) {
6420 status
->float_exception_flags
|= float_flag_inexact
;
6426 /*----------------------------------------------------------------------------
6427 | Returns the result of converting the quadruple-precision floating-point
6428 | value `a' to the 64-bit two's complement integer format. The conversion
6429 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6430 | Arithmetic---which means in particular that the conversion is rounded
6431 | according to the current rounding mode. If `a' is a NaN, the largest
6432 | positive integer is returned. Otherwise, if the conversion overflows, the
6433 | largest integer with the same sign as `a' is returned.
6434 *----------------------------------------------------------------------------*/
6436 int64_t float128_to_int64(float128 a
, float_status
*status
)
6439 int32_t aExp
, shiftCount
;
6440 uint64_t aSig0
, aSig1
;
6442 aSig1
= extractFloat128Frac1( a
);
6443 aSig0
= extractFloat128Frac0( a
);
6444 aExp
= extractFloat128Exp( a
);
6445 aSign
= extractFloat128Sign( a
);
6446 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6447 shiftCount
= 0x402F - aExp
;
6448 if ( shiftCount
<= 0 ) {
6449 if ( 0x403E < aExp
) {
6450 float_raise(float_flag_invalid
, status
);
6452 || ( ( aExp
== 0x7FFF )
6453 && ( aSig1
|| ( aSig0
!= UINT64_C(0x0001000000000000) ) )
6460 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
6463 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6465 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
6469 /*----------------------------------------------------------------------------
6470 | Returns the result of converting the quadruple-precision floating-point
6471 | value `a' to the 64-bit two's complement integer format. The conversion
6472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6473 | Arithmetic, except that the conversion is always rounded toward zero.
6474 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6475 | the conversion overflows, the largest integer with the same sign as `a' is
6477 *----------------------------------------------------------------------------*/
6479 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
6482 int32_t aExp
, shiftCount
;
6483 uint64_t aSig0
, aSig1
;
6486 aSig1
= extractFloat128Frac1( a
);
6487 aSig0
= extractFloat128Frac0( a
);
6488 aExp
= extractFloat128Exp( a
);
6489 aSign
= extractFloat128Sign( a
);
6490 if ( aExp
) aSig0
|= UINT64_C(0x0001000000000000);
6491 shiftCount
= aExp
- 0x402F;
6492 if ( 0 < shiftCount
) {
6493 if ( 0x403E <= aExp
) {
6494 aSig0
&= UINT64_C(0x0000FFFFFFFFFFFF);
6495 if ( ( a
.high
== UINT64_C(0xC03E000000000000) )
6496 && ( aSig1
< UINT64_C(0x0002000000000000) ) ) {
6498 status
->float_exception_flags
|= float_flag_inexact
;
6502 float_raise(float_flag_invalid
, status
);
6503 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
6509 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
6510 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
6511 status
->float_exception_flags
|= float_flag_inexact
;
6515 if ( aExp
< 0x3FFF ) {
6516 if ( aExp
| aSig0
| aSig1
) {
6517 status
->float_exception_flags
|= float_flag_inexact
;
6521 z
= aSig0
>>( - shiftCount
);
6523 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
6524 status
->float_exception_flags
|= float_flag_inexact
;
6527 if ( aSign
) z
= - z
;
6532 /*----------------------------------------------------------------------------
6533 | Returns the result of converting the quadruple-precision floating-point value
6534 | `a' to the 64-bit unsigned integer format. The conversion is
6535 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6536 | Arithmetic---which means in particular that the conversion is rounded
6537 | according to the current rounding mode. If `a' is a NaN, the largest
6538 | positive integer is returned. If the conversion overflows, the
6539 | largest unsigned integer is returned. If 'a' is negative, the value is
6540 | rounded and zero is returned; negative values that do not round to zero
6541 | will raise the inexact exception.
6542 *----------------------------------------------------------------------------*/
6544 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
6549 uint64_t aSig0
, aSig1
;
6551 aSig0
= extractFloat128Frac0(a
);
6552 aSig1
= extractFloat128Frac1(a
);
6553 aExp
= extractFloat128Exp(a
);
6554 aSign
= extractFloat128Sign(a
);
6555 if (aSign
&& (aExp
> 0x3FFE)) {
6556 float_raise(float_flag_invalid
, status
);
6557 if (float128_is_any_nan(a
)) {
6564 aSig0
|= UINT64_C(0x0001000000000000);
6566 shiftCount
= 0x402F - aExp
;
6567 if (shiftCount
<= 0) {
6568 if (0x403E < aExp
) {
6569 float_raise(float_flag_invalid
, status
);
6572 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
6574 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
6576 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
6579 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
6582 signed char current_rounding_mode
= status
->float_rounding_mode
;
6584 set_float_rounding_mode(float_round_to_zero
, status
);
6585 v
= float128_to_uint64(a
, status
);
6586 set_float_rounding_mode(current_rounding_mode
, status
);
6591 /*----------------------------------------------------------------------------
6592 | Returns the result of converting the quadruple-precision floating-point
6593 | value `a' to the 32-bit unsigned integer format. The conversion
6594 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6595 | Arithmetic except that the conversion is always rounded toward zero.
6596 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6597 | if the conversion overflows, the largest unsigned integer is returned.
6598 | If 'a' is negative, the value is rounded and zero is returned; negative
6599 | values that do not round to zero will raise the inexact exception.
6600 *----------------------------------------------------------------------------*/
6602 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
6606 int old_exc_flags
= get_float_exception_flags(status
);
6608 v
= float128_to_uint64_round_to_zero(a
, status
);
6609 if (v
> 0xffffffff) {
6614 set_float_exception_flags(old_exc_flags
, status
);
6615 float_raise(float_flag_invalid
, status
);
6619 /*----------------------------------------------------------------------------
6620 | Returns the result of converting the quadruple-precision floating-point value
6621 | `a' to the 32-bit unsigned integer format. The conversion is
6622 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6623 | Arithmetic---which means in particular that the conversion is rounded
6624 | according to the current rounding mode. If `a' is a NaN, the largest
6625 | positive integer is returned. If the conversion overflows, the
6626 | largest unsigned integer is returned. If 'a' is negative, the value is
6627 | rounded and zero is returned; negative values that do not round to zero
6628 | will raise the inexact exception.
6629 *----------------------------------------------------------------------------*/
6631 uint32_t float128_to_uint32(float128 a
, float_status
*status
)
6635 int old_exc_flags
= get_float_exception_flags(status
);
6637 v
= float128_to_uint64(a
, status
);
6638 if (v
> 0xffffffff) {
6643 set_float_exception_flags(old_exc_flags
, status
);
6644 float_raise(float_flag_invalid
, status
);
6648 /*----------------------------------------------------------------------------
6649 | Returns the result of converting the quadruple-precision floating-point
6650 | value `a' to the single-precision floating-point format. The conversion
6651 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6653 *----------------------------------------------------------------------------*/
6655 float32
float128_to_float32(float128 a
, float_status
*status
)
6659 uint64_t aSig0
, aSig1
;
6662 aSig1
= extractFloat128Frac1( a
);
6663 aSig0
= extractFloat128Frac0( a
);
6664 aExp
= extractFloat128Exp( a
);
6665 aSign
= extractFloat128Sign( a
);
6666 if ( aExp
== 0x7FFF ) {
6667 if ( aSig0
| aSig1
) {
6668 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
6670 return packFloat32( aSign
, 0xFF, 0 );
6672 aSig0
|= ( aSig1
!= 0 );
6673 shift64RightJamming( aSig0
, 18, &aSig0
);
6675 if ( aExp
|| zSig
) {
6679 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
6683 /*----------------------------------------------------------------------------
6684 | Returns the result of converting the quadruple-precision floating-point
6685 | value `a' to the double-precision floating-point format. The conversion
6686 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6688 *----------------------------------------------------------------------------*/
6690 float64
float128_to_float64(float128 a
, float_status
*status
)
6694 uint64_t aSig0
, aSig1
;
6696 aSig1
= extractFloat128Frac1( a
);
6697 aSig0
= extractFloat128Frac0( a
);
6698 aExp
= extractFloat128Exp( a
);
6699 aSign
= extractFloat128Sign( a
);
6700 if ( aExp
== 0x7FFF ) {
6701 if ( aSig0
| aSig1
) {
6702 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
6704 return packFloat64( aSign
, 0x7FF, 0 );
6706 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6707 aSig0
|= ( aSig1
!= 0 );
6708 if ( aExp
|| aSig0
) {
6709 aSig0
|= UINT64_C(0x4000000000000000);
6712 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
6716 /*----------------------------------------------------------------------------
6717 | Returns the result of converting the quadruple-precision floating-point
6718 | value `a' to the extended double-precision floating-point format. The
6719 | conversion is performed according to the IEC/IEEE Standard for Binary
6720 | Floating-Point Arithmetic.
6721 *----------------------------------------------------------------------------*/
6723 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
6727 uint64_t aSig0
, aSig1
;
6729 aSig1
= extractFloat128Frac1( a
);
6730 aSig0
= extractFloat128Frac0( a
);
6731 aExp
= extractFloat128Exp( a
);
6732 aSign
= extractFloat128Sign( a
);
6733 if ( aExp
== 0x7FFF ) {
6734 if ( aSig0
| aSig1
) {
6735 floatx80 res
= commonNaNToFloatx80(float128ToCommonNaN(a
, status
),
6737 return floatx80_silence_nan(res
, status
);
6739 return packFloatx80(aSign
, floatx80_infinity_high
,
6740 floatx80_infinity_low
);
6743 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
6744 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6747 aSig0
|= UINT64_C(0x0001000000000000);
6749 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
6750 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
6754 /*----------------------------------------------------------------------------
6755 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6756 | returns the result as a quadruple-precision floating-point value. The
6757 | operation is performed according to the IEC/IEEE Standard for Binary
6758 | Floating-Point Arithmetic.
6759 *----------------------------------------------------------------------------*/
6761 float128
float128_round_to_int(float128 a
, float_status
*status
)
6765 uint64_t lastBitMask
, roundBitsMask
;
6768 aExp
= extractFloat128Exp( a
);
6769 if ( 0x402F <= aExp
) {
6770 if ( 0x406F <= aExp
) {
6771 if ( ( aExp
== 0x7FFF )
6772 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6774 return propagateFloat128NaN(a
, a
, status
);
6779 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6780 roundBitsMask
= lastBitMask
- 1;
6782 switch (status
->float_rounding_mode
) {
6783 case float_round_nearest_even
:
6784 if ( lastBitMask
) {
6785 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6786 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6789 if ( (int64_t) z
.low
< 0 ) {
6791 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6795 case float_round_ties_away
:
6797 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6799 if ((int64_t) z
.low
< 0) {
6804 case float_round_to_zero
:
6806 case float_round_up
:
6807 if (!extractFloat128Sign(z
)) {
6808 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6811 case float_round_down
:
6812 if (extractFloat128Sign(z
)) {
6813 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6816 case float_round_to_odd
:
6818 * Note that if lastBitMask == 0, the last bit is the lsb
6819 * of high, and roundBitsMask == -1.
6821 if ((lastBitMask
? z
.low
& lastBitMask
: z
.high
& 1) == 0) {
6822 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6828 z
.low
&= ~ roundBitsMask
;
6831 if ( aExp
< 0x3FFF ) {
6832 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6833 status
->float_exception_flags
|= float_flag_inexact
;
6834 aSign
= extractFloat128Sign( a
);
6835 switch (status
->float_rounding_mode
) {
6836 case float_round_nearest_even
:
6837 if ( ( aExp
== 0x3FFE )
6838 && ( extractFloat128Frac0( a
)
6839 | extractFloat128Frac1( a
) )
6841 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6844 case float_round_ties_away
:
6845 if (aExp
== 0x3FFE) {
6846 return packFloat128(aSign
, 0x3FFF, 0, 0);
6849 case float_round_down
:
6851 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6852 : packFloat128( 0, 0, 0, 0 );
6853 case float_round_up
:
6855 aSign
? packFloat128( 1, 0, 0, 0 )
6856 : packFloat128( 0, 0x3FFF, 0, 0 );
6858 case float_round_to_odd
:
6859 return packFloat128(aSign
, 0x3FFF, 0, 0);
6861 case float_round_to_zero
:
6864 return packFloat128( aSign
, 0, 0, 0 );
6867 lastBitMask
<<= 0x402F - aExp
;
6868 roundBitsMask
= lastBitMask
- 1;
6871 switch (status
->float_rounding_mode
) {
6872 case float_round_nearest_even
:
6873 z
.high
+= lastBitMask
>>1;
6874 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6875 z
.high
&= ~ lastBitMask
;
6878 case float_round_ties_away
:
6879 z
.high
+= lastBitMask
>>1;
6881 case float_round_to_zero
:
6883 case float_round_up
:
6884 if (!extractFloat128Sign(z
)) {
6885 z
.high
|= ( a
.low
!= 0 );
6886 z
.high
+= roundBitsMask
;
6889 case float_round_down
:
6890 if (extractFloat128Sign(z
)) {
6891 z
.high
|= (a
.low
!= 0);
6892 z
.high
+= roundBitsMask
;
6895 case float_round_to_odd
:
6896 if ((z
.high
& lastBitMask
) == 0) {
6897 z
.high
|= (a
.low
!= 0);
6898 z
.high
+= roundBitsMask
;
6904 z
.high
&= ~ roundBitsMask
;
6906 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6907 status
->float_exception_flags
|= float_flag_inexact
;
6913 /*----------------------------------------------------------------------------
6914 | Returns the result of adding the absolute values of the quadruple-precision
6915 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6916 | before being returned. `zSign' is ignored if the result is a NaN.
6917 | The addition is performed according to the IEC/IEEE Standard for Binary
6918 | Floating-Point Arithmetic.
6919 *----------------------------------------------------------------------------*/
6921 static float128
addFloat128Sigs(float128 a
, float128 b
, bool zSign
,
6922 float_status
*status
)
6924 int32_t aExp
, bExp
, zExp
;
6925 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6928 aSig1
= extractFloat128Frac1( a
);
6929 aSig0
= extractFloat128Frac0( a
);
6930 aExp
= extractFloat128Exp( a
);
6931 bSig1
= extractFloat128Frac1( b
);
6932 bSig0
= extractFloat128Frac0( b
);
6933 bExp
= extractFloat128Exp( b
);
6934 expDiff
= aExp
- bExp
;
6935 if ( 0 < expDiff
) {
6936 if ( aExp
== 0x7FFF ) {
6937 if (aSig0
| aSig1
) {
6938 return propagateFloat128NaN(a
, b
, status
);
6946 bSig0
|= UINT64_C(0x0001000000000000);
6948 shift128ExtraRightJamming(
6949 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6952 else if ( expDiff
< 0 ) {
6953 if ( bExp
== 0x7FFF ) {
6954 if (bSig0
| bSig1
) {
6955 return propagateFloat128NaN(a
, b
, status
);
6957 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6963 aSig0
|= UINT64_C(0x0001000000000000);
6965 shift128ExtraRightJamming(
6966 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6970 if ( aExp
== 0x7FFF ) {
6971 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6972 return propagateFloat128NaN(a
, b
, status
);
6976 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6978 if (status
->flush_to_zero
) {
6979 if (zSig0
| zSig1
) {
6980 float_raise(float_flag_output_denormal
, status
);
6982 return packFloat128(zSign
, 0, 0, 0);
6984 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6987 zSig0
|= UINT64_C(0x0002000000000000);
6991 aSig0
|= UINT64_C(0x0001000000000000);
6992 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6994 if ( zSig0
< UINT64_C(0x0002000000000000) ) goto roundAndPack
;
6997 shift128ExtraRightJamming(
6998 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7000 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7004 /*----------------------------------------------------------------------------
7005 | Returns the result of subtracting the absolute values of the quadruple-
7006 | precision floating-point values `a' and `b'. If `zSign' is 1, the
7007 | difference is negated before being returned. `zSign' is ignored if the
7008 | result is a NaN. The subtraction is performed according to the IEC/IEEE
7009 | Standard for Binary Floating-Point Arithmetic.
7010 *----------------------------------------------------------------------------*/
7012 static float128
subFloat128Sigs(float128 a
, float128 b
, bool zSign
,
7013 float_status
*status
)
7015 int32_t aExp
, bExp
, zExp
;
7016 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
7019 aSig1
= extractFloat128Frac1( a
);
7020 aSig0
= extractFloat128Frac0( a
);
7021 aExp
= extractFloat128Exp( a
);
7022 bSig1
= extractFloat128Frac1( b
);
7023 bSig0
= extractFloat128Frac0( b
);
7024 bExp
= extractFloat128Exp( b
);
7025 expDiff
= aExp
- bExp
;
7026 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
7027 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
7028 if ( 0 < expDiff
) goto aExpBigger
;
7029 if ( expDiff
< 0 ) goto bExpBigger
;
7030 if ( aExp
== 0x7FFF ) {
7031 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
7032 return propagateFloat128NaN(a
, b
, status
);
7034 float_raise(float_flag_invalid
, status
);
7035 return float128_default_nan(status
);
7041 if ( bSig0
< aSig0
) goto aBigger
;
7042 if ( aSig0
< bSig0
) goto bBigger
;
7043 if ( bSig1
< aSig1
) goto aBigger
;
7044 if ( aSig1
< bSig1
) goto bBigger
;
7045 return packFloat128(status
->float_rounding_mode
== float_round_down
,
7048 if ( bExp
== 0x7FFF ) {
7049 if (bSig0
| bSig1
) {
7050 return propagateFloat128NaN(a
, b
, status
);
7052 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
7058 aSig0
|= UINT64_C(0x4000000000000000);
7060 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7061 bSig0
|= UINT64_C(0x4000000000000000);
7063 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7066 goto normalizeRoundAndPack
;
7068 if ( aExp
== 0x7FFF ) {
7069 if (aSig0
| aSig1
) {
7070 return propagateFloat128NaN(a
, b
, status
);
7078 bSig0
|= UINT64_C(0x4000000000000000);
7080 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
7081 aSig0
|= UINT64_C(0x4000000000000000);
7083 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
7085 normalizeRoundAndPack
:
7087 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
7092 /*----------------------------------------------------------------------------
7093 | Returns the result of adding the quadruple-precision floating-point values
7094 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7095 | for Binary Floating-Point Arithmetic.
7096 *----------------------------------------------------------------------------*/
7098 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
7102 aSign
= extractFloat128Sign( a
);
7103 bSign
= extractFloat128Sign( b
);
7104 if ( aSign
== bSign
) {
7105 return addFloat128Sigs(a
, b
, aSign
, status
);
7108 return subFloat128Sigs(a
, b
, aSign
, status
);
7113 /*----------------------------------------------------------------------------
7114 | Returns the result of subtracting the quadruple-precision floating-point
7115 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7116 | Standard for Binary Floating-Point Arithmetic.
7117 *----------------------------------------------------------------------------*/
7119 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
7123 aSign
= extractFloat128Sign( a
);
7124 bSign
= extractFloat128Sign( b
);
7125 if ( aSign
== bSign
) {
7126 return subFloat128Sigs(a
, b
, aSign
, status
);
7129 return addFloat128Sigs(a
, b
, aSign
, status
);
7134 /*----------------------------------------------------------------------------
7135 | Returns the result of multiplying the quadruple-precision floating-point
7136 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7137 | Standard for Binary Floating-Point Arithmetic.
7138 *----------------------------------------------------------------------------*/
7140 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
7142 bool aSign
, bSign
, zSign
;
7143 int32_t aExp
, bExp
, zExp
;
7144 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
7146 aSig1
= extractFloat128Frac1( a
);
7147 aSig0
= extractFloat128Frac0( a
);
7148 aExp
= extractFloat128Exp( a
);
7149 aSign
= extractFloat128Sign( a
);
7150 bSig1
= extractFloat128Frac1( b
);
7151 bSig0
= extractFloat128Frac0( b
);
7152 bExp
= extractFloat128Exp( b
);
7153 bSign
= extractFloat128Sign( b
);
7154 zSign
= aSign
^ bSign
;
7155 if ( aExp
== 0x7FFF ) {
7156 if ( ( aSig0
| aSig1
)
7157 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7158 return propagateFloat128NaN(a
, b
, status
);
7160 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
7161 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7163 if ( bExp
== 0x7FFF ) {
7164 if (bSig0
| bSig1
) {
7165 return propagateFloat128NaN(a
, b
, status
);
7167 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7169 float_raise(float_flag_invalid
, status
);
7170 return float128_default_nan(status
);
7172 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7175 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7176 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7179 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7180 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7182 zExp
= aExp
+ bExp
- 0x4000;
7183 aSig0
|= UINT64_C(0x0001000000000000);
7184 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
7185 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
7186 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
7187 zSig2
|= ( zSig3
!= 0 );
7188 if (UINT64_C( 0x0002000000000000) <= zSig0
) {
7189 shift128ExtraRightJamming(
7190 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
7193 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7197 /*----------------------------------------------------------------------------
7198 | Returns the result of dividing the quadruple-precision floating-point value
7199 | `a' by the corresponding value `b'. The operation is performed according to
7200 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7201 *----------------------------------------------------------------------------*/
7203 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
7205 bool aSign
, bSign
, zSign
;
7206 int32_t aExp
, bExp
, zExp
;
7207 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
7208 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7210 aSig1
= extractFloat128Frac1( a
);
7211 aSig0
= extractFloat128Frac0( a
);
7212 aExp
= extractFloat128Exp( a
);
7213 aSign
= extractFloat128Sign( a
);
7214 bSig1
= extractFloat128Frac1( b
);
7215 bSig0
= extractFloat128Frac0( b
);
7216 bExp
= extractFloat128Exp( b
);
7217 bSign
= extractFloat128Sign( b
);
7218 zSign
= aSign
^ bSign
;
7219 if ( aExp
== 0x7FFF ) {
7220 if (aSig0
| aSig1
) {
7221 return propagateFloat128NaN(a
, b
, status
);
7223 if ( bExp
== 0x7FFF ) {
7224 if (bSig0
| bSig1
) {
7225 return propagateFloat128NaN(a
, b
, status
);
7229 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7231 if ( bExp
== 0x7FFF ) {
7232 if (bSig0
| bSig1
) {
7233 return propagateFloat128NaN(a
, b
, status
);
7235 return packFloat128( zSign
, 0, 0, 0 );
7238 if ( ( bSig0
| bSig1
) == 0 ) {
7239 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
7241 float_raise(float_flag_invalid
, status
);
7242 return float128_default_nan(status
);
7244 float_raise(float_flag_divbyzero
, status
);
7245 return packFloat128( zSign
, 0x7FFF, 0, 0 );
7247 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7250 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
7251 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7253 zExp
= aExp
- bExp
+ 0x3FFD;
7255 aSig0
| UINT64_C(0x0001000000000000), aSig1
, 15, &aSig0
, &aSig1
);
7257 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7258 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
7259 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
7262 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7263 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
7264 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
7265 while ( (int64_t) rem0
< 0 ) {
7267 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
7269 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
7270 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
7271 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
7272 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
7273 while ( (int64_t) rem1
< 0 ) {
7275 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
7277 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7279 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
7280 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
7284 /*----------------------------------------------------------------------------
7285 | Returns the remainder of the quadruple-precision floating-point value `a'
7286 | with respect to the corresponding value `b'. The operation is performed
7287 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7288 *----------------------------------------------------------------------------*/
7290 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
7293 int32_t aExp
, bExp
, expDiff
;
7294 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
7295 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
7298 aSig1
= extractFloat128Frac1( a
);
7299 aSig0
= extractFloat128Frac0( a
);
7300 aExp
= extractFloat128Exp( a
);
7301 aSign
= extractFloat128Sign( a
);
7302 bSig1
= extractFloat128Frac1( b
);
7303 bSig0
= extractFloat128Frac0( b
);
7304 bExp
= extractFloat128Exp( b
);
7305 if ( aExp
== 0x7FFF ) {
7306 if ( ( aSig0
| aSig1
)
7307 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
7308 return propagateFloat128NaN(a
, b
, status
);
7312 if ( bExp
== 0x7FFF ) {
7313 if (bSig0
| bSig1
) {
7314 return propagateFloat128NaN(a
, b
, status
);
7319 if ( ( bSig0
| bSig1
) == 0 ) {
7321 float_raise(float_flag_invalid
, status
);
7322 return float128_default_nan(status
);
7324 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
7327 if ( ( aSig0
| aSig1
) == 0 ) return a
;
7328 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7330 expDiff
= aExp
- bExp
;
7331 if ( expDiff
< -1 ) return a
;
7333 aSig0
| UINT64_C(0x0001000000000000),
7335 15 - ( expDiff
< 0 ),
7340 bSig0
| UINT64_C(0x0001000000000000), bSig1
, 15, &bSig0
, &bSig1
);
7341 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
7342 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7344 while ( 0 < expDiff
) {
7345 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7346 q
= ( 4 < q
) ? q
- 4 : 0;
7347 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7348 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
7349 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
7350 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
7353 if ( -64 < expDiff
) {
7354 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
7355 q
= ( 4 < q
) ? q
- 4 : 0;
7357 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7359 if ( expDiff
< 0 ) {
7360 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
7363 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
7365 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
7366 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
7369 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
7370 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
7373 alternateASig0
= aSig0
;
7374 alternateASig1
= aSig1
;
7376 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
7377 } while ( 0 <= (int64_t) aSig0
);
7379 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
7380 if ( ( sigMean0
< 0 )
7381 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
7382 aSig0
= alternateASig0
;
7383 aSig1
= alternateASig1
;
7385 zSign
= ( (int64_t) aSig0
< 0 );
7386 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
7387 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
7391 /*----------------------------------------------------------------------------
7392 | Returns the square root of the quadruple-precision floating-point value `a'.
7393 | The operation is performed according to the IEC/IEEE Standard for Binary
7394 | Floating-Point Arithmetic.
7395 *----------------------------------------------------------------------------*/
7397 float128
float128_sqrt(float128 a
, float_status
*status
)
7401 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
7402 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
7404 aSig1
= extractFloat128Frac1( a
);
7405 aSig0
= extractFloat128Frac0( a
);
7406 aExp
= extractFloat128Exp( a
);
7407 aSign
= extractFloat128Sign( a
);
7408 if ( aExp
== 0x7FFF ) {
7409 if (aSig0
| aSig1
) {
7410 return propagateFloat128NaN(a
, a
, status
);
7412 if ( ! aSign
) return a
;
7416 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
7418 float_raise(float_flag_invalid
, status
);
7419 return float128_default_nan(status
);
7422 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
7423 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
7425 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
7426 aSig0
|= UINT64_C(0x0001000000000000);
7427 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
7428 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
7429 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
7430 doubleZSig0
= zSig0
<<1;
7431 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
7432 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
7433 while ( (int64_t) rem0
< 0 ) {
7436 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
7438 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
7439 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
7440 if ( zSig1
== 0 ) zSig1
= 1;
7441 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
7442 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
7443 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
7444 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7445 while ( (int64_t) rem1
< 0 ) {
7447 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
7449 term2
|= doubleZSig0
;
7450 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
7452 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
7454 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
7455 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
7459 static inline FloatRelation
7460 floatx80_compare_internal(floatx80 a
, floatx80 b
, bool is_quiet
,
7461 float_status
*status
)
7465 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
7466 float_raise(float_flag_invalid
, status
);
7467 return float_relation_unordered
;
7469 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7470 ( extractFloatx80Frac( a
)<<1 ) ) ||
7471 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7472 ( extractFloatx80Frac( b
)<<1 ) )) {
7474 floatx80_is_signaling_nan(a
, status
) ||
7475 floatx80_is_signaling_nan(b
, status
)) {
7476 float_raise(float_flag_invalid
, status
);
7478 return float_relation_unordered
;
7480 aSign
= extractFloatx80Sign( a
);
7481 bSign
= extractFloatx80Sign( b
);
7482 if ( aSign
!= bSign
) {
7484 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7485 ( ( a
.low
| b
.low
) == 0 ) ) {
7487 return float_relation_equal
;
7489 return 1 - (2 * aSign
);
7492 /* Normalize pseudo-denormals before comparison. */
7493 if ((a
.high
& 0x7fff) == 0 && a
.low
& UINT64_C(0x8000000000000000)) {
7496 if ((b
.high
& 0x7fff) == 0 && b
.low
& UINT64_C(0x8000000000000000)) {
7499 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7500 return float_relation_equal
;
7502 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7507 FloatRelation
floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
7509 return floatx80_compare_internal(a
, b
, 0, status
);
7512 FloatRelation
floatx80_compare_quiet(floatx80 a
, floatx80 b
,
7513 float_status
*status
)
7515 return floatx80_compare_internal(a
, b
, 1, status
);
7518 static inline FloatRelation
7519 float128_compare_internal(float128 a
, float128 b
, bool is_quiet
,
7520 float_status
*status
)
7524 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7525 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7526 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7527 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7529 float128_is_signaling_nan(a
, status
) ||
7530 float128_is_signaling_nan(b
, status
)) {
7531 float_raise(float_flag_invalid
, status
);
7533 return float_relation_unordered
;
7535 aSign
= extractFloat128Sign( a
);
7536 bSign
= extractFloat128Sign( b
);
7537 if ( aSign
!= bSign
) {
7538 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7540 return float_relation_equal
;
7542 return 1 - (2 * aSign
);
7545 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7546 return float_relation_equal
;
7548 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7553 FloatRelation
float128_compare(float128 a
, float128 b
, float_status
*status
)
7555 return float128_compare_internal(a
, b
, 0, status
);
7558 FloatRelation
float128_compare_quiet(float128 a
, float128 b
,
7559 float_status
*status
)
7561 return float128_compare_internal(a
, b
, 1, status
);
7564 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7570 if (floatx80_invalid_encoding(a
)) {
7571 float_raise(float_flag_invalid
, status
);
7572 return floatx80_default_nan(status
);
7574 aSig
= extractFloatx80Frac( a
);
7575 aExp
= extractFloatx80Exp( a
);
7576 aSign
= extractFloatx80Sign( a
);
7578 if ( aExp
== 0x7FFF ) {
7580 return propagateFloatx80NaN(a
, a
, status
);
7594 } else if (n
< -0x10000) {
7599 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7600 aSign
, aExp
, aSig
, 0, status
);
7603 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7607 uint64_t aSig0
, aSig1
;
7609 aSig1
= extractFloat128Frac1( a
);
7610 aSig0
= extractFloat128Frac0( a
);
7611 aExp
= extractFloat128Exp( a
);
7612 aSign
= extractFloat128Sign( a
);
7613 if ( aExp
== 0x7FFF ) {
7614 if ( aSig0
| aSig1
) {
7615 return propagateFloat128NaN(a
, a
, status
);
7620 aSig0
|= UINT64_C(0x0001000000000000);
7621 } else if (aSig0
== 0 && aSig1
== 0) {
7629 } else if (n
< -0x10000) {
7634 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1
7639 static void __attribute__((constructor
)) softfloat_init(void)
7641 union_float64 ua
, ub
, uc
, ur
;
7643 if (QEMU_NO_HARDFLOAT
) {
7647 * Test that the host's FMA is not obviously broken. For example,
7648 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7649 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7651 ua
.s
= 0x0020000000000001ULL
;
7652 ub
.s
= 0x3ca0000000000000ULL
;
7653 uc
.s
= 0x0020000000000000ULL
;
7654 ur
.h
= fma(ua
.h
, ub
.h
, uc
.h
);
7655 if (ur
.s
!= 0x0020000000000001ULL
) {
7656 force_soft_fma
= true;